summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/acl.c295
-rw-r--r--fs/9p/acl.h8
-rw-r--r--fs/9p/vfs_addr.c4
-rw-r--r--fs/9p/vfs_dir.c2
-rw-r--r--fs/9p/vfs_inode_dotl.c4
-rw-r--r--fs/9p/xattr.c11
-rw-r--r--fs/9p/xattr.h2
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/Kconfig.binfmt35
-rw-r--r--fs/Makefile1
-rw-r--r--fs/affs/super.c2
-rw-r--r--fs/afs/cmservice.c2
-rw-r--r--fs/afs/dir.c25
-rw-r--r--fs/afs/file.c4
-rw-r--r--fs/afs/fs_probe.c4
-rw-r--r--fs/afs/internal.h4
-rw-r--r--fs/afs/rxrpc.c10
-rw-r--r--fs/afs/server.c2
-rw-r--r--fs/afs/write.c4
-rw-r--r--fs/aio.c13
-rw-r--r--fs/anon_inodes.c2
-rw-r--r--fs/attr.c74
-rw-r--r--fs/bad_inode.c6
-rw-r--r--fs/binfmt_aout.c342
-rw-r--r--fs/binfmt_elf.c302
-rw-r--r--fs/binfmt_elf_fdpic.c7
-rw-r--r--fs/binfmt_misc.c8
-rw-r--r--fs/btrfs/Makefile6
-rw-r--r--fs/btrfs/accessors.c (renamed from fs/btrfs/struct-funcs.c)12
-rw-r--r--fs/btrfs/accessors.h1073
-rw-r--r--fs/btrfs/acl.c5
-rw-r--r--fs/btrfs/acl.h27
-rw-r--r--fs/btrfs/backref.c1022
-rw-r--r--fs/btrfs/backref.h204
-rw-r--r--fs/btrfs/bio.c381
-rw-r--r--fs/btrfs/bio.h127
-rw-r--r--fs/btrfs/block-group.c332
-rw-r--r--fs/btrfs/block-group.h69
-rw-r--r--fs/btrfs/block-rsv.c46
-rw-r--r--fs/btrfs/block-rsv.h15
-rw-r--r--fs/btrfs/btrfs_inode.h186
-rw-r--r--fs/btrfs/check-integrity.c4
-rw-r--r--fs/btrfs/compression.c119
-rw-r--r--fs/btrfs/compression.h11
-rw-r--r--fs/btrfs/ctree.c404
-rw-r--r--fs/btrfs/ctree.h3443
-rw-r--r--fs/btrfs/defrag.c1376
-rw-r--r--fs/btrfs/defrag.h22
-rw-r--r--fs/btrfs/delalloc-space.c74
-rw-r--r--fs/btrfs/delalloc-space.h6
-rw-r--r--fs/btrfs/delayed-inode.c309
-rw-r--r--fs/btrfs/delayed-inode.h36
-rw-r--r--fs/btrfs/delayed-ref.c21
-rw-r--r--fs/btrfs/dev-replace.c44
-rw-r--r--fs/btrfs/dev-replace.h12
-rw-r--r--fs/btrfs/dir-item.c60
-rw-r--r--fs/btrfs/dir-item.h42
-rw-r--r--fs/btrfs/discard.c112
-rw-r--r--fs/btrfs/disk-io.c560
-rw-r--r--fs/btrfs/disk-io.h42
-rw-r--r--fs/btrfs/export.c25
-rw-r--r--fs/btrfs/export.h3
-rw-r--r--fs/btrfs/extent-io-tree.c1766
-rw-r--r--fs/btrfs/extent-io-tree.h200
-rw-r--r--fs/btrfs/extent-tree.c113
-rw-r--r--fs/btrfs/extent-tree.h78
-rw-r--r--fs/btrfs/extent_io.c3286
-rw-r--r--fs/btrfs/extent_io.h82
-rw-r--r--fs/btrfs/extent_map.c422
-rw-r--r--fs/btrfs/extent_map.h8
-rw-r--r--fs/btrfs/file-item.c286
-rw-r--r--fs/btrfs/file-item.h69
-rw-r--r--fs/btrfs/file.c1273
-rw-r--r--fs/btrfs/file.h33
-rw-r--r--fs/btrfs/free-space-cache.c167
-rw-r--r--fs/btrfs/free-space-cache.h14
-rw-r--r--fs/btrfs/free-space-tree.c23
-rw-r--r--fs/btrfs/fs.c94
-rw-r--r--fs/btrfs/fs.h976
-rw-r--r--fs/btrfs/inode-item.c79
-rw-r--r--fs/btrfs/inode-item.h20
-rw-r--r--fs/btrfs/inode.c1438
-rw-r--r--fs/btrfs/ioctl.c972
-rw-r--r--fs/btrfs/ioctl.h17
-rw-r--r--fs/btrfs/locking.c26
-rw-r--r--fs/btrfs/locking.h77
-rw-r--r--fs/btrfs/lzo.c4
-rw-r--r--fs/btrfs/messages.c353
-rw-r--r--fs/btrfs/messages.h245
-rw-r--r--fs/btrfs/misc.h59
-rw-r--r--fs/btrfs/ordered-data.c75
-rw-r--r--fs/btrfs/ordered-data.h14
-rw-r--r--fs/btrfs/orphan.c1
-rw-r--r--fs/btrfs/orphan.h11
-rw-r--r--fs/btrfs/print-tree.c21
-rw-r--r--fs/btrfs/props.c13
-rw-r--r--fs/btrfs/props.h2
-rw-r--r--fs/btrfs/qgroup.c181
-rw-r--r--fs/btrfs/qgroup.h14
-rw-r--r--fs/btrfs/raid56.c2087
-rw-r--r--fs/btrfs/raid56.h37
-rw-r--r--fs/btrfs/rcu-string.h6
-rw-r--r--fs/btrfs/ref-verify.c3
-rw-r--r--fs/btrfs/reflink.c40
-rw-r--r--fs/btrfs/relocation.c114
-rw-r--r--fs/btrfs/relocation.h23
-rw-r--r--fs/btrfs/root-tree.c40
-rw-r--r--fs/btrfs/root-tree.h34
-rw-r--r--fs/btrfs/scrub.c718
-rw-r--r--fs/btrfs/scrub.h16
-rw-r--r--fs/btrfs/send.c1011
-rw-r--r--fs/btrfs/send.h26
-rw-r--r--fs/btrfs/space-info.c182
-rw-r--r--fs/btrfs/space-info.h87
-rw-r--r--fs/btrfs/subpage.c3
-rw-r--r--fs/btrfs/super.c642
-rw-r--r--fs/btrfs/super.h29
-rw-r--r--fs/btrfs/sysfs.c195
-rw-r--r--fs/btrfs/tests/btrfs-tests.c7
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c1
-rw-r--r--fs/btrfs/tests/extent-io-tests.c43
-rw-r--r--fs/btrfs/tests/free-space-tests.c22
-rw-r--r--fs/btrfs/tests/free-space-tree-tests.c3
-rw-r--r--fs/btrfs/tests/inode-tests.c68
-rw-r--r--fs/btrfs/tests/qgroup-tests.c88
-rw-r--r--fs/btrfs/transaction.c252
-rw-r--r--fs/btrfs/transaction.h22
-rw-r--r--fs/btrfs/tree-checker.c10
-rw-r--r--fs/btrfs/tree-checker.h35
-rw-r--r--fs/btrfs/tree-defrag.c132
-rw-r--r--fs/btrfs/tree-log.c2024
-rw-r--r--fs/btrfs/tree-log.h13
-rw-r--r--fs/btrfs/tree-mod-log.c36
-rw-r--r--fs/btrfs/tree-mod-log.h4
-rw-r--r--fs/btrfs/ulist.c38
-rw-r--r--fs/btrfs/ulist.h2
-rw-r--r--fs/btrfs/uuid-tree.c5
-rw-r--r--fs/btrfs/uuid-tree.h12
-rw-r--r--fs/btrfs/verity.c9
-rw-r--r--fs/btrfs/verity.h28
-rw-r--r--fs/btrfs/volumes.c456
-rw-r--r--fs/btrfs/volumes.h128
-rw-r--r--fs/btrfs/xattr.c4
-rw-r--r--fs/btrfs/zlib.c6
-rw-r--r--fs/btrfs/zoned.c209
-rw-r--r--fs/btrfs/zoned.h12
-rw-r--r--fs/btrfs/zstd.c4
-rw-r--r--fs/buffer.c195
-rw-r--r--fs/cachefiles/io.c77
-rw-r--r--fs/cachefiles/namei.c122
-rw-r--r--fs/ceph/acl.c3
-rw-r--r--fs/ceph/addr.c6
-rw-r--r--fs/ceph/caps.c87
-rw-r--r--fs/ceph/dir.c2
-rw-r--r--fs/ceph/export.c3
-rw-r--r--fs/ceph/file.c30
-rw-r--r--fs/ceph/inode.c41
-rw-r--r--fs/ceph/ioctl.c2
-rw-r--r--fs/ceph/locks.c4
-rw-r--r--fs/ceph/mds_client.c11
-rw-r--r--fs/ceph/mds_client.h6
-rw-r--r--fs/ceph/mdsmap.c2
-rw-r--r--fs/ceph/snap.c3
-rw-r--r--fs/ceph/super.h7
-rw-r--r--fs/char_dev.c15
-rw-r--r--fs/cifs/cached_dir.c490
-rw-r--r--fs/cifs/cached_dir.h30
-rw-r--r--fs/cifs/cifs_debug.c4
-rw-r--r--fs/cifs/cifs_debug.h6
-rw-r--r--fs/cifs/cifs_ioctl.h10
-rw-r--r--fs/cifs/cifs_spnego.c2
-rw-r--r--fs/cifs/cifs_swn.c12
-rw-r--r--fs/cifs/cifsacl.c141
-rw-r--r--fs/cifs/cifsencrypt.c100
-rw-r--r--fs/cifs/cifsfs.c54
-rw-r--r--fs/cifs/cifsfs.h8
-rw-r--r--fs/cifs/cifsglob.h154
-rw-r--r--fs/cifs/cifspdu.h57
-rw-r--r--fs/cifs/cifsproto.h46
-rw-r--r--fs/cifs/cifssmb.c210
-rw-r--r--fs/cifs/connect.c65
-rw-r--r--fs/cifs/dfs_cache.c2
-rw-r--r--fs/cifs/dir.c46
-rw-r--r--fs/cifs/file.c120
-rw-r--r--fs/cifs/fs_context.c23
-rw-r--r--fs/cifs/fscache.c6
-rw-r--r--fs/cifs/inode.c188
-rw-r--r--fs/cifs/ioctl.c29
-rw-r--r--fs/cifs/link.c120
-rw-r--r--fs/cifs/misc.c73
-rw-r--r--fs/cifs/netlink.c1
-rw-r--r--fs/cifs/readdir.c31
-rw-r--r--fs/cifs/sess.c53
-rw-r--r--fs/cifs/smb1ops.c56
-rw-r--r--fs/cifs/smb2file.c127
-rw-r--r--fs/cifs/smb2inode.c216
-rw-r--r--fs/cifs/smb2misc.c96
-rw-r--r--fs/cifs/smb2ops.c399
-rw-r--r--fs/cifs/smb2pdu.c119
-rw-r--r--fs/cifs/smb2pdu.h3
-rw-r--r--fs/cifs/smb2proto.h28
-rw-r--r--fs/cifs/smb2transport.c117
-rw-r--r--fs/cifs/smbdirect.c227
-rw-r--r--fs/cifs/smbdirect.h14
-rw-r--r--fs/cifs/trace.h3
-rw-r--r--fs/cifs/transport.c11
-rw-r--r--fs/cifs/xattr.c68
-rw-r--r--fs/configfs/dir.c2
-rw-r--r--fs/coredump.c99
-rw-r--r--fs/crypto/bio.c16
-rw-r--r--fs/crypto/fscrypt_private.h77
-rw-r--r--fs/crypto/hooks.c10
-rw-r--r--fs/crypto/inline_crypt.c192
-rw-r--r--fs/crypto/keyring.c498
-rw-r--r--fs/crypto/keysetup.c104
-rw-r--r--fs/crypto/keysetup_v1.c4
-rw-r--r--fs/crypto/policy.c33
-rw-r--r--fs/d_path.c5
-rw-r--r--fs/dax.c221
-rw-r--r--fs/dcache.c17
-rw-r--r--fs/debugfs/file.c44
-rw-r--r--fs/debugfs/inode.c37
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/dlm/ast.c325
-rw-r--r--fs/dlm/ast.h18
-rw-r--r--fs/dlm/config.c4
-rw-r--r--fs/dlm/debug_fs.c2
-rw-r--r--fs/dlm/dlm_internal.h27
-rw-r--r--fs/dlm/lock.c357
-rw-r--r--fs/dlm/lock.h2
-rw-r--r--fs/dlm/lockspace.c46
-rw-r--r--fs/dlm/lockspace.h13
-rw-r--r--fs/dlm/lowcomms.c1544
-rw-r--r--fs/dlm/lowcomms.h6
-rw-r--r--fs/dlm/main.c7
-rw-r--r--fs/dlm/member.c5
-rw-r--r--fs/dlm/memory.c30
-rw-r--r--fs/dlm/memory.h4
-rw-r--r--fs/dlm/midcomms.c141
-rw-r--r--fs/dlm/midcomms.h7
-rw-r--r--fs/dlm/netlink.c1
-rw-r--r--fs/dlm/rcom.c4
-rw-r--r--fs/dlm/requestqueue.c3
-rw-r--r--fs/dlm/user.c85
-rw-r--r--fs/dlm/user.h2
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h2
-rw-r--r--fs/ecryptfs/file.c40
-rw-r--r--fs/ecryptfs/inode.c34
-rw-r--r--fs/ecryptfs/main.c2
-rw-r--r--fs/efivarfs/inode.c4
-rw-r--r--fs/efivarfs/super.c3
-rw-r--r--fs/efivarfs/vars.c16
-rw-r--r--fs/erofs/data.c10
-rw-r--r--fs/erofs/decompressor.c47
-rw-r--r--fs/erofs/decompressor_lzma.c3
-rw-r--r--fs/erofs/erofs_fs.h40
-rw-r--r--fs/erofs/fscache.c759
-rw-r--r--fs/erofs/inode.c34
-rw-r--r--fs/erofs/internal.h70
-rw-r--r--fs/erofs/namei.c15
-rw-r--r--fs/erofs/super.c119
-rw-r--r--fs/erofs/sysfs.c23
-rw-r--r--fs/erofs/xattr.c8
-rw-r--r--fs/erofs/xattr.h2
-rw-r--r--fs/erofs/zdata.c156
-rw-r--r--fs/erofs/zdata.h6
-rw-r--r--fs/erofs/zmap.c133
-rw-r--r--fs/eventfd.c47
-rw-r--r--fs/eventpoll.c20
-rw-r--r--fs/exec.c76
-rw-r--r--fs/exfat/dir.c190
-rw-r--r--fs/exfat/exfat_fs.h56
-rw-r--r--fs/exfat/file.c12
-rw-r--r--fs/exfat/inode.c28
-rw-r--r--fs/exfat/namei.c63
-rw-r--r--fs/exportfs/expfs.c15
-rw-r--r--fs/ext2/acl.c3
-rw-r--r--fs/ext2/acl.h2
-rw-r--r--fs/ext2/balloc.c19
-rw-r--r--fs/ext2/dir.c41
-rw-r--r--fs/ext2/file.c2
-rw-r--r--fs/ext2/ialloc.c3
-rw-r--r--fs/ext2/inode.c8
-rw-r--r--fs/ext2/namei.c10
-rw-r--r--fs/ext2/super.c24
-rw-r--r--fs/ext4/acl.c3
-rw-r--r--fs/ext4/acl.h2
-rw-r--r--fs/ext4/ext4.h19
-rw-r--r--fs/ext4/ext4_jbd2.c14
-rw-r--r--fs/ext4/ext4_jbd2.h10
-rw-r--r--fs/ext4/extents.c141
-rw-r--r--fs/ext4/extents_status.c14
-rw-r--r--fs/ext4/fast_commit.c354
-rw-r--r--fs/ext4/fast_commit.h6
-rw-r--r--fs/ext4/file.c45
-rw-r--r--fs/ext4/ialloc.c13
-rw-r--r--fs/ext4/indirect.c9
-rw-r--r--fs/ext4/inline.c3
-rw-r--r--fs/ext4/inode.c308
-rw-r--r--fs/ext4/ioctl.c38
-rw-r--r--fs/ext4/mballoc.c13
-rw-r--r--fs/ext4/migrate.c6
-rw-r--r--fs/ext4/mmp.c8
-rw-r--r--fs/ext4/move_extent.c78
-rw-r--r--fs/ext4/namei.c84
-rw-r--r--fs/ext4/orphan.c2
-rw-r--r--fs/ext4/page-io.c44
-rw-r--r--fs/ext4/readpage.c21
-rw-r--r--fs/ext4/resize.c33
-rw-r--r--fs/ext4/super.c1303
-rw-r--r--fs/ext4/verity.c11
-rw-r--r--fs/ext4/xattr.c23
-rw-r--r--fs/f2fs/acl.c6
-rw-r--r--fs/f2fs/acl.h2
-rw-r--r--fs/f2fs/checkpoint.c74
-rw-r--r--fs/f2fs/compress.c144
-rw-r--r--fs/f2fs/data.c174
-rw-r--r--fs/f2fs/debug.c140
-rw-r--r--fs/f2fs/dir.c37
-rw-r--r--fs/f2fs/extent_cache.c694
-rw-r--r--fs/f2fs/f2fs.h366
-rw-r--r--fs/f2fs/file.c148
-rw-r--r--fs/f2fs/gc.c103
-rw-r--r--fs/f2fs/inline.c17
-rw-r--r--fs/f2fs/inode.c71
-rw-r--r--fs/f2fs/iostat.c74
-rw-r--r--fs/f2fs/iostat.h4
-rw-r--r--fs/f2fs/namei.c408
-rw-r--r--fs/f2fs/node.c28
-rw-r--r--fs/f2fs/node.h3
-rw-r--r--fs/f2fs/recovery.c33
-rw-r--r--fs/f2fs/segment.c242
-rw-r--r--fs/f2fs/segment.h8
-rw-r--r--fs/f2fs/shrinker.c25
-rw-r--r--fs/f2fs/super.c240
-rw-r--r--fs/f2fs/sysfs.c171
-rw-r--r--fs/f2fs/verity.c15
-rw-r--r--fs/f2fs/xattr.c8
-rw-r--r--fs/fat/dir.c8
-rw-r--r--fs/fat/file.c5
-rw-r--r--fs/fat/inode.c11
-rw-r--r--fs/fat/nfs.c4
-rw-r--r--fs/fhandle.c2
-rw-r--r--fs/file.c12
-rw-r--r--fs/file_table.c7
-rw-r--r--fs/fs-writeback.c72
-rw-r--r--fs/fs_parser.c3
-rw-r--r--fs/fscache/cookie.c8
-rw-r--r--fs/fscache/io.c2
-rw-r--r--fs/fscache/volume.c7
-rw-r--r--fs/fuse/acl.c5
-rw-r--r--fs/fuse/cuse.c5
-rw-r--r--fs/fuse/dev.c65
-rw-r--r--fs/fuse/dir.c71
-rw-r--r--fs/fuse/file.c86
-rw-r--r--fs/fuse/fuse_i.h9
-rw-r--r--fs/fuse/ioctl.c4
-rw-r--r--fs/fuse/readdir.c14
-rw-r--r--fs/gfs2/acl.c3
-rw-r--r--fs/gfs2/acl.h2
-rw-r--r--fs/gfs2/aops.c2
-rw-r--r--fs/gfs2/bmap.c3
-rw-r--r--fs/gfs2/export.c6
-rw-r--r--fs/gfs2/file.c28
-rw-r--r--fs/gfs2/glock.c518
-rw-r--r--fs/gfs2/glock.h67
-rw-r--r--fs/gfs2/glops.c44
-rw-r--r--fs/gfs2/incore.h1
-rw-r--r--fs/gfs2/inode.c83
-rw-r--r--fs/gfs2/lock_dlm.c2
-rw-r--r--fs/gfs2/main.c24
-rw-r--r--fs/gfs2/meta_io.c13
-rw-r--r--fs/gfs2/ops_fstype.c31
-rw-r--r--fs/gfs2/quota.c8
-rw-r--r--fs/gfs2/super.c87
-rw-r--r--fs/gfs2/util.c12
-rw-r--r--fs/gfs2/xattr.c26
-rw-r--r--fs/hfs/bnode.c32
-rw-r--r--fs/hfs/btree.c29
-rw-r--r--fs/hfs/inode.c4
-rw-r--r--fs/hfs/trans.c2
-rw-r--r--fs/hfsplus/bitmap.c20
-rw-r--r--fs/hfsplus/bnode.c105
-rw-r--r--fs/hfsplus/btree.c27
-rw-r--r--fs/hfsplus/hfsplus_fs.h2
-rw-r--r--fs/hfsplus/inode.c6
-rw-r--r--fs/hfsplus/options.c4
-rw-r--r--fs/hostfs/hostfs_kern.c2
-rw-r--r--fs/hpfs/file.c9
-rw-r--r--fs/hugetlbfs/inode.c373
-rw-r--r--fs/inode.c82
-rw-r--r--fs/internal.h50
-rw-r--r--fs/iomap/buffered-io.c257
-rw-r--r--fs/iomap/direct-io.c3
-rw-r--r--fs/iomap/iter.c19
-rw-r--r--fs/iomap/trace.h1
-rw-r--r--fs/isofs/compress.c22
-rw-r--r--fs/isofs/inode.c9
-rw-r--r--fs/jbd2/commit.c17
-rw-r--r--fs/jbd2/journal.c34
-rw-r--r--fs/jbd2/recovery.c17
-rw-r--r--fs/jbd2/transaction.c6
-rw-r--r--fs/jffs2/acl.c3
-rw-r--r--fs/jffs2/acl.h2
-rw-r--r--fs/jffs2/dir.c2
-rw-r--r--fs/jffs2/file.c2
-rw-r--r--fs/jffs2/fs.c2
-rw-r--r--fs/jffs2/wbuf.c6
-rw-r--r--fs/jfs/acl.c3
-rw-r--r--fs/jfs/file.c4
-rw-r--r--fs/jfs/inode.c7
-rw-r--r--fs/jfs/jfs_acl.h2
-rw-r--r--fs/jfs/jfs_dmap.c27
-rw-r--r--fs/jfs/jfs_extent.h2
-rw-r--r--fs/jfs/jfs_imap.c2
-rw-r--r--fs/jfs/jfs_mount.c4
-rw-r--r--fs/jfs/jfs_umount.c4
-rw-r--r--fs/jfs/jfs_xattr.h2
-rw-r--r--fs/jfs/jfs_xtree.h4
-rw-r--r--fs/jfs/namei.c4
-rw-r--r--fs/jfs/super.c6
-rw-r--r--fs/kernfs/dir.c221
-rw-r--r--fs/kernfs/file.c163
-rw-r--r--fs/kernfs/inode.c12
-rw-r--r--fs/kernfs/kernfs-internal.h3
-rw-r--r--fs/kernfs/mount.c10
-rw-r--r--fs/kernfs/symlink.c2
-rw-r--r--fs/ksmbd/auth.c15
-rw-r--r--fs/ksmbd/auth.h3
-rw-r--r--fs/ksmbd/connection.c8
-rw-r--r--fs/ksmbd/connection.h2
-rw-r--r--fs/ksmbd/ksmbd_netlink.h4
-rw-r--r--fs/ksmbd/mgmt/share_config.c36
-rw-r--r--fs/ksmbd/mgmt/share_config.h4
-rw-r--r--fs/ksmbd/mgmt/tree_connect.c6
-rw-r--r--fs/ksmbd/mgmt/tree_connect.h2
-rw-r--r--fs/ksmbd/mgmt/user_session.c8
-rw-r--r--fs/ksmbd/misc.c46
-rw-r--r--fs/ksmbd/misc.h5
-rw-r--r--fs/ksmbd/ndr.c8
-rw-r--r--fs/ksmbd/oplock.c27
-rw-r--r--fs/ksmbd/server.c24
-rw-r--r--fs/ksmbd/smb2ops.c10
-rw-r--r--fs/ksmbd/smb2pdu.c170
-rw-r--r--fs/ksmbd/smb2pdu.h9
-rw-r--r--fs/ksmbd/smb_common.c8
-rw-r--r--fs/ksmbd/smb_common.h12
-rw-r--r--fs/ksmbd/smbacl.c18
-rw-r--r--fs/ksmbd/smbacl.h18
-rw-r--r--fs/ksmbd/transport_ipc.c1
-rw-r--r--fs/ksmbd/transport_rdma.c8
-rw-r--r--fs/ksmbd/transport_tcp.c3
-rw-r--r--fs/ksmbd/unicode.h3
-rw-r--r--fs/ksmbd/vfs.c79
-rw-r--r--fs/ksmbd/vfs.h8
-rw-r--r--fs/libfs.c68
-rw-r--r--fs/lockd/host.c2
-rw-r--r--fs/lockd/svc4proc.c25
-rw-r--r--fs/lockd/svclock.c17
-rw-r--r--fs/lockd/svcproc.c25
-rw-r--r--fs/lockd/svcsubs.c21
-rw-r--r--fs/locks.c50
-rw-r--r--fs/mbcache.c31
-rw-r--r--fs/minix/namei.c6
-rw-r--r--fs/namei.c139
-rw-r--r--fs/namespace.c179
-rw-r--r--fs/netfs/buffered_read.c20
-rw-r--r--fs/netfs/io.c9
-rw-r--r--fs/nfs/Kconfig8
-rw-r--r--fs/nfs/callback_xdr.c1
-rw-r--r--fs/nfs/client.c4
-rw-r--r--fs/nfs/delegation.c38
-rw-r--r--fs/nfs/dir.c39
-rw-r--r--fs/nfs/dns_resolve.c7
-rw-r--r--fs/nfs/dns_resolve.h2
-rw-r--r--fs/nfs/file.c9
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c113
-rw-r--r--fs/nfs/fs_context.c20
-rw-r--r--fs/nfs/fscache.c4
-rw-r--r--fs/nfs/inode.c21
-rw-r--r--fs/nfs/internal.h22
-rw-r--r--fs/nfs/mount_clnt.c4
-rw-r--r--fs/nfs/namespace.c6
-rw-r--r--fs/nfs/nfs3_fs.h2
-rw-r--r--fs/nfs/nfs3acl.c9
-rw-r--r--fs/nfs/nfs3client.c4
-rw-r--r--fs/nfs/nfs3proc.c7
-rw-r--r--fs/nfs/nfs42proc.c9
-rw-r--r--fs/nfs/nfs42xattr.c2
-rw-r--r--fs/nfs/nfs42xdr.c17
-rw-r--r--fs/nfs/nfs4_fs.h4
-rw-r--r--fs/nfs/nfs4client.c21
-rw-r--r--fs/nfs/nfs4file.c12
-rw-r--r--fs/nfs/nfs4idmap.c4
-rw-r--r--fs/nfs/nfs4namespace.c16
-rw-r--r--fs/nfs/nfs4proc.c85
-rw-r--r--fs/nfs/nfs4state.c22
-rw-r--r--fs/nfs/nfs4trace.h60
-rw-r--r--fs/nfs/nfs4xdr.c22
-rw-r--r--fs/nfs/nfsroot.c2
-rw-r--r--fs/nfs/nfstrace.h6
-rw-r--r--fs/nfs/pagelist.c2
-rw-r--r--fs/nfs/pnfs.c13
-rw-r--r--fs/nfs/pnfs.h9
-rw-r--r--fs/nfs/pnfs_nfs.c10
-rw-r--r--fs/nfs/super.c5
-rw-r--r--fs/nfs/sysfs.c6
-rw-r--r--fs/nfs/unlink.c1
-rw-r--r--fs/nfs/write.c4
-rw-r--r--fs/nfsd/Kconfig19
-rw-r--r--fs/nfsd/Makefile5
-rw-r--r--fs/nfsd/blocklayout.c1
-rw-r--r--fs/nfsd/blocklayoutxdr.c1
-rw-r--r--fs/nfsd/cache.h2
-rw-r--r--fs/nfsd/export.h1
-rw-r--r--fs/nfsd/filecache.c580
-rw-r--r--fs/nfsd/filecache.h6
-rw-r--r--fs/nfsd/flexfilelayout.c1
-rw-r--r--fs/nfsd/netns.h4
-rw-r--r--fs/nfsd/nfs2acl.c23
-rw-r--r--fs/nfsd/nfs3acl.c41
-rw-r--r--fs/nfsd/nfs3proc.c53
-rw-r--r--fs/nfsd/nfs3xdr.c18
-rw-r--r--fs/nfsd/nfs4acl.c4
-rw-r--r--fs/nfsd/nfs4callback.c92
-rw-r--r--fs/nfsd/nfs4idmap.c9
-rw-r--r--fs/nfsd/nfs4layouts.c2
-rw-r--r--fs/nfsd/nfs4proc.c295
-rw-r--r--fs/nfsd/nfs4recover.c22
-rw-r--r--fs/nfsd/nfs4state.c542
-rw-r--r--fs/nfsd/nfs4xdr.c873
-rw-r--r--fs/nfsd/nfscache.c13
-rw-r--r--fs/nfsd/nfsctl.c64
-rw-r--r--fs/nfsd/nfsd.h16
-rw-r--r--fs/nfsd/nfsfh.c8
-rw-r--r--fs/nfsd/nfsfh.h10
-rw-r--r--fs/nfsd/nfsproc.c105
-rw-r--r--fs/nfsd/nfssvc.c10
-rw-r--r--fs/nfsd/nfsxdr.c4
-rw-r--r--fs/nfsd/state.h22
-rw-r--r--fs/nfsd/stats.c14
-rw-r--r--fs/nfsd/trace.h325
-rw-r--r--fs/nfsd/vfs.c236
-rw-r--r--fs/nfsd/vfs.h8
-rw-r--r--fs/nfsd/xdr4.h14
-rw-r--r--fs/nfsd/xdr4cb.h6
-rw-r--r--fs/nilfs2/btree.c6
-rw-r--r--fs/nilfs2/dat.c7
-rw-r--r--fs/nilfs2/inode.c19
-rw-r--r--fs/nilfs2/page.c45
-rw-r--r--fs/nilfs2/segment.c40
-rw-r--r--fs/nilfs2/sufile.c8
-rw-r--r--fs/nilfs2/super.c2
-rw-r--r--fs/nilfs2/the_nilfs.c75
-rw-r--r--fs/notify/fanotify/fanotify.c2
-rw-r--r--fs/notify/fanotify/fanotify.h8
-rw-r--r--fs/notify/fanotify/fanotify_user.c6
-rw-r--r--fs/notify/fsnotify.h4
-rw-r--r--fs/nsfs.c2
-rw-r--r--fs/ntfs/attrib.c28
-rw-r--r--fs/ntfs/file.c4
-rw-r--r--fs/ntfs/inode.c7
-rw-r--r--fs/ntfs/super.c3
-rw-r--r--fs/ntfs3/file.c4
-rw-r--r--fs/ntfs3/fslog.c6
-rw-r--r--fs/ntfs3/inode.c14
-rw-r--r--fs/ntfs3/namei.c6
-rw-r--r--fs/ntfs3/ntfs_fs.h4
-rw-r--r--fs/ntfs3/xattr.c103
-rw-r--r--fs/ocfs2/acl.c3
-rw-r--r--fs/ocfs2/acl.h2
-rw-r--r--fs/ocfs2/aops.c2
-rw-r--r--fs/ocfs2/cluster/heartbeat.c38
-rw-r--r--fs/ocfs2/cluster/heartbeat.h2
-rw-r--r--fs/ocfs2/cluster/netdebug.c2
-rw-r--r--fs/ocfs2/cluster/nodemanager.c2
-rw-r--r--fs/ocfs2/cluster/tcp.c9
-rw-r--r--fs/ocfs2/dir.c10
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h2
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c19
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c30
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c2
-rw-r--r--fs/ocfs2/file.c8
-rw-r--r--fs/ocfs2/journal.c16
-rw-r--r--fs/ocfs2/journal.h1
-rw-r--r--fs/ocfs2/namei.c25
-rw-r--r--fs/ocfs2/ocfs2.h3
-rw-r--r--fs/ocfs2/ocfs2_fs.h8
-rw-r--r--fs/ocfs2/refcounttree.c2
-rw-r--r--fs/ocfs2/stack_o2cb.c6
-rw-r--r--fs/ocfs2/stack_user.c2
-rw-r--r--fs/ocfs2/stackglue.c12
-rw-r--r--fs/ocfs2/suballoc.h2
-rw-r--r--fs/ocfs2/super.c11
-rw-r--r--fs/omfs/file.c7
-rw-r--r--fs/open.c21
-rw-r--r--fs/orangefs/acl.c47
-rw-r--r--fs/orangefs/dir.c2
-rw-r--r--fs/orangefs/file.c5
-rw-r--r--fs/orangefs/inode.c64
-rw-r--r--fs/orangefs/namei.c2
-rw-r--r--fs/orangefs/orangefs-debugfs.c29
-rw-r--r--fs/orangefs/orangefs-kernel.h7
-rw-r--r--fs/orangefs/orangefs-mod.c8
-rw-r--r--fs/orangefs/orangefs-sysfs.c71
-rw-r--r--fs/overlayfs/Kconfig2
-rw-r--r--fs/overlayfs/copy_up.c154
-rw-r--r--fs/overlayfs/dir.c68
-rw-r--r--fs/overlayfs/export.c8
-rw-r--r--fs/overlayfs/file.c33
-rw-r--r--fs/overlayfs/inode.c193
-rw-r--r--fs/overlayfs/namei.c16
-rw-r--r--fs/overlayfs/overlayfs.h89
-rw-r--r--fs/overlayfs/readdir.c102
-rw-r--r--fs/overlayfs/super.c119
-rw-r--r--fs/overlayfs/util.c25
-rw-r--r--fs/pipe.c2
-rw-r--r--fs/pnode.c2
-rw-r--r--fs/posix_acl.c602
-rw-r--r--fs/proc/Kconfig1
-rw-r--r--fs/proc/array.c2
-rw-r--r--fs/proc/base.c24
-rw-r--r--fs/proc/cmdline.c6
-rw-r--r--fs/proc/consoles.c21
-rw-r--r--fs/proc/devices.c6
-rw-r--r--fs/proc/fd.c45
-rw-r--r--fs/proc/internal.h7
-rw-r--r--fs/proc/kcore.c33
-rw-r--r--fs/proc/kmsg.c2
-rw-r--r--fs/proc/loadavg.c6
-rw-r--r--fs/proc/meminfo.c7
-rw-r--r--fs/proc/page.c6
-rw-r--r--fs/proc/proc_sysctl.c9
-rw-r--r--fs/proc/softirqs.c6
-rw-r--r--fs/proc/task_mmu.c112
-rw-r--r--fs/proc/task_nommu.c45
-rw-r--r--fs/proc/uptime.c6
-rw-r--r--fs/proc/version.c6
-rw-r--r--fs/proc/vmcore.c7
-rw-r--r--fs/pstore/platform.c88
-rw-r--r--fs/pstore/pmsg.c2
-rw-r--r--fs/pstore/ram.c44
-rw-r--r--fs/pstore/ram_core.c20
-rw-r--r--fs/pstore/ram_internal.h98
-rw-r--r--fs/pstore/zone.c2
-rw-r--r--fs/qnx6/inode.c6
-rw-r--r--fs/quota/dquot.c2
-rw-r--r--fs/quota/quota_tree.c73
-rw-r--r--fs/ramfs/file-nommu.c50
-rw-r--r--fs/ramfs/inode.c6
-rw-r--r--fs/read_write.c51
-rw-r--r--fs/readdir.c68
-rw-r--r--fs/reiserfs/acl.h6
-rw-r--r--fs/reiserfs/file.c2
-rw-r--r--fs/reiserfs/inode.c2
-rw-r--r--fs/reiserfs/journal.c11
-rw-r--r--fs/reiserfs/namei.c8
-rw-r--r--fs/reiserfs/prints.c2
-rw-r--r--fs/reiserfs/procfs.c4
-rw-r--r--fs/reiserfs/resize.c2
-rw-r--r--fs/reiserfs/stree.c4
-rw-r--r--fs/reiserfs/super.c11
-rw-r--r--fs/reiserfs/xattr.c20
-rw-r--r--fs/reiserfs/xattr_acl.c11
-rw-r--r--fs/reiserfs/xattr_security.c2
-rw-r--r--fs/remap_range.c9
-rw-r--r--fs/seq_file.c2
-rw-r--r--fs/smbfs_common/smb2pdu.h6
-rw-r--r--fs/splice.c10
-rw-r--r--fs/squashfs/Kconfig51
-rw-r--r--fs/squashfs/block.c2
-rw-r--r--fs/squashfs/decompressor.c2
-rw-r--r--fs/squashfs/decompressor_multi.c20
-rw-r--r--fs/squashfs/decompressor_multi_percpu.c23
-rw-r--r--fs/squashfs/decompressor_single.c15
-rw-r--r--fs/squashfs/file.c23
-rw-r--r--fs/squashfs/page_actor.c3
-rw-r--r--fs/squashfs/page_actor.h6
-rw-r--r--fs/squashfs/squashfs.h23
-rw-r--r--fs/squashfs/squashfs_fs_sb.h4
-rw-r--r--fs/squashfs/super.c102
-rw-r--r--fs/stat.c21
-rw-r--r--fs/super.c63
-rw-r--r--fs/sysv/itree.c2
-rw-r--r--fs/ubifs/crypto.c11
-rw-r--r--fs/ubifs/debug.c10
-rw-r--r--fs/ubifs/dir.c34
-rw-r--r--fs/ubifs/journal.c30
-rw-r--r--fs/ubifs/lpt_commit.c14
-rw-r--r--fs/ubifs/tnc_commit.c2
-rw-r--r--fs/ubifs/ubifs.h2
-rw-r--r--fs/ubifs/xattr.c2
-rw-r--r--fs/udf/dir.c2
-rw-r--r--fs/udf/directory.c2
-rw-r--r--fs/udf/file.c1
-rw-r--r--fs/udf/inode.c91
-rw-r--r--fs/udf/namei.c16
-rw-r--r--fs/udf/super.c4
-rw-r--r--fs/udf/truncate.c48
-rw-r--r--fs/udf/udf_sb.h6
-rw-r--r--fs/ufs/balloc.c12
-rw-r--r--fs/userfaultfd.c140
-rw-r--r--fs/verity/fsverity_private.h7
-rw-r--r--fs/verity/hash_algs.c6
-rw-r--r--fs/verity/measure.c19
-rw-r--r--fs/verity/read_metadata.c6
-rw-r--r--fs/verity/verify.c26
-rw-r--r--fs/xattr.c451
-rw-r--r--fs/xfs/libxfs/xfs_ag.h15
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c8
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c10
-rw-r--r--fs/xfs/libxfs/xfs_btree.h1
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c50
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h4
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c9
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c4
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h18
-rw-r--r--fs/xfs/libxfs/xfs_format.h22
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c4
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c4
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h60
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c432
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h40
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c15
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c9
-rw-r--r--fs/xfs/libxfs/xfs_sb.c4
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c4
-rw-r--r--fs/xfs/libxfs/xfs_types.h30
-rw-r--r--fs/xfs/scrub/agheader.c47
-rw-r--r--fs/xfs/scrub/agheader_repair.c81
-rw-r--r--fs/xfs/scrub/alloc.c4
-rw-r--r--fs/xfs/scrub/attr.c11
-rw-r--r--fs/xfs/scrub/bitmap.c11
-rw-r--r--fs/xfs/scrub/bmap.c147
-rw-r--r--fs/xfs/scrub/btree.c14
-rw-r--r--fs/xfs/scrub/common.c48
-rw-r--r--fs/xfs/scrub/common.h2
-rw-r--r--fs/xfs/scrub/dabtree.c4
-rw-r--r--fs/xfs/scrub/dir.c20
-rw-r--r--fs/xfs/scrub/fscounters.c109
-rw-r--r--fs/xfs/scrub/ialloc.c5
-rw-r--r--fs/xfs/scrub/inode.c2
-rw-r--r--fs/xfs/scrub/parent.c4
-rw-r--r--fs/xfs/scrub/quota.c8
-rw-r--r--fs/xfs/scrub/refcount.c84
-rw-r--r--fs/xfs/scrub/repair.c51
-rw-r--r--fs/xfs/scrub/scrub.c6
-rw-r--r--fs/xfs/scrub/scrub.h18
-rw-r--r--fs/xfs/scrub/symlink.c2
-rw-r--r--fs/xfs/xfs_acl.c3
-rw-r--r--fs/xfs/xfs_acl.h2
-rw-r--r--fs/xfs/xfs_aops.c32
-rw-r--r--fs/xfs/xfs_attr_item.c73
-rw-r--r--fs/xfs/xfs_bmap_item.c54
-rw-r--r--fs/xfs/xfs_bmap_util.c10
-rw-r--r--fs/xfs/xfs_bmap_util.h2
-rw-r--r--fs/xfs/xfs_buf.c1
-rw-r--r--fs/xfs/xfs_buf_item.c2
-rw-r--r--fs/xfs/xfs_dir2_readdir.c2
-rw-r--r--fs/xfs/xfs_error.c57
-rw-r--r--fs/xfs/xfs_error.h13
-rw-r--r--fs/xfs/xfs_extfree_item.c94
-rw-r--r--fs/xfs/xfs_extfree_item.h16
-rw-r--r--fs/xfs/xfs_file.c9
-rw-r--r--fs/xfs/xfs_fsmap.c4
-rw-r--r--fs/xfs/xfs_icache.c8
-rw-r--r--fs/xfs/xfs_inode.c17
-rw-r--r--fs/xfs/xfs_inode_item.c2
-rw-r--r--fs/xfs/xfs_inode_item_recover.c4
-rw-r--r--fs/xfs/xfs_ioctl.c4
-rw-r--r--fs/xfs/xfs_iomap.c191
-rw-r--r--fs/xfs/xfs_iomap.h6
-rw-r--r--fs/xfs/xfs_iops.c52
-rw-r--r--fs/xfs/xfs_iops.h1
-rw-r--r--fs/xfs/xfs_itable.c8
-rw-r--r--fs/xfs/xfs_log.c58
-rw-r--r--fs/xfs/xfs_log_recover.c10
-rw-r--r--fs/xfs/xfs_mount.c53
-rw-r--r--fs/xfs/xfs_notify_failure.c32
-rw-r--r--fs/xfs/xfs_ondisk.h23
-rw-r--r--fs/xfs/xfs_pnfs.c6
-rw-r--r--fs/xfs/xfs_qm.c16
-rw-r--r--fs/xfs/xfs_refcount_item.c57
-rw-r--r--fs/xfs/xfs_reflink.c30
-rw-r--r--fs/xfs/xfs_rmap_item.c70
-rw-r--r--fs/xfs/xfs_rtalloc.c60
-rw-r--r--fs/xfs/xfs_stats.c4
-rw-r--r--fs/xfs/xfs_super.c24
-rw-r--r--fs/xfs/xfs_sysfs.h7
-rw-r--r--fs/xfs/xfs_trace.c2
-rw-r--r--fs/xfs/xfs_trace.h138
-rw-r--r--fs/xfs/xfs_trans_ail.c15
-rw-r--r--fs/xfs/xfs_xattr.c2
-rw-r--r--fs/zonefs/super.c60
-rw-r--r--fs/zonefs/sysfs.c5
-rw-r--r--fs/zonefs/zonefs.h6
797 files changed, 37332 insertions, 27288 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 4dac4a0dc5f4..c397c51f80d9 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -17,34 +17,64 @@
#include "v9fs_vfs.h"
#include "fid.h"
-static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
+static struct posix_acl *v9fs_fid_get_acl(struct p9_fid *fid, const char *name)
{
ssize_t size;
void *value = NULL;
struct posix_acl *acl = NULL;
size = v9fs_fid_xattr_get(fid, name, NULL, 0);
- if (size > 0) {
- value = kzalloc(size, GFP_NOFS);
- if (!value)
- return ERR_PTR(-ENOMEM);
- size = v9fs_fid_xattr_get(fid, name, value, size);
- if (size > 0) {
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (IS_ERR(acl))
- goto err_out;
- }
- } else if (size == -ENODATA || size == 0 ||
- size == -ENOSYS || size == -EOPNOTSUPP) {
- acl = NULL;
- } else
- acl = ERR_PTR(-EIO);
-
-err_out:
+ if (size < 0)
+ return ERR_PTR(size);
+ if (size == 0)
+ return ERR_PTR(-ENODATA);
+
+ value = kzalloc(size, GFP_NOFS);
+ if (!value)
+ return ERR_PTR(-ENOMEM);
+
+ size = v9fs_fid_xattr_get(fid, name, value, size);
+ if (size < 0)
+ acl = ERR_PTR(size);
+ else if (size == 0)
+ acl = ERR_PTR(-ENODATA);
+ else
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
kfree(value);
return acl;
}
+static struct posix_acl *v9fs_acl_get(struct dentry *dentry, const char *name)
+{
+ struct p9_fid *fid;
+ struct posix_acl *acl = NULL;
+
+ fid = v9fs_fid_lookup(dentry);
+ if (IS_ERR(fid))
+ return ERR_CAST(fid);
+
+ acl = v9fs_fid_get_acl(fid, name);
+ p9_fid_put(fid);
+ return acl;
+}
+
+static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, const char *name)
+{
+ int retval;
+ struct posix_acl *acl = NULL;
+
+ acl = v9fs_fid_get_acl(fid, name);
+ if (!IS_ERR(acl))
+ return acl;
+
+ retval = PTR_ERR(acl);
+ if (retval == -ENODATA || retval == -ENOSYS || retval == -EOPNOTSUPP)
+ return NULL;
+
+ /* map everything else to -EIO */
+ return ERR_PTR(-EIO);
+}
+
int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
{
int retval = 0;
@@ -89,7 +119,7 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
return acl;
}
-struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, bool rcu)
+struct posix_acl *v9fs_iop_get_inode_acl(struct inode *inode, int type, bool rcu)
{
struct v9fs_session_info *v9ses;
@@ -109,6 +139,112 @@ struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, bool rcu)
}
+struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, int type)
+{
+ struct v9fs_session_info *v9ses;
+
+ v9ses = v9fs_dentry2v9ses(dentry);
+ /* We allow set/get/list of acl when access=client is not specified. */
+ if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
+ return v9fs_acl_get(dentry, posix_acl_xattr_name(type));
+ return v9fs_get_cached_acl(d_inode(dentry), type);
+}
+
+int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct posix_acl *acl, int type)
+{
+ int retval;
+ size_t size = 0;
+ void *value = NULL;
+ const char *acl_name;
+ struct v9fs_session_info *v9ses;
+ struct inode *inode = d_inode(dentry);
+
+ if (acl) {
+ retval = posix_acl_valid(inode->i_sb->s_user_ns, acl);
+ if (retval)
+ goto err_out;
+
+ size = posix_acl_xattr_size(acl->a_count);
+
+ value = kzalloc(size, GFP_NOFS);
+ if (!value) {
+ retval = -ENOMEM;
+ goto err_out;
+ }
+
+ retval = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+ if (retval < 0)
+ goto err_out;
+ }
+
+ /*
+ * set the attribute on the remote. Without even looking at the
+ * xattr value. We leave it to the server to validate
+ */
+ acl_name = posix_acl_xattr_name(type);
+ v9ses = v9fs_dentry2v9ses(dentry);
+ if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
+ retval = v9fs_xattr_set(dentry, acl_name, value, size, 0);
+ goto err_out;
+ }
+
+ if (S_ISLNK(inode->i_mode)) {
+ retval = -EOPNOTSUPP;
+ goto err_out;
+ }
+
+ if (!inode_owner_or_capable(&init_user_ns, inode)) {
+ retval = -EPERM;
+ goto err_out;
+ }
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ if (acl) {
+ struct iattr iattr = {};
+ struct posix_acl *acl_mode = acl;
+
+ retval = posix_acl_update_mode(&init_user_ns, inode,
+ &iattr.ia_mode,
+ &acl_mode);
+ if (retval)
+ goto err_out;
+ if (!acl_mode) {
+ /*
+ * ACL can be represented by the mode bits.
+ * So don't update ACL below.
+ */
+ kfree(value);
+ value = NULL;
+ size = 0;
+ }
+ iattr.ia_valid = ATTR_MODE;
+ /*
+ * FIXME should we update ctime ?
+ * What is the following setxattr update the mode ?
+ */
+ v9fs_vfs_setattr_dotl(&init_user_ns, dentry, &iattr);
+ }
+ break;
+ case ACL_TYPE_DEFAULT:
+ if (!S_ISDIR(inode->i_mode)) {
+ retval = acl ? -EINVAL : 0;
+ goto err_out;
+ }
+ break;
+ }
+
+ retval = v9fs_xattr_set(dentry, acl_name, value, size, 0);
+ if (!retval)
+ set_cached_acl(inode, type, acl);
+
+err_out:
+ kfree(value);
+ return retval;
+}
+
static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl)
{
int retval;
@@ -207,124 +343,3 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep,
*modep = mode;
return 0;
}
-
-static int v9fs_xattr_get_acl(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *inode,
- const char *name, void *buffer, size_t size)
-{
- struct v9fs_session_info *v9ses;
- struct posix_acl *acl;
- int error;
-
- v9ses = v9fs_dentry2v9ses(dentry);
- /*
- * We allow set/get/list of acl when access=client is not specified
- */
- if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
- return v9fs_xattr_get(dentry, handler->name, buffer, size);
-
- acl = v9fs_get_cached_acl(inode, handler->flags);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
- error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
- posix_acl_release(acl);
-
- return error;
-}
-
-static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
- struct user_namespace *mnt_userns,
- struct dentry *dentry, struct inode *inode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- int retval;
- struct posix_acl *acl;
- struct v9fs_session_info *v9ses;
-
- v9ses = v9fs_dentry2v9ses(dentry);
- /*
- * set the attribute on the remote. Without even looking at the
- * xattr value. We leave it to the server to validate
- */
- if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
- return v9fs_xattr_set(dentry, handler->name, value, size,
- flags);
-
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
- if (!inode_owner_or_capable(&init_user_ns, inode))
- return -EPERM;
- if (value) {
- /* update the cached acl value */
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- else if (acl) {
- retval = posix_acl_valid(inode->i_sb->s_user_ns, acl);
- if (retval)
- goto err_out;
- }
- } else
- acl = NULL;
-
- switch (handler->flags) {
- case ACL_TYPE_ACCESS:
- if (acl) {
- struct iattr iattr = { 0 };
- struct posix_acl *old_acl = acl;
-
- retval = posix_acl_update_mode(&init_user_ns, inode,
- &iattr.ia_mode, &acl);
- if (retval)
- goto err_out;
- if (!acl) {
- /*
- * ACL can be represented
- * by the mode bits. So don't
- * update ACL.
- */
- posix_acl_release(old_acl);
- value = NULL;
- size = 0;
- }
- iattr.ia_valid = ATTR_MODE;
- /* FIXME should we update ctime ?
- * What is the following setxattr update the
- * mode ?
- */
- v9fs_vfs_setattr_dotl(&init_user_ns, dentry, &iattr);
- }
- break;
- case ACL_TYPE_DEFAULT:
- if (!S_ISDIR(inode->i_mode)) {
- retval = acl ? -EINVAL : 0;
- goto err_out;
- }
- break;
- default:
- BUG();
- }
- retval = v9fs_xattr_set(dentry, handler->name, value, size, flags);
- if (!retval)
- set_cached_acl(inode, handler->flags, acl);
-err_out:
- posix_acl_release(acl);
- return retval;
-}
-
-const struct xattr_handler v9fs_xattr_acl_access_handler = {
- .name = XATTR_NAME_POSIX_ACL_ACCESS,
- .flags = ACL_TYPE_ACCESS,
- .get = v9fs_xattr_get_acl,
- .set = v9fs_xattr_set_acl,
-};
-
-const struct xattr_handler v9fs_xattr_acl_default_handler = {
- .name = XATTR_NAME_POSIX_ACL_DEFAULT,
- .flags = ACL_TYPE_DEFAULT,
- .get = v9fs_xattr_get_acl,
- .set = v9fs_xattr_set_acl,
-};
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
index ce5175d463dd..4c60a2bce5de 100644
--- a/fs/9p/acl.h
+++ b/fs/9p/acl.h
@@ -8,8 +8,12 @@
#ifdef CONFIG_9P_FS_POSIX_ACL
int v9fs_get_acl(struct inode *inode, struct p9_fid *fid);
-struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type,
+struct posix_acl *v9fs_iop_get_inode_acl(struct inode *inode, int type,
bool rcu);
+struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, int type);
+int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct posix_acl *acl, int type);
int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid);
int v9fs_set_create_acl(struct inode *inode, struct p9_fid *fid,
struct posix_acl *dacl, struct posix_acl *acl);
@@ -17,7 +21,9 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep,
struct posix_acl **dpacl, struct posix_acl **pacl);
void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl);
#else
+#define v9fs_iop_get_inode_acl NULL
#define v9fs_iop_get_acl NULL
+#define v9fs_iop_set_acl NULL
static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
{
return 0;
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 47b9a1122f34..a19891015f19 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -40,7 +40,7 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
size_t len = subreq->len - subreq->transferred;
int total, err;
- iov_iter_xarray(&to, READ, &rreq->mapping->i_pages, pos, len);
+ iov_iter_xarray(&to, ITER_DEST, &rreq->mapping->i_pages, pos, len);
total = p9_client_read(fid, pos, &to, &err);
@@ -172,7 +172,7 @@ static int v9fs_vfs_write_folio_locked(struct folio *folio)
len = min_t(loff_t, i_size - start, len);
- iov_iter_xarray(&from, WRITE, &folio_mapping(folio)->i_pages, start, len);
+ iov_iter_xarray(&from, ITER_SOURCE, &folio_mapping(folio)->i_pages, start, len);
/* We should have writeback_fid always set */
BUG_ON(!v9inode->writeback_fid);
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 000fbaae9b18..3bb95adc9619 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -109,7 +109,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
struct iov_iter to;
int n;
- iov_iter_kvec(&to, READ, &kvec, 1, buflen);
+ iov_iter_kvec(&to, ITER_DEST, &kvec, 1, buflen);
n = p9_client_read(file->private_data, ctx->pos, &to,
&err);
if (err)
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 5cfa4b4f070f..03c1743c4aff 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -983,14 +983,18 @@ const struct inode_operations v9fs_dir_inode_operations_dotl = {
.getattr = v9fs_vfs_getattr_dotl,
.setattr = v9fs_vfs_setattr_dotl,
.listxattr = v9fs_listxattr,
+ .get_inode_acl = v9fs_iop_get_inode_acl,
.get_acl = v9fs_iop_get_acl,
+ .set_acl = v9fs_iop_set_acl,
};
const struct inode_operations v9fs_file_inode_operations_dotl = {
.getattr = v9fs_vfs_getattr_dotl,
.setattr = v9fs_vfs_setattr_dotl,
.listxattr = v9fs_listxattr,
+ .get_inode_acl = v9fs_iop_get_inode_acl,
.get_acl = v9fs_iop_get_acl,
+ .set_acl = v9fs_iop_set_acl,
};
const struct inode_operations v9fs_symlink_inode_operations_dotl = {
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 1f9298a4bd42..b6984311e00a 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -8,6 +8,7 @@
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/uio.h>
+#include <linux/posix_acl_xattr.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
@@ -24,7 +25,7 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
struct iov_iter to;
int err;
- iov_iter_kvec(&to, READ, &kvec, 1, buffer_size);
+ iov_iter_kvec(&to, ITER_DEST, &kvec, 1, buffer_size);
attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
if (IS_ERR(attr_fid)) {
@@ -109,7 +110,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
struct iov_iter from;
int retval, err;
- iov_iter_kvec(&from, WRITE, &kvec, 1, value_len);
+ iov_iter_kvec(&from, ITER_SOURCE, &kvec, 1, value_len);
p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n",
name, value_len, flags);
@@ -182,9 +183,9 @@ static struct xattr_handler v9fs_xattr_security_handler = {
const struct xattr_handler *v9fs_xattr_handlers[] = {
&v9fs_xattr_user_handler,
&v9fs_xattr_trusted_handler,
-#ifdef CONFIG_9P_FS_POSIX_ACL
- &v9fs_xattr_acl_access_handler,
- &v9fs_xattr_acl_default_handler,
+#ifdef CONFIG_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
#ifdef CONFIG_9P_FS_SECURITY
&v9fs_xattr_security_handler,
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index 3e11fc3331eb..b5636e544c8a 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -11,8 +11,6 @@
#include <net/9p/client.h>
extern const struct xattr_handler *v9fs_xattr_handlers[];
-extern const struct xattr_handler v9fs_xattr_acl_access_handler;
-extern const struct xattr_handler v9fs_xattr_acl_default_handler;
ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
void *buffer, size_t buffer_size);
diff --git a/fs/Kconfig b/fs/Kconfig
index a547307c1ae8..2685a4d0d353 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -235,6 +235,7 @@ config ARCH_SUPPORTS_HUGETLBFS
config HUGETLBFS
bool "HugeTLB file system support"
depends on X86 || IA64 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
+ depends on (SYSFS || SYSCTL)
help
hugetlbfs is a filesystem backing for HugeTLB pages, based on
ramfs. For architectures that support it, say Y here and read
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 21e154516bf2..93539aac0e5b 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -58,7 +58,7 @@ config ARCH_USE_GNU_PROPERTY
config BINFMT_ELF_FDPIC
bool "Kernel support for FDPIC ELF binaries"
default y if !BINFMT_ELF
- depends on ARM || ((M68K || SUPERH) && !MMU)
+ depends on ARM || ((M68K || SUPERH || XTENSA) && !MMU)
select ELFCORE
help
ELF FDPIC binaries are based on ELF, but allow the individual load
@@ -142,39 +142,6 @@ config BINFMT_ZFLAT
help
Support FLAT format compressed binaries
-config HAVE_AOUT
- def_bool n
-
-config BINFMT_AOUT
- tristate "Kernel support for a.out and ECOFF binaries"
- depends on HAVE_AOUT
- help
- A.out (Assembler.OUTput) is a set of formats for libraries and
- executables used in the earliest versions of UNIX. Linux used
- the a.out formats QMAGIC and ZMAGIC until they were replaced
- with the ELF format.
-
- The conversion to ELF started in 1995. This option is primarily
- provided for historical interest and for the benefit of those
- who need to run binaries from that era.
-
- Most people should answer N here. If you think you may have
- occasional use for this format, enable module support above
- and answer M here to compile this support as a module called
- binfmt_aout.
-
- If any crucial components of your system (such as /sbin/init
- or /lib/ld.so) are still in a.out format, you will have to
- say Y here.
-
-config OSF4_COMPAT
- bool "OSF/1 v4 readv/writev compatibility"
- depends on ALPHA && BINFMT_AOUT
- help
- Say Y if you are using OSF/1 binaries (like Netscape and Acrobat)
- with v4 shared libraries freely available from Compaq. If you're
- going to use shared libraries from Tru64 version 5.0 or later, say N.
-
config BINFMT_MISC
tristate "Kernel support for MISC binaries"
help
diff --git a/fs/Makefile b/fs/Makefile
index 93b80529f8e8..4dea17840761 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -38,7 +38,6 @@ obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FS_VERITY) += verity/
obj-$(CONFIG_FILE_LOCKING) += locks.o
-obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
obj-$(CONFIG_BINFMT_SCRIPT) += binfmt_script.o
obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 4c5f30a83336..58b391446ae1 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -276,7 +276,7 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved,
char *vol = match_strdup(&args[0]);
if (!vol)
return 0;
- strlcpy(volume, vol, 32);
+ strscpy(volume, vol, 32);
kfree(vol);
break;
}
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 0a090d614e76..7dcd59693a0c 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -298,7 +298,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
if (call->count2 != call->count && call->count2 != 0)
return afs_protocol_error(call, afs_eproto_cb_count);
call->iter = &call->def_iter;
- iov_iter_discard(&call->def_iter, READ, call->count2 * 3 * 4);
+ iov_iter_discard(&call->def_iter, ITER_DEST, call->count2 * 3 * 4);
call->unmarshall++;
fallthrough;
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 56ae5cd5184f..104df2964225 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -24,9 +24,9 @@ static int afs_readdir(struct file *file, struct dir_context *ctx);
static int afs_d_revalidate(struct dentry *dentry, unsigned int flags);
static int afs_d_delete(const struct dentry *dentry);
static void afs_d_iput(struct dentry *dentry, struct inode *inode);
-static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen,
+static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen,
loff_t fpos, u64 ino, unsigned dtype);
-static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen,
+static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen,
loff_t fpos, u64 ino, unsigned dtype);
static int afs_create(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t mode, bool excl);
@@ -305,7 +305,7 @@ expand:
req->actual_len = i_size; /* May change */
req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */
req->data_version = dvnode->status.data_version; /* May change */
- iov_iter_xarray(&req->def_iter, READ, &dvnode->netfs.inode.i_mapping->i_pages,
+ iov_iter_xarray(&req->def_iter, ITER_DEST, &dvnode->netfs.inode.i_mapping->i_pages,
0, i_size);
req->iter = &req->def_iter;
@@ -568,7 +568,7 @@ static int afs_readdir(struct file *file, struct dir_context *ctx)
* - if afs_dir_iterate_block() spots this function, it'll pass the FID
* uniquifier through dtype
*/
-static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name,
+static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name,
int nlen, loff_t fpos, u64 ino, unsigned dtype)
{
struct afs_lookup_one_cookie *cookie =
@@ -584,16 +584,16 @@ static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name,
if (cookie->name.len != nlen ||
memcmp(cookie->name.name, name, nlen) != 0) {
- _leave(" = 0 [no]");
- return 0;
+ _leave(" = true [keep looking]");
+ return true;
}
cookie->fid.vnode = ino;
cookie->fid.unique = dtype;
cookie->found = 1;
- _leave(" = -1 [found]");
- return -1;
+ _leave(" = false [found]");
+ return false;
}
/*
@@ -636,12 +636,11 @@ static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry,
* - if afs_dir_iterate_block() spots this function, it'll pass the FID
* uniquifier through dtype
*/
-static int afs_lookup_filldir(struct dir_context *ctx, const char *name,
+static bool afs_lookup_filldir(struct dir_context *ctx, const char *name,
int nlen, loff_t fpos, u64 ino, unsigned dtype)
{
struct afs_lookup_cookie *cookie =
container_of(ctx, struct afs_lookup_cookie, ctx);
- int ret;
_enter("{%s,%u},%s,%u,,%llu,%u",
cookie->name.name, cookie->name.len, name, nlen,
@@ -663,12 +662,10 @@ static int afs_lookup_filldir(struct dir_context *ctx, const char *name,
cookie->fids[1].unique = dtype;
cookie->found = 1;
if (cookie->one_only)
- return -1;
+ return false;
}
- ret = cookie->nr_fids >= 50 ? -1 : 0;
- _leave(" = %d", ret);
- return ret;
+ return cookie->nr_fids < 50;
}
/*
diff --git a/fs/afs/file.c b/fs/afs/file.c
index d1cfb235c4b9..2eeab57df133 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -324,7 +324,7 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq)
fsreq->vnode = vnode;
fsreq->iter = &fsreq->def_iter;
- iov_iter_xarray(&fsreq->def_iter, READ,
+ iov_iter_xarray(&fsreq->def_iter, ITER_DEST,
&fsreq->vnode->netfs.inode.i_mapping->i_pages,
fsreq->pos, fsreq->len);
@@ -346,7 +346,7 @@ static int afs_symlink_read_folio(struct file *file, struct folio *folio)
fsreq->len = folio_size(folio);
fsreq->vnode = vnode;
fsreq->iter = &fsreq->def_iter;
- iov_iter_xarray(&fsreq->def_iter, READ, &folio->mapping->i_pages,
+ iov_iter_xarray(&fsreq->def_iter, ITER_DEST, &folio->mapping->i_pages,
fsreq->pos, fsreq->len);
ret = afs_fetch_data(fsreq->vnode, fsreq);
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index c0031a3ab42f..3ac5fcf98d0d 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -167,8 +167,8 @@ responded:
clear_bit(AFS_SERVER_FL_HAS_FS64, &server->flags);
}
- if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) &&
- rtt_us < server->probe.rtt) {
+ rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us);
+ if (rtt_us < server->probe.rtt) {
server->probe.rtt = rtt_us;
server->rtt = rtt_us;
alist->preferred = index;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 723d162078a3..9ba7b68375c9 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1301,7 +1301,7 @@ static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t si
call->iov_len = size;
call->kvec[0].iov_base = buf;
call->kvec[0].iov_len = size;
- iov_iter_kvec(&call->def_iter, READ, call->kvec, 1, size);
+ iov_iter_kvec(&call->def_iter, ITER_DEST, call->kvec, 1, size);
}
static inline void afs_extract_to_tmp(struct afs_call *call)
@@ -1319,7 +1319,7 @@ static inline void afs_extract_to_tmp64(struct afs_call *call)
static inline void afs_extract_discard(struct afs_call *call, size_t size)
{
call->iov_len = size;
- iov_iter_discard(&call->def_iter, READ, size);
+ iov_iter_discard(&call->def_iter, ITER_DEST, size);
}
static inline void afs_extract_to_buf(struct afs_call *call, size_t size)
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index eccc3cd0cb70..c62939e5ea1f 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -359,7 +359,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
msg.msg_name = NULL;
msg.msg_namelen = 0;
- iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, call->request_size);
+ iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iov, 1, call->request_size);
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = MSG_WAITALL | (call->write_iter ? MSG_MORE : 0);
@@ -400,7 +400,7 @@ error_do_abort:
RX_USER_ABORT, ret, "KSD");
} else {
len = 0;
- iov_iter_kvec(&msg.msg_iter, READ, NULL, 0, 0);
+ iov_iter_kvec(&msg.msg_iter, ITER_DEST, NULL, 0, 0);
rxrpc_kernel_recv_data(call->net->socket, rxcall,
&msg.msg_iter, &len, false,
&call->abort_code, &call->service_id);
@@ -485,7 +485,7 @@ static void afs_deliver_to_call(struct afs_call *call)
) {
if (state == AFS_CALL_SV_AWAIT_ACK) {
len = 0;
- iov_iter_kvec(&call->def_iter, READ, NULL, 0, 0);
+ iov_iter_kvec(&call->def_iter, ITER_DEST, NULL, 0, 0);
ret = rxrpc_kernel_recv_data(call->net->socket,
call->rxcall, &call->def_iter,
&len, false, &remote_abort,
@@ -822,7 +822,7 @@ void afs_send_empty_reply(struct afs_call *call)
msg.msg_name = NULL;
msg.msg_namelen = 0;
- iov_iter_kvec(&msg.msg_iter, WRITE, NULL, 0, 0);
+ iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, NULL, 0, 0);
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
@@ -862,7 +862,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
iov[0].iov_len = len;
msg.msg_name = NULL;
msg.msg_namelen = 0;
- iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len);
+ iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iov, 1, len);
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 4981baf97835..b5237206eac3 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -406,7 +406,7 @@ void afs_put_server(struct afs_net *net, struct afs_server *server,
if (!server)
return;
- a = atomic_inc_return(&server->active);
+ a = atomic_read(&server->active);
zero = __refcount_dec_and_test(&server->ref, &r);
trace_afs_server(debug_id, r - 1, a, reason);
if (unlikely(zero))
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 9ebdd36eaf2f..08fd456dde67 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -609,7 +609,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
*/
afs_write_to_cache(vnode, start, len, i_size, caching);
- iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len);
+ iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, start, len);
ret = afs_store_data(vnode, &iter, start, false);
} else {
_debug("write discard %x @%llx [%llx]", len, start, i_size);
@@ -1000,7 +1000,7 @@ int afs_launder_folio(struct folio *folio)
bv[0].bv_page = &folio->page;
bv[0].bv_offset = f;
bv[0].bv_len = t - f;
- iov_iter_bvec(&iter, WRITE, bv, 1, bv[0].bv_len);
+ iov_iter_bvec(&iter, ITER_SOURCE, bv, 1, bv[0].bv_len);
trace_afs_folio_dirty(vnode, tracepoint_string("launder"), folio);
ret = afs_store_data(vnode, &iter, folio_pos(folio) + f, true);
diff --git a/fs/aio.c b/fs/aio.c
index 606613e9d1f4..562916d85cba 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -951,16 +951,13 @@ static bool __get_reqs_available(struct kioctx *ctx)
local_irq_save(flags);
kcpu = this_cpu_ptr(ctx->cpu);
if (!kcpu->reqs_available) {
- int old, avail = atomic_read(&ctx->reqs_available);
+ int avail = atomic_read(&ctx->reqs_available);
do {
if (avail < ctx->req_batch)
goto out;
-
- old = avail;
- avail = atomic_cmpxchg(&ctx->reqs_available,
- avail, avail - ctx->req_batch);
- } while (avail != old);
+ } while (!atomic_try_cmpxchg(&ctx->reqs_available,
+ &avail, avail - ctx->req_batch));
kcpu->reqs_available += ctx->req_batch;
}
@@ -1555,7 +1552,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb,
if (unlikely(!file->f_op->read_iter))
return -EINVAL;
- ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
+ ret = aio_setup_rw(ITER_DEST, iocb, &iovec, vectored, compat, &iter);
if (ret < 0)
return ret;
ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
@@ -1583,7 +1580,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
if (unlikely(!file->f_op->write_iter))
return -EINVAL;
- ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
+ ret = aio_setup_rw(ITER_SOURCE, iocb, &iovec, vectored, compat, &iter);
if (ret < 0)
return ret;
ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index e0c3e33c4177..24192a7667ed 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -32,7 +32,7 @@ static struct inode *anon_inode_inode;
*/
static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
- return dynamic_dname(dentry, buffer, buflen, "anon_inode:%s",
+ return dynamic_dname(buffer, buflen, "anon_inode:%s",
dentry->d_name.name);
}
diff --git a/fs/attr.c b/fs/attr.c
index 1552a5f23d6b..b45f30e516fa 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -18,6 +18,70 @@
#include <linux/evm.h>
#include <linux/ima.h>
+#include "internal.h"
+
+/**
+ * setattr_should_drop_sgid - determine whether the setgid bit needs to be
+ * removed
+ * @mnt_userns: user namespace of the mount @inode was found from
+ * @inode: inode to check
+ *
+ * This function determines whether the setgid bit needs to be removed.
+ * We retain backwards compatibility and require setgid bit to be removed
+ * unconditionally if S_IXGRP is set. Otherwise we have the exact same
+ * requirements as setattr_prepare() and setattr_copy().
+ *
+ * Return: ATTR_KILL_SGID if setgid bit needs to be removed, 0 otherwise.
+ */
+int setattr_should_drop_sgid(struct user_namespace *mnt_userns,
+ const struct inode *inode)
+{
+ umode_t mode = inode->i_mode;
+
+ if (!(mode & S_ISGID))
+ return 0;
+ if (mode & S_IXGRP)
+ return ATTR_KILL_SGID;
+ if (!in_group_or_capable(mnt_userns, inode,
+ i_gid_into_vfsgid(mnt_userns, inode)))
+ return ATTR_KILL_SGID;
+ return 0;
+}
+
+/**
+ * setattr_should_drop_suidgid - determine whether the set{g,u}id bit needs to
+ * be dropped
+ * @mnt_userns: user namespace of the mount @inode was found from
+ * @inode: inode to check
+ *
+ * This function determines whether the set{g,u}id bits need to be removed.
+ * If the setuid bit needs to be removed ATTR_KILL_SUID is returned. If the
+ * setgid bit needs to be removed ATTR_KILL_SGID is returned. If both
+ * set{g,u}id bits need to be removed the corresponding mask of both flags is
+ * returned.
+ *
+ * Return: A mask of ATTR_KILL_S{G,U}ID indicating which - if any - setid bits
+ * to remove, 0 otherwise.
+ */
+int setattr_should_drop_suidgid(struct user_namespace *mnt_userns,
+ struct inode *inode)
+{
+ umode_t mode = inode->i_mode;
+ int kill = 0;
+
+ /* suid always must be killed */
+ if (unlikely(mode & S_ISUID))
+ kill = ATTR_KILL_SUID;
+
+ kill |= setattr_should_drop_sgid(mnt_userns, inode);
+
+ if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
+ return kill;
+
+ return 0;
+}
+EXPORT_SYMBOL(setattr_should_drop_suidgid);
+
/**
* chown_ok - verify permissions to chown inode
* @mnt_userns: user namespace of the mount @inode was found from
@@ -140,8 +204,7 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry,
vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
/* Also check the setgid bit! */
- if (!vfsgid_in_group_p(vfsgid) &&
- !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
+ if (!in_group_or_capable(mnt_userns, inode, vfsgid))
attr->ia_mode &= ~S_ISGID;
}
@@ -251,9 +314,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode,
inode->i_ctime = attr->ia_ctime;
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
- vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
- if (!vfsgid_in_group_p(vfsgid) &&
- !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
+ if (!in_group_or_capable(mnt_userns, inode,
+ i_gid_into_vfsgid(mnt_userns, inode)))
mode &= ~S_ISGID;
inode->i_mode = mode;
}
@@ -375,7 +437,7 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
}
}
if (ia_valid & ATTR_KILL_SGID) {
- if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
+ if (mode & S_ISGID) {
if (!(ia_valid & ATTR_MODE)) {
ia_valid = attr->ia_valid |= ATTR_MODE;
attr->ia_mode = inode->i_mode;
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 12b8fdcc445b..92737166203f 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -147,14 +147,14 @@ static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry,
}
static int bad_inode_tmpfile(struct user_namespace *mnt_userns,
- struct inode *inode, struct dentry *dentry,
+ struct inode *inode, struct file *file,
umode_t mode)
{
return -EIO;
}
static int bad_inode_set_acl(struct user_namespace *mnt_userns,
- struct inode *inode, struct posix_acl *acl,
+ struct dentry *dentry, struct posix_acl *acl,
int type)
{
return -EIO;
@@ -177,7 +177,7 @@ static const struct inode_operations bad_inode_ops =
.setattr = bad_inode_setattr,
.listxattr = bad_inode_listxattr,
.get_link = bad_inode_get_link,
- .get_acl = bad_inode_get_acl,
+ .get_inode_acl = bad_inode_get_acl,
.fiemap = bad_inode_fiemap,
.update_time = bad_inode_update_time,
.atomic_open = bad_inode_atomic_open,
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
deleted file mode 100644
index 0dcfc691e7e2..000000000000
--- a/fs/binfmt_aout.c
+++ /dev/null
@@ -1,342 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/fs/binfmt_aout.c
- *
- * Copyright (C) 1991, 1992, 1996 Linus Torvalds
- */
-
-#include <linux/module.h>
-
-#include <linux/time.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/a.out.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include <linux/ptrace.h>
-#include <linux/user.h>
-#include <linux/binfmts.h>
-#include <linux/personality.h>
-#include <linux/init.h>
-#include <linux/coredump.h>
-#include <linux/slab.h>
-#include <linux/sched/task_stack.h>
-
-#include <linux/uaccess.h>
-#include <asm/cacheflush.h>
-
-static int load_aout_binary(struct linux_binprm *);
-static int load_aout_library(struct file*);
-
-static struct linux_binfmt aout_format = {
- .module = THIS_MODULE,
- .load_binary = load_aout_binary,
- .load_shlib = load_aout_library,
-};
-
-#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
-
-static int set_brk(unsigned long start, unsigned long end)
-{
- start = PAGE_ALIGN(start);
- end = PAGE_ALIGN(end);
- if (end > start)
- return vm_brk(start, end - start);
- return 0;
-}
-
-/*
- * create_aout_tables() parses the env- and arg-strings in new user
- * memory and creates the pointer tables from them, and puts their
- * addresses on the "stack", returning the new stack pointer value.
- */
-static unsigned long __user *create_aout_tables(char __user *p, struct linux_binprm * bprm)
-{
- char __user * __user *argv;
- char __user * __user *envp;
- unsigned long __user *sp;
- int argc = bprm->argc;
- int envc = bprm->envc;
-
- sp = (void __user *)((-(unsigned long)sizeof(char *)) & (unsigned long) p);
-#ifdef __alpha__
-/* whee.. test-programs are so much fun. */
- put_user(0, --sp);
- put_user(0, --sp);
- if (bprm->loader) {
- put_user(0, --sp);
- put_user(1003, --sp);
- put_user(bprm->loader, --sp);
- put_user(1002, --sp);
- }
- put_user(bprm->exec, --sp);
- put_user(1001, --sp);
-#endif
- sp -= envc+1;
- envp = (char __user * __user *) sp;
- sp -= argc+1;
- argv = (char __user * __user *) sp;
-#ifndef __alpha__
- put_user((unsigned long) envp,--sp);
- put_user((unsigned long) argv,--sp);
-#endif
- put_user(argc,--sp);
- current->mm->arg_start = (unsigned long) p;
- while (argc-->0) {
- char c;
- put_user(p,argv++);
- do {
- get_user(c,p++);
- } while (c);
- }
- put_user(NULL,argv);
- current->mm->arg_end = current->mm->env_start = (unsigned long) p;
- while (envc-->0) {
- char c;
- put_user(p,envp++);
- do {
- get_user(c,p++);
- } while (c);
- }
- put_user(NULL,envp);
- current->mm->env_end = (unsigned long) p;
- return sp;
-}
-
-/*
- * These are the functions used to load a.out style executables and shared
- * libraries. There is no binary dependent code anywhere else.
- */
-
-static int load_aout_binary(struct linux_binprm * bprm)
-{
- struct pt_regs *regs = current_pt_regs();
- struct exec ex;
- unsigned long error;
- unsigned long fd_offset;
- unsigned long rlim;
- int retval;
-
- ex = *((struct exec *) bprm->buf); /* exec-header */
- if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
- N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||
- N_TRSIZE(ex) || N_DRSIZE(ex) ||
- i_size_read(file_inode(bprm->file)) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
- return -ENOEXEC;
- }
-
- /*
- * Requires a mmap handler. This prevents people from using a.out
- * as part of an exploit attack against /proc-related vulnerabilities.
- */
- if (!bprm->file->f_op->mmap)
- return -ENOEXEC;
-
- fd_offset = N_TXTOFF(ex);
-
- /* Check initial limits. This avoids letting people circumvent
- * size limits imposed on them by creating programs with large
- * arrays in the data or bss.
- */
- rlim = rlimit(RLIMIT_DATA);
- if (rlim >= RLIM_INFINITY)
- rlim = ~0;
- if (ex.a_data + ex.a_bss > rlim)
- return -ENOMEM;
-
- /* Flush all traces of the currently running executable */
- retval = begin_new_exec(bprm);
- if (retval)
- return retval;
-
- /* OK, This is the point of no return */
-#ifdef __alpha__
- SET_AOUT_PERSONALITY(bprm, ex);
-#else
- set_personality(PER_LINUX);
-#endif
- setup_new_exec(bprm);
-
- current->mm->end_code = ex.a_text +
- (current->mm->start_code = N_TXTADDR(ex));
- current->mm->end_data = ex.a_data +
- (current->mm->start_data = N_DATADDR(ex));
- current->mm->brk = ex.a_bss +
- (current->mm->start_brk = N_BSSADDR(ex));
-
- retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
- if (retval < 0)
- return retval;
-
-
- if (N_MAGIC(ex) == OMAGIC) {
- unsigned long text_addr, map_size;
- loff_t pos;
-
- text_addr = N_TXTADDR(ex);
-
-#ifdef __alpha__
- pos = fd_offset;
- map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1;
-#else
- pos = 32;
- map_size = ex.a_text+ex.a_data;
-#endif
- error = vm_brk(text_addr & PAGE_MASK, map_size);
- if (error)
- return error;
-
- error = read_code(bprm->file, text_addr, pos,
- ex.a_text+ex.a_data);
- if ((signed long)error < 0)
- return error;
- } else {
- if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
- (N_MAGIC(ex) != NMAGIC) && printk_ratelimit())
- {
- printk(KERN_NOTICE "executable not page aligned\n");
- }
-
- if ((fd_offset & ~PAGE_MASK) != 0 && printk_ratelimit())
- {
- printk(KERN_WARNING
- "fd_offset is not page aligned. Please convert program: %pD\n",
- bprm->file);
- }
-
- if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) {
- error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
- if (error)
- return error;
-
- read_code(bprm->file, N_TXTADDR(ex), fd_offset,
- ex.a_text + ex.a_data);
- goto beyond_if;
- }
-
- error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
- PROT_READ | PROT_EXEC, MAP_FIXED | MAP_PRIVATE,
- fd_offset);
-
- if (error != N_TXTADDR(ex))
- return error;
-
- error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
- PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE,
- fd_offset + ex.a_text);
- if (error != N_DATADDR(ex))
- return error;
- }
-beyond_if:
- set_binfmt(&aout_format);
-
- retval = set_brk(current->mm->start_brk, current->mm->brk);
- if (retval < 0)
- return retval;
-
- current->mm->start_stack =
- (unsigned long) create_aout_tables((char __user *) bprm->p, bprm);
-#ifdef __alpha__
- regs->gp = ex.a_gpvalue;
-#endif
- finalize_exec(bprm);
- start_thread(regs, ex.a_entry, current->mm->start_stack);
- return 0;
-}
-
-static int load_aout_library(struct file *file)
-{
- struct inode * inode;
- unsigned long bss, start_addr, len;
- unsigned long error;
- int retval;
- struct exec ex;
- loff_t pos = 0;
-
- inode = file_inode(file);
-
- retval = -ENOEXEC;
- error = kernel_read(file, &ex, sizeof(ex), &pos);
- if (error != sizeof(ex))
- goto out;
-
- /* We come in here for the regular a.out style of shared libraries */
- if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) ||
- N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) ||
- i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
- goto out;
- }
-
- /*
- * Requires a mmap handler. This prevents people from using a.out
- * as part of an exploit attack against /proc-related vulnerabilities.
- */
- if (!file->f_op->mmap)
- goto out;
-
- if (N_FLAGS(ex))
- goto out;
-
- /* For QMAGIC, the starting address is 0x20 into the page. We mask
- this off to get the starting address for the page */
-
- start_addr = ex.a_entry & 0xfffff000;
-
- if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) {
- if (printk_ratelimit())
- {
- printk(KERN_WARNING
- "N_TXTOFF is not page aligned. Please convert library: %pD\n",
- file);
- }
- retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
- if (retval)
- goto out;
-
- read_code(file, start_addr, N_TXTOFF(ex),
- ex.a_text + ex.a_data);
- retval = 0;
- goto out;
- }
- /* Now use mmap to map the library into memory. */
- error = vm_mmap(file, start_addr, ex.a_text + ex.a_data,
- PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE,
- N_TXTOFF(ex));
- retval = error;
- if (error != start_addr)
- goto out;
-
- len = PAGE_ALIGN(ex.a_text + ex.a_data);
- bss = ex.a_text + ex.a_data + ex.a_bss;
- if (bss > len) {
- retval = vm_brk(start_addr + len, bss - len);
- if (retval)
- goto out;
- }
- retval = 0;
-out:
- return retval;
-}
-
-static int __init init_aout_binfmt(void)
-{
- register_binfmt(&aout_format);
- return 0;
-}
-
-static void __exit exit_aout_binfmt(void)
-{
- unregister_binfmt(&aout_format);
-}
-
-core_initcall(init_aout_binfmt);
-module_exit(exit_aout_binfmt);
-MODULE_LICENSE("GPL");
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 63c7ebb0da89..de63572a9404 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -248,7 +248,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
} while (0)
#ifdef ARCH_DLINFO
- /*
+ /*
* ARCH_DLINFO must come first so PPC can do its special alignment of
* AUXV.
* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT() in
@@ -456,13 +456,13 @@ static unsigned long maximum_alignment(struct elf_phdr *cmds, int nr)
*
* Loads ELF program headers from the binary file elf_file, which has the ELF
* header pointed to by elf_ex, into a newly allocated array. The caller is
- * responsible for freeing the allocated data. Returns an ERR_PTR upon failure.
+ * responsible for freeing the allocated data. Returns NULL upon failure.
*/
static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex,
struct file *elf_file)
{
struct elf_phdr *elf_phdata = NULL;
- int retval, err = -1;
+ int retval = -1;
unsigned int size;
/*
@@ -484,15 +484,9 @@ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex,
/* Read in the program headers */
retval = elf_read(elf_file, elf_phdata, size, elf_ex->e_phoff);
- if (retval < 0) {
- err = retval;
- goto out;
- }
- /* Success! */
- err = 0;
out:
- if (err) {
+ if (retval) {
kfree(elf_phdata);
elf_phdata = NULL;
}
@@ -911,7 +905,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
interp_elf_ex = kmalloc(sizeof(*interp_elf_ex), GFP_KERNEL);
if (!interp_elf_ex) {
retval = -ENOMEM;
- goto out_free_ph;
+ goto out_free_file;
}
/* Get the exec headers */
@@ -1020,7 +1014,7 @@ out_free_interp:
executable_stack);
if (retval < 0)
goto out_free_dentry;
-
+
elf_bss = 0;
elf_brk = 0;
@@ -1043,7 +1037,7 @@ out_free_interp:
if (unlikely (elf_brk > elf_bss)) {
unsigned long nbyte;
-
+
/* There was a PT_LOAD segment with p_memsz > p_filesz
before this one. Map anonymous pages, if needed,
and clear the area. */
@@ -1166,7 +1160,7 @@ out_free_interp:
error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
elf_prot, elf_flags, total_size);
if (BAD_ADDR(error)) {
- retval = IS_ERR((void *)error) ?
+ retval = IS_ERR_VALUE(error) ?
PTR_ERR((void*)error) : -EINVAL;
goto out_free_dentry;
}
@@ -1251,7 +1245,7 @@ out_free_interp:
interpreter,
load_bias, interp_elf_phdata,
&arch_state);
- if (!IS_ERR((void *)elf_entry)) {
+ if (!IS_ERR_VALUE(elf_entry)) {
/*
* load_elf_interp() returns relocation
* adjustment
@@ -1260,7 +1254,7 @@ out_free_interp:
elf_entry += interp_elf_ex->e_entry;
}
if (BAD_ADDR(elf_entry)) {
- retval = IS_ERR((void *)elf_entry) ?
+ retval = IS_ERR_VALUE(elf_entry) ?
(int)elf_entry : -EINVAL;
goto out_free_dentry;
}
@@ -1354,6 +1348,7 @@ out:
out_free_dentry:
kfree(interp_elf_ex);
kfree(interp_elf_phdata);
+out_free_file:
allow_write_access(interpreter);
if (interpreter)
fput(interpreter);
@@ -1520,7 +1515,7 @@ static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset)
phdr->p_align = 0;
}
-static void fill_note(struct memelfnote *note, const char *name, int type,
+static void fill_note(struct memelfnote *note, const char *name, int type,
unsigned int sz, void *data)
{
note->name = name;
@@ -1723,7 +1718,6 @@ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm
return 0;
}
-#ifdef CORE_DUMP_USE_REGSET
#include <linux/regset.h>
struct elf_thread_core_info {
@@ -1744,6 +1738,7 @@ struct elf_note_info {
int thread_notes;
};
+#ifdef CORE_DUMP_USE_REGSET
/*
* When a regset has a writeback hook, we call it on each thread before
* dumping user memory. On register window machines, this makes sure the
@@ -1823,34 +1818,58 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
return 1;
}
+#else
+static int fill_thread_core_info(struct elf_thread_core_info *t,
+ const struct user_regset_view *view,
+ long signr, struct elf_note_info *info)
+{
+ struct task_struct *p = t->task;
+ elf_fpregset_t *fpu;
+
+ fill_prstatus(&t->prstatus.common, p, signr);
+ elf_core_copy_task_regs(p, &t->prstatus.pr_reg);
+
+ fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
+ &(t->prstatus));
+ info->size += notesize(&t->notes[0]);
+
+ fpu = kzalloc(sizeof(elf_fpregset_t), GFP_KERNEL);
+ if (!fpu || !elf_core_copy_task_fpregs(p, fpu)) {
+ kfree(fpu);
+ return 1;
+ }
+
+ t->prstatus.pr_fpvalid = 1;
+ fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(*fpu), fpu);
+ info->size += notesize(&t->notes[1]);
+
+ return 1;
+}
+#endif
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
struct coredump_params *cprm)
{
struct task_struct *dump_task = current;
- const struct user_regset_view *view = task_user_regset_view(dump_task);
+ const struct user_regset_view *view;
struct elf_thread_core_info *t;
struct elf_prpsinfo *psinfo;
struct core_thread *ct;
- unsigned int i;
-
- info->size = 0;
- info->thread = NULL;
psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
- if (psinfo == NULL) {
- info->psinfo.data = NULL; /* So we don't free this wrongly */
+ if (!psinfo)
return 0;
- }
-
fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
+#ifdef CORE_DUMP_USE_REGSET
+ view = task_user_regset_view(dump_task);
+
/*
* Figure out how many notes we're going to need for each thread.
*/
info->thread_notes = 0;
- for (i = 0; i < view->n; ++i)
+ for (int i = 0; i < view->n; ++i)
if (view->regsets[i].core_note_type != 0)
++info->thread_notes;
@@ -1869,11 +1888,23 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
*/
fill_elf_header(elf, phdrs,
view->e_machine, view->e_flags);
+#else
+ view = NULL;
+ info->thread_notes = 2;
+ fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS);
+#endif
/*
* Allocate a structure for each thread.
*/
- for (ct = &dump_task->signal->core_state->dumper; ct; ct = ct->next) {
+ info->thread = kzalloc(offsetof(struct elf_thread_core_info,
+ notes[info->thread_notes]),
+ GFP_KERNEL);
+ if (unlikely(!info->thread))
+ return 0;
+
+ info->thread->task = dump_task;
+ for (ct = dump_task->signal->core_state->dumper.next; ct; ct = ct->next) {
t = kzalloc(offsetof(struct elf_thread_core_info,
notes[info->thread_notes]),
GFP_KERNEL);
@@ -1881,17 +1912,8 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
return 0;
t->task = ct->task;
- if (ct->task == dump_task || !info->thread) {
- t->next = info->thread;
- info->thread = t;
- } else {
- /*
- * Make sure to keep the original task at
- * the head of the list.
- */
- t->next = info->thread->next;
- info->thread->next = t;
- }
+ t->next = info->thread->next;
+ info->thread->next = t;
}
/*
@@ -1919,11 +1941,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
return 1;
}
-static size_t get_note_info_size(struct elf_note_info *info)
-{
- return info->size;
-}
-
/*
* Write all the notes for each thread. When writing the first thread, the
* process-wide notes are interleaved after the first thread-specific note.
@@ -1978,197 +1995,6 @@ static void free_note_info(struct elf_note_info *info)
kvfree(info->files.data);
}
-#else
-
-/* Here is the structure in which status of each thread is captured. */
-struct elf_thread_status
-{
- struct list_head list;
- struct elf_prstatus prstatus; /* NT_PRSTATUS */
- elf_fpregset_t fpu; /* NT_PRFPREG */
- struct task_struct *thread;
- struct memelfnote notes[3];
- int num_notes;
-};
-
-/*
- * In order to add the specific thread information for the elf file format,
- * we need to keep a linked list of every threads pr_status and then create
- * a single section for them in the final core file.
- */
-static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
-{
- int sz = 0;
- struct task_struct *p = t->thread;
- t->num_notes = 0;
-
- fill_prstatus(&t->prstatus.common, p, signr);
- elf_core_copy_task_regs(p, &t->prstatus.pr_reg);
-
- fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
- &(t->prstatus));
- t->num_notes++;
- sz += notesize(&t->notes[0]);
-
- if ((t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL,
- &t->fpu))) {
- fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu),
- &(t->fpu));
- t->num_notes++;
- sz += notesize(&t->notes[1]);
- }
- return sz;
-}
-
-struct elf_note_info {
- struct memelfnote *notes;
- struct memelfnote *notes_files;
- struct elf_prstatus *prstatus; /* NT_PRSTATUS */
- struct elf_prpsinfo *psinfo; /* NT_PRPSINFO */
- struct list_head thread_list;
- elf_fpregset_t *fpu;
- user_siginfo_t csigdata;
- int thread_status_size;
- int numnote;
-};
-
-static int elf_note_info_init(struct elf_note_info *info)
-{
- memset(info, 0, sizeof(*info));
- INIT_LIST_HEAD(&info->thread_list);
-
- /* Allocate space for ELF notes */
- info->notes = kmalloc_array(8, sizeof(struct memelfnote), GFP_KERNEL);
- if (!info->notes)
- return 0;
- info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
- if (!info->psinfo)
- return 0;
- info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);
- if (!info->prstatus)
- return 0;
- info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);
- if (!info->fpu)
- return 0;
- return 1;
-}
-
-static int fill_note_info(struct elfhdr *elf, int phdrs,
- struct elf_note_info *info,
- struct coredump_params *cprm)
-{
- struct core_thread *ct;
- struct elf_thread_status *ets;
-
- if (!elf_note_info_init(info))
- return 0;
-
- for (ct = current->signal->core_state->dumper.next;
- ct; ct = ct->next) {
- ets = kzalloc(sizeof(*ets), GFP_KERNEL);
- if (!ets)
- return 0;
-
- ets->thread = ct->task;
- list_add(&ets->list, &info->thread_list);
- }
-
- list_for_each_entry(ets, &info->thread_list, list) {
- int sz;
-
- sz = elf_dump_thread_status(cprm->siginfo->si_signo, ets);
- info->thread_status_size += sz;
- }
- /* now collect the dump for the current */
- memset(info->prstatus, 0, sizeof(*info->prstatus));
- fill_prstatus(&info->prstatus->common, current, cprm->siginfo->si_signo);
- elf_core_copy_regs(&info->prstatus->pr_reg, cprm->regs);
-
- /* Set up header */
- fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS);
-
- /*
- * Set up the notes in similar form to SVR4 core dumps made
- * with info from their /proc.
- */
-
- fill_note(info->notes + 0, "CORE", NT_PRSTATUS,
- sizeof(*info->prstatus), info->prstatus);
- fill_psinfo(info->psinfo, current->group_leader, current->mm);
- fill_note(info->notes + 1, "CORE", NT_PRPSINFO,
- sizeof(*info->psinfo), info->psinfo);
-
- fill_siginfo_note(info->notes + 2, &info->csigdata, cprm->siginfo);
- fill_auxv_note(info->notes + 3, current->mm);
- info->numnote = 4;
-
- if (fill_files_note(info->notes + info->numnote, cprm) == 0) {
- info->notes_files = info->notes + info->numnote;
- info->numnote++;
- }
-
- /* Try to dump the FPU. */
- info->prstatus->pr_fpvalid =
- elf_core_copy_task_fpregs(current, cprm->regs, info->fpu);
- if (info->prstatus->pr_fpvalid)
- fill_note(info->notes + info->numnote++,
- "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu);
- return 1;
-}
-
-static size_t get_note_info_size(struct elf_note_info *info)
-{
- int sz = 0;
- int i;
-
- for (i = 0; i < info->numnote; i++)
- sz += notesize(info->notes + i);
-
- sz += info->thread_status_size;
-
- return sz;
-}
-
-static int write_note_info(struct elf_note_info *info,
- struct coredump_params *cprm)
-{
- struct elf_thread_status *ets;
- int i;
-
- for (i = 0; i < info->numnote; i++)
- if (!writenote(info->notes + i, cprm))
- return 0;
-
- /* write out the thread status notes section */
- list_for_each_entry(ets, &info->thread_list, list) {
- for (i = 0; i < ets->num_notes; i++)
- if (!writenote(&ets->notes[i], cprm))
- return 0;
- }
-
- return 1;
-}
-
-static void free_note_info(struct elf_note_info *info)
-{
- while (!list_empty(&info->thread_list)) {
- struct list_head *tmp = info->thread_list.next;
- list_del(tmp);
- kfree(list_entry(tmp, struct elf_thread_status, list));
- }
-
- /* Free data possibly allocated by fill_files_note(): */
- if (info->notes_files)
- kvfree(info->notes_files->data);
-
- kfree(info->prstatus);
- kfree(info->psinfo);
- kfree(info->notes);
- kfree(info->fpu);
-}
-
-#endif
-
static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
elf_addr_t e_shoff, int segs)
{
@@ -2232,7 +2058,7 @@ static int elf_core_dump(struct coredump_params *cprm)
/* Write notes phdr entry */
{
- size_t sz = get_note_info_size(&info);
+ size_t sz = info.size;
/* For cell spufs */
sz += elf_coredump_extra_notes_size();
@@ -2294,7 +2120,7 @@ static int elf_core_dump(struct coredump_params *cprm)
if (!elf_core_write_extra_phdrs(cprm, offset))
goto end_coredump;
- /* write out the notes section */
+ /* write out the notes section */
if (!write_note_info(&info, cprm))
goto end_coredump;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 08d0c8797828..096e3520a0b1 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -434,8 +434,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
current->mm->start_stack = current->mm->start_brk + stack_size;
#endif
- if (create_elf_fdpic_tables(bprm, current->mm,
- &exec_params, &interp_params) < 0)
+ retval = create_elf_fdpic_tables(bprm, current->mm, &exec_params,
+ &interp_params);
+ if (retval < 0)
goto error;
kdebug("- start_code %lx", current->mm->start_code);
@@ -1603,7 +1604,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
if (!elf_core_write_extra_phdrs(cprm, offset))
goto end_coredump;
- /* write out the notes section */
+ /* write out the notes section */
if (!writenote(thread_list->notes, cprm))
goto end_coredump;
if (!writenote(&psinfo_note, cprm))
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index e1eae7ea823a..bb202ad369d5 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -44,10 +44,10 @@ static LIST_HEAD(entries);
static int enabled = 1;
enum {Enabled, Magic};
-#define MISC_FMT_PRESERVE_ARGV0 (1 << 31)
-#define MISC_FMT_OPEN_BINARY (1 << 30)
-#define MISC_FMT_CREDENTIALS (1 << 29)
-#define MISC_FMT_OPEN_FILE (1 << 28)
+#define MISC_FMT_PRESERVE_ARGV0 (1UL << 31)
+#define MISC_FMT_OPEN_BINARY (1UL << 30)
+#define MISC_FMT_CREDENTIALS (1UL << 29)
+#define MISC_FMT_OPEN_FILE (1UL << 28)
typedef struct {
struct list_head list;
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 99f9995670ea..555c962fdad6 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -23,15 +23,15 @@ obj-$(CONFIG_BTRFS_FS) := btrfs.o
btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
file-item.o inode-item.o disk-io.o \
- transaction.o inode.o file.o tree-defrag.o \
- extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
+ transaction.o inode.o file.o defrag.o \
+ extent_map.o sysfs.o accessors.o xattr.o ordered-data.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
- subpage.o tree-mod-log.o
+ subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/accessors.c
index 12455b2b41de..206cf1612c1d 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/accessors.c
@@ -4,8 +4,9 @@
*/
#include <asm/unaligned.h>
-
+#include "messages.h"
#include "ctree.h"
+#include "accessors.h"
static bool check_setget_bounds(const struct extent_buffer *eb,
const void *ptr, unsigned off, int size)
@@ -23,6 +24,13 @@ static bool check_setget_bounds(const struct extent_buffer *eb,
return true;
}
+void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb)
+{
+ token->eb = eb;
+ token->kaddr = page_address(eb->pages[0]);
+ token->offset = 0;
+}
+
/*
* Macro templates that define helpers to read/write extent buffer data of a
* given size, that are also used via ctree.h for access to item members by
@@ -160,7 +168,7 @@ DEFINE_BTRFS_SETGET_BITS(64)
void btrfs_node_key(const struct extent_buffer *eb,
struct btrfs_disk_key *disk_key, int nr)
{
- unsigned long ptr = btrfs_node_key_ptr_offset(nr);
+ unsigned long ptr = btrfs_node_key_ptr_offset(eb, nr);
read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
struct btrfs_key_ptr, key, disk_key);
}
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
new file mode 100644
index 000000000000..ceadfc5d6c66
--- /dev/null
+++ b/fs/btrfs/accessors.h
@@ -0,0 +1,1073 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_ACCESSORS_H
+#define BTRFS_ACCESSORS_H
+
+struct btrfs_map_token {
+ struct extent_buffer *eb;
+ char *kaddr;
+ unsigned long offset;
+};
+
+void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb);
+
+/*
+ * Some macros to generate set/get functions for the struct fields. This
+ * assumes there is a lefoo_to_cpu for every type, so lets make a simple one
+ * for u8:
+ */
+#define le8_to_cpu(v) (v)
+#define cpu_to_le8(v) (v)
+#define __le8 u8
+
+static inline u8 get_unaligned_le8(const void *p)
+{
+ return *(u8 *)p;
+}
+
+static inline void put_unaligned_le8(u8 val, void *p)
+{
+ *(u8 *)p = val;
+}
+
+#define read_eb_member(eb, ptr, type, member, result) (\
+ read_extent_buffer(eb, (char *)(result), \
+ ((unsigned long)(ptr)) + \
+ offsetof(type, member), \
+ sizeof(((type *)0)->member)))
+
+#define write_eb_member(eb, ptr, type, member, result) (\
+ write_extent_buffer(eb, (char *)(result), \
+ ((unsigned long)(ptr)) + \
+ offsetof(type, member), \
+ sizeof(((type *)0)->member)))
+
+#define DECLARE_BTRFS_SETGET_BITS(bits) \
+u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
+ const void *ptr, unsigned long off); \
+void btrfs_set_token_##bits(struct btrfs_map_token *token, \
+ const void *ptr, unsigned long off, \
+ u##bits val); \
+u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
+ const void *ptr, unsigned long off); \
+void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
+ unsigned long off, u##bits val);
+
+DECLARE_BTRFS_SETGET_BITS(8)
+DECLARE_BTRFS_SETGET_BITS(16)
+DECLARE_BTRFS_SETGET_BITS(32)
+DECLARE_BTRFS_SETGET_BITS(64)
+
+#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
+static inline u##bits btrfs_##name(const struct extent_buffer *eb, \
+ const type *s) \
+{ \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
+ return btrfs_get_##bits(eb, s, offsetof(type, member)); \
+} \
+static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \
+ u##bits val) \
+{ \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
+ btrfs_set_##bits(eb, s, offsetof(type, member), val); \
+} \
+static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \
+ const type *s) \
+{ \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
+ return btrfs_get_token_##bits(token, s, offsetof(type, member));\
+} \
+static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
+ type *s, u##bits val) \
+{ \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
+ btrfs_set_token_##bits(token, s, offsetof(type, member), val); \
+}
+
+#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
+static inline u##bits btrfs_##name(const struct extent_buffer *eb) \
+{ \
+ const type *p = page_address(eb->pages[0]) + \
+ offset_in_page(eb->start); \
+ return get_unaligned_le##bits(&p->member); \
+} \
+static inline void btrfs_set_##name(const struct extent_buffer *eb, \
+ u##bits val) \
+{ \
+ type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \
+ put_unaligned_le##bits(val, &p->member); \
+}
+
+#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
+static inline u##bits btrfs_##name(const type *s) \
+{ \
+ return get_unaligned_le##bits(&s->member); \
+} \
+static inline void btrfs_set_##name(type *s, u##bits val) \
+{ \
+ put_unaligned_le##bits(val, &s->member); \
+}
+
+static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb,
+ struct btrfs_dev_item *s)
+{
+ static_assert(sizeof(u64) ==
+ sizeof(((struct btrfs_dev_item *)0))->total_bytes);
+ return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item,
+ total_bytes));
+}
+static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb,
+ struct btrfs_dev_item *s,
+ u64 val)
+{
+ static_assert(sizeof(u64) ==
+ sizeof(((struct btrfs_dev_item *)0))->total_bytes);
+ WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize));
+ btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val);
+}
+
+BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
+BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
+BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
+BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
+BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, start_offset, 64);
+BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
+BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
+BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
+BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8);
+BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8);
+BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
+ total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item,
+ bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item,
+ io_align, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item,
+ io_width, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item,
+ sector_size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, dev_group, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item,
+ seek_speed, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
+ bandwidth, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item,
+ generation, 64);
+
+static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d)
+{
+ return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid);
+}
+
+static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d)
+{
+ return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid);
+}
+
+BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
+BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
+BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
+BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
+BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
+BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
+BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
+BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
+BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16);
+BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
+BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
+
+static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s)
+{
+ return (char *)s + offsetof(struct btrfs_stripe, dev_uuid);
+}
+
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk,
+ stripe_len, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, io_align, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, io_width, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk,
+ sector_size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk,
+ num_stripes, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk,
+ sub_stripes, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);
+
+static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, int nr)
+{
+ unsigned long offset = (unsigned long)c;
+
+ offset += offsetof(struct btrfs_chunk, stripe);
+ offset += nr * sizeof(struct btrfs_stripe);
+ return (struct btrfs_stripe *)offset;
+}
+
+static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr)
+{
+ return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr));
+}
+
+static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb,
+ struct btrfs_chunk *c, int nr)
+{
+ return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
+}
+
+static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb,
+ struct btrfs_chunk *c, int nr,
+ u64 val)
+{
+ btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val);
+}
+
+static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb,
+ struct btrfs_chunk *c, int nr)
+{
+ return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
+}
+
+static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb,
+ struct btrfs_chunk *c, int nr,
+ u64 val)
+{
+ btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val);
+}
+
+/* struct btrfs_block_group_item */
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item,
+ used, 64);
+BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item, used, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid,
+ struct btrfs_block_group_item, chunk_objectid, 64);
+
+BTRFS_SETGET_FUNCS(block_group_chunk_objectid,
+ struct btrfs_block_group_item, chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags,
+ struct btrfs_block_group_item, flags, 64);
+
+/* struct btrfs_free_space_info */
+BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info,
+ extent_count, 32);
+BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32);
+
+/* struct btrfs_inode_ref */
+BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
+BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_ref_index, struct btrfs_inode_ref, index, 64);
+
+/* struct btrfs_inode_extref */
+BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref,
+ parent_objectid, 64);
+BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref,
+ name_len, 16);
+BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64);
+
+/* struct btrfs_inode_item */
+BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
+BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
+BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
+BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
+BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64);
+BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
+BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32);
+BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
+BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
+BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
+BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
+BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item,
+ generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item,
+ sequence, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item,
+ transid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, nbytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item,
+ block_group, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
+BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
+BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
+
+/* struct btrfs_dev_extent */
+BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64);
+BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
+ chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
+ chunk_offset, 64);
+BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_tree, struct btrfs_dev_extent,
+ chunk_tree, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_objectid, struct btrfs_dev_extent,
+ chunk_objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_offset, struct btrfs_dev_extent,
+ chunk_offset, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_length, struct btrfs_dev_extent, length, 64);
+
+BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64);
+BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, generation, 64);
+BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64);
+
+BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8);
+
+static inline void btrfs_tree_block_key(const struct extent_buffer *eb,
+ struct btrfs_tree_block_info *item,
+ struct btrfs_disk_key *key)
+{
+ read_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
+}
+
+static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb,
+ struct btrfs_tree_block_info *item,
+ struct btrfs_disk_key *key)
+{
+ write_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
+}
+
+BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, root, 64);
+BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref,
+ objectid, 64);
+BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref,
+ offset, 64);
+BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, count, 32);
+
+BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, count, 32);
+
+BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref,
+ type, 8);
+BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref,
+ offset, 64);
+
+static inline u32 btrfs_extent_inline_ref_size(int type)
+{
+ if (type == BTRFS_TREE_BLOCK_REF_KEY ||
+ type == BTRFS_SHARED_BLOCK_REF_KEY)
+ return sizeof(struct btrfs_extent_inline_ref);
+ if (type == BTRFS_SHARED_DATA_REF_KEY)
+ return sizeof(struct btrfs_shared_data_ref) +
+ sizeof(struct btrfs_extent_inline_ref);
+ if (type == BTRFS_EXTENT_DATA_REF_KEY)
+ return sizeof(struct btrfs_extent_data_ref) +
+ offsetof(struct btrfs_extent_inline_ref, offset);
+ return 0;
+}
+
+/* struct btrfs_node */
+BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
+BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr, blockptr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr,
+ generation, 64);
+
+static inline u64 btrfs_node_blockptr(const struct extent_buffer *eb, int nr)
+{
+ unsigned long ptr;
+
+ ptr = offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+ return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr);
+}
+
+static inline void btrfs_set_node_blockptr(const struct extent_buffer *eb,
+ int nr, u64 val)
+{
+ unsigned long ptr;
+
+ ptr = offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+ btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val);
+}
+
+static inline u64 btrfs_node_ptr_generation(const struct extent_buffer *eb, int nr)
+{
+ unsigned long ptr;
+
+ ptr = offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+ return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr);
+}
+
+static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb,
+ int nr, u64 val)
+{
+ unsigned long ptr;
+
+ ptr = offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+ btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val);
+}
+
+static inline unsigned long btrfs_node_key_ptr_offset(const struct extent_buffer *eb, int nr)
+{
+ return offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+}
+
+void btrfs_node_key(const struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr);
+
+static inline void btrfs_set_node_key(const struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr)
+{
+ unsigned long ptr;
+
+ ptr = btrfs_node_key_ptr_offset(eb, nr);
+ write_eb_member(eb, (struct btrfs_key_ptr *)ptr,
+ struct btrfs_key_ptr, key, disk_key);
+}
+
+/* struct btrfs_item */
+BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32);
+BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32);
+
+static inline unsigned long btrfs_item_nr_offset(const struct extent_buffer *eb, int nr)
+{
+ return offsetof(struct btrfs_leaf, items) +
+ sizeof(struct btrfs_item) * nr;
+}
+
+static inline struct btrfs_item *btrfs_item_nr(const struct extent_buffer *eb, int nr)
+{
+ return (struct btrfs_item *)btrfs_item_nr_offset(eb, nr);
+}
+
+#define BTRFS_ITEM_SETGET_FUNCS(member) \
+static inline u32 btrfs_item_##member(const struct extent_buffer *eb, int slot) \
+{ \
+ return btrfs_raw_item_##member(eb, btrfs_item_nr(eb, slot)); \
+} \
+static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \
+ int slot, u32 val) \
+{ \
+ btrfs_set_raw_item_##member(eb, btrfs_item_nr(eb, slot), val); \
+} \
+static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \
+ int slot) \
+{ \
+ struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \
+ return btrfs_token_raw_item_##member(token, item); \
+} \
+static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \
+ int slot, u32 val) \
+{ \
+ struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \
+ btrfs_set_token_raw_item_##member(token, item, val); \
+}
+
+BTRFS_ITEM_SETGET_FUNCS(offset)
+BTRFS_ITEM_SETGET_FUNCS(size);
+
+static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr)
+{
+ return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr);
+}
+
+static inline void btrfs_item_key(const struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr)
+{
+ struct btrfs_item *item = btrfs_item_nr(eb, nr);
+
+ read_eb_member(eb, item, struct btrfs_item, key, disk_key);
+}
+
+static inline void btrfs_set_item_key(struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr)
+{
+ struct btrfs_item *item = btrfs_item_nr(eb, nr);
+
+ write_eb_member(eb, item, struct btrfs_item, key, disk_key);
+}
+
+BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64);
+
+/* struct btrfs_root_ref */
+BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64);
+BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64);
+BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_root_ref_dirid, struct btrfs_root_ref, dirid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_root_ref_sequence, struct btrfs_root_ref, sequence, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_root_ref_name_len, struct btrfs_root_ref, name_len, 16);
+
+/* struct btrfs_dir_item */
+BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
+BTRFS_SETGET_FUNCS(dir_flags, struct btrfs_dir_item, type, 8);
+BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
+BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_flags, struct btrfs_dir_item, type, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, data_len, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, name_len, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, transid, 64);
+
+static inline u8 btrfs_dir_ftype(const struct extent_buffer *eb,
+ const struct btrfs_dir_item *item)
+{
+ return btrfs_dir_flags_to_ftype(btrfs_dir_flags(eb, item));
+}
+
+static inline u8 btrfs_stack_dir_ftype(const struct btrfs_dir_item *item)
+{
+ return btrfs_dir_flags_to_ftype(btrfs_stack_dir_flags(item));
+}
+
+static inline void btrfs_dir_item_key(const struct extent_buffer *eb,
+ const struct btrfs_dir_item *item,
+ struct btrfs_disk_key *key)
+{
+ read_eb_member(eb, item, struct btrfs_dir_item, location, key);
+}
+
+static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
+ struct btrfs_dir_item *item,
+ const struct btrfs_disk_key *key)
+{
+ write_eb_member(eb, item, struct btrfs_dir_item, location, key);
+}
+
+BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header,
+ num_entries, 64);
+BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header,
+ num_bitmaps, 64);
+BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header,
+ generation, 64);
+
+static inline void btrfs_free_space_key(const struct extent_buffer *eb,
+ const struct btrfs_free_space_header *h,
+ struct btrfs_disk_key *key)
+{
+ read_eb_member(eb, h, struct btrfs_free_space_header, location, key);
+}
+
+static inline void btrfs_set_free_space_key(struct extent_buffer *eb,
+ struct btrfs_free_space_header *h,
+ const struct btrfs_disk_key *key)
+{
+ write_eb_member(eb, h, struct btrfs_free_space_header, location, key);
+}
+
+/* struct btrfs_disk_key */
+BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
+BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
+
+#ifdef __LITTLE_ENDIAN
+
+/*
+ * Optimized helpers for little-endian architectures where CPU and on-disk
+ * structures have the same endianness and we can skip conversions.
+ */
+
+static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key,
+ const struct btrfs_disk_key *disk_key)
+{
+ memcpy(cpu_key, disk_key, sizeof(struct btrfs_key));
+}
+
+static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key,
+ const struct btrfs_key *cpu_key)
+{
+ memcpy(disk_key, cpu_key, sizeof(struct btrfs_key));
+}
+
+static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb,
+ struct btrfs_key *cpu_key, int nr)
+{
+ struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
+
+ btrfs_node_key(eb, disk_key, nr);
+}
+
+static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb,
+ struct btrfs_key *cpu_key, int nr)
+{
+ struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
+
+ btrfs_item_key(eb, disk_key, nr);
+}
+
+static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
+ const struct btrfs_dir_item *item,
+ struct btrfs_key *cpu_key)
+{
+ struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
+
+ btrfs_dir_item_key(eb, item, disk_key);
+}
+
+#else
+
+static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
+ const struct btrfs_disk_key *disk)
+{
+ cpu->offset = le64_to_cpu(disk->offset);
+ cpu->type = disk->type;
+ cpu->objectid = le64_to_cpu(disk->objectid);
+}
+
+static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
+ const struct btrfs_key *cpu)
+{
+ disk->offset = cpu_to_le64(cpu->offset);
+ disk->type = cpu->type;
+ disk->objectid = cpu_to_le64(cpu->objectid);
+}
+
+static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb,
+ struct btrfs_key *key, int nr)
+{
+ struct btrfs_disk_key disk_key;
+
+ btrfs_node_key(eb, &disk_key, nr);
+ btrfs_disk_key_to_cpu(key, &disk_key);
+}
+
+static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb,
+ struct btrfs_key *key, int nr)
+{
+ struct btrfs_disk_key disk_key;
+
+ btrfs_item_key(eb, &disk_key, nr);
+ btrfs_disk_key_to_cpu(key, &disk_key);
+}
+
+static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
+ const struct btrfs_dir_item *item,
+ struct btrfs_key *key)
+{
+ struct btrfs_disk_key disk_key;
+
+ btrfs_dir_item_key(eb, item, &disk_key);
+ btrfs_disk_key_to_cpu(key, &disk_key);
+}
+
+#endif
+
+/* struct btrfs_header */
+BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, generation, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
+BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header,
+ generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header, nritems, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64);
+
+static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag)
+{
+ return (btrfs_header_flags(eb) & flag) == flag;
+}
+
+static inline void btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
+{
+ u64 flags = btrfs_header_flags(eb);
+
+ btrfs_set_header_flags(eb, flags | flag);
+}
+
+static inline void btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
+{
+ u64 flags = btrfs_header_flags(eb);
+
+ btrfs_set_header_flags(eb, flags & ~flag);
+}
+
+static inline int btrfs_header_backref_rev(const struct extent_buffer *eb)
+{
+ u64 flags = btrfs_header_flags(eb);
+
+ return flags >> BTRFS_BACKREF_REV_SHIFT;
+}
+
+static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, int rev)
+{
+ u64 flags = btrfs_header_flags(eb);
+
+ flags &= ~BTRFS_BACKREF_REV_MASK;
+ flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT;
+ btrfs_set_header_flags(eb, flags);
+}
+
+static inline int btrfs_is_leaf(const struct extent_buffer *eb)
+{
+ return btrfs_header_level(eb) == 0;
+}
+
+/* struct btrfs_root_item */
+BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, generation, 64);
+BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
+BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64);
+BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8);
+BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
+BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
+BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
+BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
+ last_snapshot, 64);
+BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item,
+ generation_v2, 64);
+BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item, ctransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, otransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, stransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, rtransid, 64);
+
+/* struct btrfs_root_backup */
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
+ tree_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
+ tree_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
+ tree_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
+ chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
+ chunk_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
+ chunk_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
+ extent_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
+ extent_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
+ extent_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
+ fs_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
+ fs_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
+ fs_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
+ dev_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
+ dev_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
+ dev_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
+ csum_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
+ csum_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
+ csum_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
+ total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
+ bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
+ num_devices, 64);
+
+/* struct btrfs_balance_item */
+BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
+
+static inline void btrfs_balance_data(const struct extent_buffer *eb,
+ const struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_set_balance_data(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ const struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_balance_meta(const struct extent_buffer *eb,
+ const struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ const struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_balance_sys(const struct extent_buffer *eb,
+ const struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ const struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
+ const struct btrfs_disk_balance_args *disk)
+{
+ memset(cpu, 0, sizeof(*cpu));
+
+ cpu->profiles = le64_to_cpu(disk->profiles);
+ cpu->usage = le64_to_cpu(disk->usage);
+ cpu->devid = le64_to_cpu(disk->devid);
+ cpu->pstart = le64_to_cpu(disk->pstart);
+ cpu->pend = le64_to_cpu(disk->pend);
+ cpu->vstart = le64_to_cpu(disk->vstart);
+ cpu->vend = le64_to_cpu(disk->vend);
+ cpu->target = le64_to_cpu(disk->target);
+ cpu->flags = le64_to_cpu(disk->flags);
+ cpu->limit = le64_to_cpu(disk->limit);
+ cpu->stripes_min = le32_to_cpu(disk->stripes_min);
+ cpu->stripes_max = le32_to_cpu(disk->stripes_max);
+}
+
+static inline void btrfs_cpu_balance_args_to_disk(
+ struct btrfs_disk_balance_args *disk,
+ const struct btrfs_balance_args *cpu)
+{
+ memset(disk, 0, sizeof(*disk));
+
+ disk->profiles = cpu_to_le64(cpu->profiles);
+ disk->usage = cpu_to_le64(cpu->usage);
+ disk->devid = cpu_to_le64(cpu->devid);
+ disk->pstart = cpu_to_le64(cpu->pstart);
+ disk->pend = cpu_to_le64(cpu->pend);
+ disk->vstart = cpu_to_le64(cpu->vstart);
+ disk->vend = cpu_to_le64(cpu->vend);
+ disk->target = cpu_to_le64(cpu->target);
+ disk->flags = cpu_to_le64(cpu->flags);
+ disk->limit = cpu_to_le64(cpu->limit);
+ disk->stripes_min = cpu_to_le32(cpu->stripes_min);
+ disk->stripes_max = cpu_to_le32(cpu->stripes_max);
+}
+
+/* struct btrfs_super_block */
+BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
+ generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
+ struct btrfs_super_block, sys_chunk_array_size, 32);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation,
+ struct btrfs_super_block, chunk_root_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
+ root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
+ chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
+ chunk_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, log_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
+ log_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
+ total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
+ bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
+ sectorsize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
+ nodesize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
+ stripesize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
+ root_dir_objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
+ num_devices, 64);
+BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
+ compat_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
+ compat_ro_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
+ incompat_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
+ csum_type, 16);
+BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
+ cache_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64);
+BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
+ uuid_tree_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_nr_global_roots, struct btrfs_super_block,
+ nr_global_roots, 64);
+
+/* struct btrfs_file_extent_item */
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item,
+ type, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr,
+ struct btrfs_file_extent_item, disk_bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset,
+ struct btrfs_file_extent_item, offset, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation,
+ struct btrfs_file_extent_item, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes,
+ struct btrfs_file_extent_item, num_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes,
+ struct btrfs_file_extent_item, ram_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes,
+ struct btrfs_file_extent_item, disk_num_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
+ struct btrfs_file_extent_item, compression, 8);
+
+
+BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
+BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
+ disk_bytenr, 64);
+BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item,
+ disk_num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
+ offset, 64);
+BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
+ num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
+ ram_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
+ compression, 8);
+BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
+ encryption, 8);
+BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
+ other_encoding, 16);
+
+/* btrfs_qgroup_status_item */
+BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item,
+ version, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
+ flags, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item,
+ rescan, 64);
+
+/* btrfs_qgroup_info_item */
+BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item,
+ rfer_cmpr, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item,
+ excl_cmpr, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation,
+ struct btrfs_qgroup_info_item, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item,
+ rfer, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr,
+ struct btrfs_qgroup_info_item, rfer_cmpr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item,
+ excl, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr,
+ struct btrfs_qgroup_info_item, excl_cmpr, 64);
+
+/* btrfs_qgroup_limit_item */
+BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item, flags, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item,
+ max_rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item,
+ max_excl, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
+ rsv_rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
+ rsv_excl, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_flags,
+ struct btrfs_qgroup_limit_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_max_rfer,
+ struct btrfs_qgroup_limit_item, max_rfer, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_max_excl,
+ struct btrfs_qgroup_limit_item, max_excl, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_rsv_rfer,
+ struct btrfs_qgroup_limit_item, rsv_rfer, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_rsv_excl,
+ struct btrfs_qgroup_limit_item, rsv_excl, 64);
+
+/* btrfs_dev_replace_item */
+BTRFS_SETGET_FUNCS(dev_replace_src_devid,
+ struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode,
+ struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode,
+ 64);
+BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item,
+ replace_state, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item,
+ time_started, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item,
+ time_stopped, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item,
+ num_write_errors, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors,
+ struct btrfs_dev_replace_item, num_uncorrectable_read_errors,
+ 64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item,
+ cursor_left, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item,
+ cursor_right, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid,
+ struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode,
+ struct btrfs_dev_replace_item,
+ cont_reading_from_srcdev_mode, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state,
+ struct btrfs_dev_replace_item, replace_state, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started,
+ struct btrfs_dev_replace_item, time_started, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped,
+ struct btrfs_dev_replace_item, time_stopped, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors,
+ struct btrfs_dev_replace_item, num_write_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors,
+ struct btrfs_dev_replace_item,
+ num_uncorrectable_read_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left,
+ struct btrfs_dev_replace_item, cursor_left, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
+ struct btrfs_dev_replace_item, cursor_right, 64);
+
+/* btrfs_verity_descriptor_item */
+BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item,
+ encryption, 8);
+BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item,
+ size, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption,
+ struct btrfs_verity_descriptor_item, encryption, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size,
+ struct btrfs_verity_descriptor_item, size, 64);
+
+/* Cast into the data area of the leaf. */
+#define btrfs_item_ptr(leaf, slot, type) \
+ ((type *)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot)))
+
+#define btrfs_item_ptr_offset(leaf, slot) \
+ ((unsigned long)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot)))
+
+#endif
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 548d6a5477b4..3da1779e8b79 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -11,10 +11,10 @@
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
-
#include "ctree.h"
#include "btrfs_inode.h"
#include "xattr.h"
+#include "acl.h"
struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu)
{
@@ -110,10 +110,11 @@ out:
return ret;
}
-int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int ret;
+ struct inode *inode = d_inode(dentry);
umode_t old_mode = inode->i_mode;
if (type == ACL_TYPE_ACCESS && acl) {
diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h
new file mode 100644
index 000000000000..39bd36e6eeb7
--- /dev/null
+++ b/fs/btrfs/acl.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_ACL_H
+#define BTRFS_ACL_H
+
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+
+struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu);
+int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct posix_acl *acl, int type);
+int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
+ struct posix_acl *acl, int type);
+
+#else
+
+#define btrfs_get_acl NULL
+#define btrfs_set_acl NULL
+static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct posix_acl *acl,
+ int type)
+{
+ return -EOPNOTSUPP;
+}
+
+#endif
+
+#endif
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index d385357e19b6..21c92c74bf71 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -15,49 +15,76 @@
#include "locking.h"
#include "misc.h"
#include "tree-mod-log.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "relocation.h"
+#include "tree-checker.h"
-/* Just an arbitrary number so we can be sure this happened */
-#define BACKREF_FOUND_SHARED 6
+/* Just arbitrary numbers so we can be sure one of these happened. */
+#define BACKREF_FOUND_SHARED 6
+#define BACKREF_FOUND_NOT_SHARED 7
struct extent_inode_elem {
u64 inum;
u64 offset;
+ u64 num_bytes;
struct extent_inode_elem *next;
};
-static int check_extent_in_eb(const struct btrfs_key *key,
+static int check_extent_in_eb(struct btrfs_backref_walk_ctx *ctx,
+ const struct btrfs_key *key,
const struct extent_buffer *eb,
const struct btrfs_file_extent_item *fi,
- u64 extent_item_pos,
- struct extent_inode_elem **eie,
- bool ignore_offset)
+ struct extent_inode_elem **eie)
{
- u64 offset = 0;
+ const u64 data_len = btrfs_file_extent_num_bytes(eb, fi);
+ u64 offset = key->offset;
struct extent_inode_elem *e;
+ const u64 *root_ids;
+ int root_count;
+ bool cached;
- if (!ignore_offset &&
- !btrfs_file_extent_compression(eb, fi) &&
+ if (!btrfs_file_extent_compression(eb, fi) &&
!btrfs_file_extent_encryption(eb, fi) &&
!btrfs_file_extent_other_encoding(eb, fi)) {
u64 data_offset;
- u64 data_len;
data_offset = btrfs_file_extent_offset(eb, fi);
- data_len = btrfs_file_extent_num_bytes(eb, fi);
- if (extent_item_pos < data_offset ||
- extent_item_pos >= data_offset + data_len)
+ if (ctx->extent_item_pos < data_offset ||
+ ctx->extent_item_pos >= data_offset + data_len)
return 1;
- offset = extent_item_pos - data_offset;
+ offset += ctx->extent_item_pos - data_offset;
+ }
+
+ if (!ctx->indirect_ref_iterator || !ctx->cache_lookup)
+ goto add_inode_elem;
+
+ cached = ctx->cache_lookup(eb->start, ctx->user_ctx, &root_ids,
+ &root_count);
+ if (!cached)
+ goto add_inode_elem;
+
+ for (int i = 0; i < root_count; i++) {
+ int ret;
+
+ ret = ctx->indirect_ref_iterator(key->objectid, offset,
+ data_len, root_ids[i],
+ ctx->user_ctx);
+ if (ret)
+ return ret;
}
+add_inode_elem:
e = kmalloc(sizeof(*e), GFP_NOFS);
if (!e)
return -ENOMEM;
e->next = *eie;
e->inum = key->objectid;
- e->offset = key->offset + offset;
+ e->offset = offset;
+ e->num_bytes = data_len;
*eie = e;
return 0;
@@ -73,10 +100,9 @@ static void free_inode_elem_list(struct extent_inode_elem *eie)
}
}
-static int find_extent_in_eb(const struct extent_buffer *eb,
- u64 wanted_disk_byte, u64 extent_item_pos,
- struct extent_inode_elem **eie,
- bool ignore_offset)
+static int find_extent_in_eb(struct btrfs_backref_walk_ctx *ctx,
+ const struct extent_buffer *eb,
+ struct extent_inode_elem **eie)
{
u64 disk_byte;
struct btrfs_key key;
@@ -102,11 +128,11 @@ static int find_extent_in_eb(const struct extent_buffer *eb,
continue;
/* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
- if (disk_byte != wanted_disk_byte)
+ if (disk_byte != ctx->bytenr)
continue;
- ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie, ignore_offset);
- if (ret < 0)
+ ret = check_extent_in_eb(ctx, &key, eb, fi, eie);
+ if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0)
return ret;
}
@@ -135,9 +161,30 @@ struct preftrees {
* - decremented when a ref->count transitions to <1
*/
struct share_check {
- u64 root_objectid;
+ struct btrfs_backref_share_check_ctx *ctx;
+ struct btrfs_root *root;
u64 inum;
+ u64 data_bytenr;
+ u64 data_extent_gen;
+ /*
+ * Counts number of inodes that refer to an extent (different inodes in
+ * the same root or different roots) that we could find. The sharedness
+ * check typically stops once this counter gets greater than 1, so it
+ * may not reflect the total number of inodes.
+ */
int share_count;
+ /*
+ * The number of times we found our inode refers to the data extent we
+ * are determining the sharedness. In other words, how many file extent
+ * items we could find for our inode that point to our target data
+ * extent. The value we get here after finishing the extent sharedness
+ * check may be smaller than reality, but if it ends up being greater
+ * than 1, then we know for sure the inode has multiple file extent
+ * items that point to our inode, and we can safely assume it's useful
+ * to cache the sharedness check result.
+ */
+ int self_ref_count;
+ bool have_delayed_delete_refs;
};
static inline int extent_is_shared(struct share_check *sc)
@@ -206,7 +253,7 @@ static int prelim_ref_compare(struct prelim_ref *ref1,
}
static void update_share_count(struct share_check *sc, int oldcount,
- int newcount)
+ int newcount, struct prelim_ref *newref)
{
if ((!sc) || (oldcount == 0 && newcount < 1))
return;
@@ -215,6 +262,11 @@ static void update_share_count(struct share_check *sc, int oldcount,
sc->share_count--;
else if (oldcount < 1 && newcount > 0)
sc->share_count++;
+
+ if (newref->root_id == sc->root->root_key.objectid &&
+ newref->wanted_disk_byte == sc->data_bytenr &&
+ newref->key_for_search.objectid == sc->inum)
+ sc->self_ref_count += newref->count;
}
/*
@@ -265,14 +317,14 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
* BTRFS_[ADD|DROP]_DELAYED_REF actions.
*/
update_share_count(sc, ref->count,
- ref->count + newref->count);
+ ref->count + newref->count, newref);
ref->count += newref->count;
free_pref(newref);
return;
}
}
- update_share_count(sc, 0, newref->count);
+ update_share_count(sc, 0, newref->count, newref);
preftree->count++;
trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count);
rb_link_node(&newref->rbnode, parent, p);
@@ -288,8 +340,10 @@ static void prelim_release(struct preftree *preftree)
struct prelim_ref *ref, *next_ref;
rbtree_postorder_for_each_entry_safe(ref, next_ref,
- &preftree->root.rb_root, rbnode)
+ &preftree->root.rb_root, rbnode) {
+ free_inode_elem_list(ref->inode_list);
free_pref(ref);
+ }
preftree->root = RB_ROOT_CACHED;
preftree->count = 0;
@@ -413,11 +467,11 @@ static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr)
return 0;
}
-static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
+static int add_all_parents(struct btrfs_backref_walk_ctx *ctx,
+ struct btrfs_root *root, struct btrfs_path *path,
struct ulist *parents,
struct preftrees *preftrees, struct prelim_ref *ref,
- int level, u64 time_seq, const u64 *extent_item_pos,
- bool ignore_offset)
+ int level)
{
int ret = 0;
int slot;
@@ -453,10 +507,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
if (path->slots[0] >= btrfs_header_nritems(eb) ||
is_shared_data_backref(preftrees, eb->start) ||
ref->root_id != btrfs_header_owner(eb)) {
- if (time_seq == BTRFS_SEQ_LAST)
+ if (ctx->time_seq == BTRFS_SEQ_LAST)
ret = btrfs_next_leaf(root, path);
else
- ret = btrfs_next_old_leaf(root, path, time_seq);
+ ret = btrfs_next_old_leaf(root, path, ctx->time_seq);
}
while (!ret && count < ref->count) {
@@ -477,10 +531,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
if (slot == 0 &&
(is_shared_data_backref(preftrees, eb->start) ||
ref->root_id != btrfs_header_owner(eb))) {
- if (time_seq == BTRFS_SEQ_LAST)
+ if (ctx->time_seq == BTRFS_SEQ_LAST)
ret = btrfs_next_leaf(root, path);
else
- ret = btrfs_next_old_leaf(root, path, time_seq);
+ ret = btrfs_next_old_leaf(root, path, ctx->time_seq);
continue;
}
fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
@@ -494,11 +548,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
count++;
else
goto next;
- if (extent_item_pos) {
- ret = check_extent_in_eb(&key, eb, fi,
- *extent_item_pos,
- &eie, ignore_offset);
- if (ret < 0)
+ if (!ctx->ignore_extent_item_pos) {
+ ret = check_extent_in_eb(ctx, &key, eb, fi, &eie);
+ if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP ||
+ ret < 0)
break;
}
if (ret > 0)
@@ -507,7 +560,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
eie, (void **)&old, GFP_NOFS);
if (ret < 0)
break;
- if (!ret && extent_item_pos) {
+ if (!ret && !ctx->ignore_extent_item_pos) {
while (old->next)
old = old->next;
old->next = eie;
@@ -515,16 +568,17 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
eie = NULL;
}
next:
- if (time_seq == BTRFS_SEQ_LAST)
+ if (ctx->time_seq == BTRFS_SEQ_LAST)
ret = btrfs_next_item(root, path);
else
- ret = btrfs_next_old_item(root, path, time_seq);
+ ret = btrfs_next_old_item(root, path, ctx->time_seq);
}
- if (ret > 0)
- ret = 0;
- else if (ret < 0)
+ if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0)
free_inode_elem_list(eie);
+ else if (ret > 0)
+ ret = 0;
+
return ret;
}
@@ -532,11 +586,10 @@ next:
* resolve an indirect backref in the form (root_id, key, level)
* to a logical address
*/
-static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path, u64 time_seq,
+static int resolve_indirect_ref(struct btrfs_backref_walk_ctx *ctx,
+ struct btrfs_path *path,
struct preftrees *preftrees,
- struct prelim_ref *ref, struct ulist *parents,
- const u64 *extent_item_pos, bool ignore_offset)
+ struct prelim_ref *ref, struct ulist *parents)
{
struct btrfs_root *root;
struct extent_buffer *eb;
@@ -554,9 +607,9 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
* here.
*/
if (path->search_commit_root)
- root = btrfs_get_fs_root_commit_root(fs_info, path, ref->root_id);
+ root = btrfs_get_fs_root_commit_root(ctx->fs_info, path, ref->root_id);
else
- root = btrfs_get_fs_root(fs_info, ref->root_id, false);
+ root = btrfs_get_fs_root(ctx->fs_info, ref->root_id, false);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
goto out_free;
@@ -568,17 +621,17 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
goto out;
}
- if (btrfs_is_testing(fs_info)) {
+ if (btrfs_is_testing(ctx->fs_info)) {
ret = -ENOENT;
goto out;
}
if (path->search_commit_root)
root_level = btrfs_header_level(root->commit_root);
- else if (time_seq == BTRFS_SEQ_LAST)
+ else if (ctx->time_seq == BTRFS_SEQ_LAST)
root_level = btrfs_header_level(root->node);
else
- root_level = btrfs_old_root_level(root, time_seq);
+ root_level = btrfs_old_root_level(root, ctx->time_seq);
if (root_level + 1 == level)
goto out;
@@ -606,12 +659,12 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
search_key.offset >= LLONG_MAX)
search_key.offset = 0;
path->lowest_level = level;
- if (time_seq == BTRFS_SEQ_LAST)
+ if (ctx->time_seq == BTRFS_SEQ_LAST)
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
else
- ret = btrfs_search_old_slot(root, &search_key, path, time_seq);
+ ret = btrfs_search_old_slot(root, &search_key, path, ctx->time_seq);
- btrfs_debug(fs_info,
+ btrfs_debug(ctx->fs_info,
"search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)",
ref->root_id, level, ref->count, ret,
ref->key_for_search.objectid, ref->key_for_search.type,
@@ -629,8 +682,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
eb = path->nodes[level];
}
- ret = add_all_parents(root, path, parents, preftrees, ref, level,
- time_seq, extent_item_pos, ignore_offset);
+ ret = add_all_parents(ctx, root, path, parents, preftrees, ref, level);
out:
btrfs_put_root(root);
out_free:
@@ -647,6 +699,18 @@ unode_aux_to_inode_list(struct ulist_node *node)
return (struct extent_inode_elem *)(uintptr_t)node->aux;
}
+static void free_leaf_list(struct ulist *ulist)
+{
+ struct ulist_node *node;
+ struct ulist_iterator uiter;
+
+ ULIST_ITER_INIT(&uiter);
+ while ((node = ulist_next(ulist, &uiter)))
+ free_inode_elem_list(unode_aux_to_inode_list(node));
+
+ ulist_free(ulist);
+}
+
/*
* We maintain three separate rbtrees: one for direct refs, one for
* indirect refs which have a key, and one for indirect refs which do not
@@ -663,11 +727,10 @@ unode_aux_to_inode_list(struct ulist_node *node)
* rbtree as they are encountered. The new backrefs are subsequently
* resolved as above.
*/
-static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path, u64 time_seq,
+static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx,
+ struct btrfs_path *path,
struct preftrees *preftrees,
- const u64 *extent_item_pos,
- struct share_check *sc, bool ignore_offset)
+ struct share_check *sc)
{
int err;
int ret = 0;
@@ -704,21 +767,18 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
continue;
}
- if (sc && sc->root_objectid &&
- ref->root_id != sc->root_objectid) {
+ if (sc && ref->root_id != sc->root->root_key.objectid) {
free_pref(ref);
ret = BACKREF_FOUND_SHARED;
goto out;
}
- err = resolve_indirect_ref(fs_info, path, time_seq, preftrees,
- ref, parents, extent_item_pos,
- ignore_offset);
+ err = resolve_indirect_ref(ctx, path, preftrees, ref, parents);
/*
* we can only tolerate ENOENT,otherwise,we should catch error
* and return directly.
*/
if (err == -ENOENT) {
- prelim_ref_insert(fs_info, &preftrees->direct, ref,
+ prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref,
NULL);
continue;
} else if (err) {
@@ -747,7 +807,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
memcpy(new_ref, ref, sizeof(*ref));
new_ref->parent = node->val;
new_ref->inode_list = unode_aux_to_inode_list(node);
- prelim_ref_insert(fs_info, &preftrees->direct,
+ prelim_ref_insert(ctx->fs_info, &preftrees->direct,
new_ref, NULL);
}
@@ -755,13 +815,17 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
* Now it's a direct ref, put it in the direct tree. We must
* do this last because the ref could be merged/freed here.
*/
- prelim_ref_insert(fs_info, &preftrees->direct, ref, NULL);
+ prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref, NULL);
ulist_reinit(parents);
cond_resched();
}
out:
- ulist_free(parents);
+ /*
+ * We may have inode lists attached to refs in the parents ulist, so we
+ * must free them before freeing the ulist and its refs.
+ */
+ free_leaf_list(parents);
return ret;
}
@@ -777,6 +841,8 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
struct rb_node *node;
while ((node = rb_first_cached(&tree->root))) {
+ struct btrfs_tree_parent_check check = { 0 };
+
ref = rb_entry(node, struct prelim_ref, rbnode);
rb_erase_cached(node, &tree->root);
@@ -784,8 +850,10 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
BUG_ON(ref->key_for_search.type);
BUG_ON(!ref->wanted_disk_byte);
- eb = read_tree_block(fs_info, ref->wanted_disk_byte,
- ref->root_id, 0, ref->level - 1, NULL);
+ check.level = ref->level - 1;
+ check.owner_root = ref->root_id;
+
+ eb = read_tree_block(fs_info, ref->wanted_disk_byte, &check);
if (IS_ERR(eb)) {
free_pref(ref);
return PTR_ERR(eb);
@@ -820,16 +888,11 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
struct preftrees *preftrees, struct share_check *sc)
{
struct btrfs_delayed_ref_node *node;
- struct btrfs_delayed_extent_op *extent_op = head->extent_op;
struct btrfs_key key;
- struct btrfs_key tmp_op_key;
struct rb_node *n;
int count;
int ret = 0;
- if (extent_op && extent_op->update_key)
- btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key);
-
spin_lock(&head->lock);
for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) {
node = rb_entry(n, struct btrfs_delayed_ref_node,
@@ -855,10 +918,16 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
case BTRFS_TREE_BLOCK_REF_KEY: {
/* NORMAL INDIRECT METADATA backref */
struct btrfs_delayed_tree_ref *ref;
+ struct btrfs_key *key_ptr = NULL;
+
+ if (head->extent_op && head->extent_op->update_key) {
+ btrfs_disk_key_to_cpu(&key, &head->extent_op->key);
+ key_ptr = &key;
+ }
ref = btrfs_delayed_node_to_tree_ref(node);
ret = add_indirect_ref(fs_info, preftrees, ref->root,
- &tmp_op_key, ref->level + 1,
+ key_ptr, ref->level + 1,
node->bytenr, count, sc,
GFP_ATOMIC);
break;
@@ -884,13 +953,22 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
key.offset = ref->offset;
/*
- * Found a inum that doesn't match our known inum, we
- * know it's shared.
+ * If we have a share check context and a reference for
+ * another inode, we can't exit immediately. This is
+ * because even if this is a BTRFS_ADD_DELAYED_REF
+ * reference we may find next a BTRFS_DROP_DELAYED_REF
+ * which cancels out this ADD reference.
+ *
+ * If this is a DROP reference and there was no previous
+ * ADD reference, then we need to signal that when we
+ * process references from the extent tree (through
+ * add_inline_refs() and add_keyed_refs()), we should
+ * not exit early if we find a reference for another
+ * inode, because one of the delayed DROP references
+ * may cancel that reference in the extent tree.
*/
- if (sc && sc->inum && ref->objectid != sc->inum) {
- ret = BACKREF_FOUND_SHARED;
- goto out;
- }
+ if (sc && count < 0)
+ sc->have_delayed_delete_refs = true;
ret = add_indirect_ref(fs_info, preftrees, ref->root,
&key, 0, node->bytenr, count, sc,
@@ -920,7 +998,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
}
if (!ret)
ret = extent_is_shared(sc);
-out:
+
spin_unlock(&head->lock);
return ret;
}
@@ -930,8 +1008,8 @@ out:
*
* Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED.
*/
-static int add_inline_refs(const struct btrfs_fs_info *fs_info,
- struct btrfs_path *path, u64 bytenr,
+static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx,
+ struct btrfs_path *path,
int *info_level, struct preftrees *preftrees,
struct share_check *sc)
{
@@ -956,6 +1034,13 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
BUG_ON(item_size < sizeof(*ei));
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+
+ if (ctx->check_extent_item) {
+ ret = ctx->check_extent_item(ctx->bytenr, ei, leaf, ctx->user_ctx);
+ if (ret)
+ return ret;
+ }
+
flags = btrfs_extent_flags(leaf, ei);
btrfs_item_key_to_cpu(leaf, &found_key, slot);
@@ -991,9 +1076,9 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
switch (type) {
case BTRFS_SHARED_BLOCK_REF_KEY:
- ret = add_direct_ref(fs_info, preftrees,
+ ret = add_direct_ref(ctx->fs_info, preftrees,
*info_level + 1, offset,
- bytenr, 1, NULL, GFP_NOFS);
+ ctx->bytenr, 1, NULL, GFP_NOFS);
break;
case BTRFS_SHARED_DATA_REF_KEY: {
struct btrfs_shared_data_ref *sdref;
@@ -1002,14 +1087,14 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
sdref = (struct btrfs_shared_data_ref *)(iref + 1);
count = btrfs_shared_data_ref_count(leaf, sdref);
- ret = add_direct_ref(fs_info, preftrees, 0, offset,
- bytenr, count, sc, GFP_NOFS);
+ ret = add_direct_ref(ctx->fs_info, preftrees, 0, offset,
+ ctx->bytenr, count, sc, GFP_NOFS);
break;
}
case BTRFS_TREE_BLOCK_REF_KEY:
- ret = add_indirect_ref(fs_info, preftrees, offset,
+ ret = add_indirect_ref(ctx->fs_info, preftrees, offset,
NULL, *info_level + 1,
- bytenr, 1, NULL, GFP_NOFS);
+ ctx->bytenr, 1, NULL, GFP_NOFS);
break;
case BTRFS_EXTENT_DATA_REF_KEY: {
struct btrfs_extent_data_ref *dref;
@@ -1023,16 +1108,20 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
- if (sc && sc->inum && key.objectid != sc->inum) {
+ if (sc && key.objectid != sc->inum &&
+ !sc->have_delayed_delete_refs) {
ret = BACKREF_FOUND_SHARED;
break;
}
root = btrfs_extent_data_ref_root(leaf, dref);
- ret = add_indirect_ref(fs_info, preftrees, root,
- &key, 0, bytenr, count,
- sc, GFP_NOFS);
+ if (!ctx->skip_data_ref ||
+ !ctx->skip_data_ref(root, key.objectid, key.offset,
+ ctx->user_ctx))
+ ret = add_indirect_ref(ctx->fs_info, preftrees,
+ root, &key, 0, ctx->bytenr,
+ count, sc, GFP_NOFS);
break;
}
default:
@@ -1051,8 +1140,9 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
*
* Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED.
*/
-static int add_keyed_refs(struct btrfs_root *extent_root,
- struct btrfs_path *path, u64 bytenr,
+static int add_keyed_refs(struct btrfs_backref_walk_ctx *ctx,
+ struct btrfs_root *extent_root,
+ struct btrfs_path *path,
int info_level, struct preftrees *preftrees,
struct share_check *sc)
{
@@ -1075,7 +1165,7 @@ static int add_keyed_refs(struct btrfs_root *extent_root,
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
- if (key.objectid != bytenr)
+ if (key.objectid != ctx->bytenr)
break;
if (key.type < BTRFS_TREE_BLOCK_REF_KEY)
continue;
@@ -1087,7 +1177,7 @@ static int add_keyed_refs(struct btrfs_root *extent_root,
/* SHARED DIRECT METADATA backref */
ret = add_direct_ref(fs_info, preftrees,
info_level + 1, key.offset,
- bytenr, 1, NULL, GFP_NOFS);
+ ctx->bytenr, 1, NULL, GFP_NOFS);
break;
case BTRFS_SHARED_DATA_REF_KEY: {
/* SHARED DIRECT FULL backref */
@@ -1098,14 +1188,14 @@ static int add_keyed_refs(struct btrfs_root *extent_root,
struct btrfs_shared_data_ref);
count = btrfs_shared_data_ref_count(leaf, sdref);
ret = add_direct_ref(fs_info, preftrees, 0,
- key.offset, bytenr, count,
+ key.offset, ctx->bytenr, count,
sc, GFP_NOFS);
break;
}
case BTRFS_TREE_BLOCK_REF_KEY:
/* NORMAL INDIRECT METADATA backref */
ret = add_indirect_ref(fs_info, preftrees, key.offset,
- NULL, info_level + 1, bytenr,
+ NULL, info_level + 1, ctx->bytenr,
1, NULL, GFP_NOFS);
break;
case BTRFS_EXTENT_DATA_REF_KEY: {
@@ -1122,15 +1212,20 @@ static int add_keyed_refs(struct btrfs_root *extent_root,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
- if (sc && sc->inum && key.objectid != sc->inum) {
+ if (sc && key.objectid != sc->inum &&
+ !sc->have_delayed_delete_refs) {
ret = BACKREF_FOUND_SHARED;
break;
}
root = btrfs_extent_data_ref_root(leaf, dref);
- ret = add_indirect_ref(fs_info, preftrees, root,
- &key, 0, bytenr, count,
- sc, GFP_NOFS);
+
+ if (!ctx->skip_data_ref ||
+ !ctx->skip_data_ref(root, key.objectid, key.offset,
+ ctx->user_ctx))
+ ret = add_indirect_ref(fs_info, preftrees, root,
+ &key, 0, ctx->bytenr,
+ count, sc, GFP_NOFS);
break;
}
default:
@@ -1145,34 +1240,141 @@ static int add_keyed_refs(struct btrfs_root *extent_root,
}
/*
+ * The caller has joined a transaction or is holding a read lock on the
+ * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
+ * snapshot field changing while updating or checking the cache.
+ */
+static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx,
+ struct btrfs_root *root,
+ u64 bytenr, int level, bool *is_shared)
+{
+ struct btrfs_backref_shared_cache_entry *entry;
+
+ if (!ctx->use_path_cache)
+ return false;
+
+ if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
+ return false;
+
+ /*
+ * Level -1 is used for the data extent, which is not reliable to cache
+ * because its reference count can increase or decrease without us
+ * realizing. We cache results only for extent buffers that lead from
+ * the root node down to the leaf with the file extent item.
+ */
+ ASSERT(level >= 0);
+
+ entry = &ctx->path_cache_entries[level];
+
+ /* Unused cache entry or being used for some other extent buffer. */
+ if (entry->bytenr != bytenr)
+ return false;
+
+ /*
+ * We cached a false result, but the last snapshot generation of the
+ * root changed, so we now have a snapshot. Don't trust the result.
+ */
+ if (!entry->is_shared &&
+ entry->gen != btrfs_root_last_snapshot(&root->root_item))
+ return false;
+
+ /*
+ * If we cached a true result and the last generation used for dropping
+ * a root changed, we can not trust the result, because the dropped root
+ * could be a snapshot sharing this extent buffer.
+ */
+ if (entry->is_shared &&
+ entry->gen != btrfs_get_last_root_drop_gen(root->fs_info))
+ return false;
+
+ *is_shared = entry->is_shared;
+ /*
+ * If the node at this level is shared, than all nodes below are also
+ * shared. Currently some of the nodes below may be marked as not shared
+ * because we have just switched from one leaf to another, and switched
+ * also other nodes above the leaf and below the current level, so mark
+ * them as shared.
+ */
+ if (*is_shared) {
+ for (int i = 0; i < level; i++) {
+ ctx->path_cache_entries[i].is_shared = true;
+ ctx->path_cache_entries[i].gen = entry->gen;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * The caller has joined a transaction or is holding a read lock on the
+ * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
+ * snapshot field changing while updating or checking the cache.
+ */
+static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx,
+ struct btrfs_root *root,
+ u64 bytenr, int level, bool is_shared)
+{
+ struct btrfs_backref_shared_cache_entry *entry;
+ u64 gen;
+
+ if (!ctx->use_path_cache)
+ return;
+
+ if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
+ return;
+
+ /*
+ * Level -1 is used for the data extent, which is not reliable to cache
+ * because its reference count can increase or decrease without us
+ * realizing. We cache results only for extent buffers that lead from
+ * the root node down to the leaf with the file extent item.
+ */
+ ASSERT(level >= 0);
+
+ if (is_shared)
+ gen = btrfs_get_last_root_drop_gen(root->fs_info);
+ else
+ gen = btrfs_root_last_snapshot(&root->root_item);
+
+ entry = &ctx->path_cache_entries[level];
+ entry->bytenr = bytenr;
+ entry->is_shared = is_shared;
+ entry->gen = gen;
+
+ /*
+ * If we found an extent buffer is shared, set the cache result for all
+ * extent buffers below it to true. As nodes in the path are COWed,
+ * their sharedness is moved to their children, and if a leaf is COWed,
+ * then the sharedness of a data extent becomes direct, the refcount of
+ * data extent is increased in the extent item at the extent tree.
+ */
+ if (is_shared) {
+ for (int i = 0; i < level; i++) {
+ entry = &ctx->path_cache_entries[i];
+ entry->is_shared = is_shared;
+ entry->gen = gen;
+ }
+ }
+}
+
+/*
* this adds all existing backrefs (inline backrefs, backrefs and delayed
* refs) for the given bytenr to the refs list, merges duplicates and resolves
* indirect refs to their parent bytenr.
* When roots are found, they're added to the roots list
*
- * If time_seq is set to BTRFS_SEQ_LAST, it will not search delayed_refs, and
- * behave much like trans == NULL case, the difference only lies in it will not
- * commit root.
- * The special case is for qgroup to search roots in commit_transaction().
- *
- * @sc - if !NULL, then immediately return BACKREF_FOUND_SHARED when a
- * shared extent is detected.
+ * @ctx: Backref walking context object, must be not NULL.
+ * @sc: If !NULL, then immediately return BACKREF_FOUND_SHARED when a
+ * shared extent is detected.
*
* Otherwise this returns 0 for success and <0 for an error.
*
- * If ignore_offset is set to false, only extent refs whose offsets match
- * extent_item_pos are returned. If true, every extent ref is returned
- * and extent_item_pos is ignored.
- *
* FIXME some caching might speed things up
*/
-static int find_parent_nodes(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist *refs,
- struct ulist *roots, const u64 *extent_item_pos,
- struct share_check *sc, bool ignore_offset)
+static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx,
+ struct share_check *sc)
{
- struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
+ struct btrfs_root *root = btrfs_extent_root(ctx->fs_info, ctx->bytenr);
struct btrfs_key key;
struct btrfs_path *path;
struct btrfs_delayed_ref_root *delayed_refs = NULL;
@@ -1188,9 +1390,13 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
.indirect_missing_keys = PREFTREE_INIT
};
- key.objectid = bytenr;
+ /* Roots ulist is not needed when using a sharedness check context. */
+ if (sc)
+ ASSERT(ctx->roots == NULL);
+
+ key.objectid = ctx->bytenr;
key.offset = (u64)-1;
- if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ if (btrfs_fs_incompat(ctx->fs_info, SKINNY_METADATA))
key.type = BTRFS_METADATA_ITEM_KEY;
else
key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -1198,12 +1404,12 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- if (!trans) {
+ if (!ctx->trans) {
path->search_commit_root = 1;
path->skip_locking = 1;
}
- if (time_seq == BTRFS_SEQ_LAST)
+ if (ctx->time_seq == BTRFS_SEQ_LAST)
path->skip_locking = 1;
again:
@@ -1219,17 +1425,17 @@ again:
goto out;
}
- if (trans && likely(trans->type != __TRANS_DUMMY) &&
- time_seq != BTRFS_SEQ_LAST) {
+ if (ctx->trans && likely(ctx->trans->type != __TRANS_DUMMY) &&
+ ctx->time_seq != BTRFS_SEQ_LAST) {
/*
* We have a specific time_seq we care about and trans which
* means we have the path lock, we need to grab the ref head and
* lock it so we have a consistent view of the refs at the given
* time.
*/
- delayed_refs = &trans->transaction->delayed_refs;
+ delayed_refs = &ctx->trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
+ head = btrfs_find_delayed_ref_head(delayed_refs, ctx->bytenr);
if (head) {
if (!mutex_trylock(&head->mutex)) {
refcount_inc(&head->refs);
@@ -1247,7 +1453,7 @@ again:
goto again;
}
spin_unlock(&delayed_refs->lock);
- ret = add_delayed_refs(fs_info, head, time_seq,
+ ret = add_delayed_refs(ctx->fs_info, head, ctx->time_seq,
&preftrees, sc);
mutex_unlock(&head->mutex);
if (ret)
@@ -1265,30 +1471,96 @@ again:
leaf = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
- if (key.objectid == bytenr &&
+ if (key.objectid == ctx->bytenr &&
(key.type == BTRFS_EXTENT_ITEM_KEY ||
key.type == BTRFS_METADATA_ITEM_KEY)) {
- ret = add_inline_refs(fs_info, path, bytenr,
- &info_level, &preftrees, sc);
+ ret = add_inline_refs(ctx, path, &info_level,
+ &preftrees, sc);
if (ret)
goto out;
- ret = add_keyed_refs(root, path, bytenr, info_level,
+ ret = add_keyed_refs(ctx, root, path, info_level,
&preftrees, sc);
if (ret)
goto out;
}
}
+ /*
+ * If we have a share context and we reached here, it means the extent
+ * is not directly shared (no multiple reference items for it),
+ * otherwise we would have exited earlier with a return value of
+ * BACKREF_FOUND_SHARED after processing delayed references or while
+ * processing inline or keyed references from the extent tree.
+ * The extent may however be indirectly shared through shared subtrees
+ * as a result from creating snapshots, so we determine below what is
+ * its parent node, in case we are dealing with a metadata extent, or
+ * what's the leaf (or leaves), from a fs tree, that has a file extent
+ * item pointing to it in case we are dealing with a data extent.
+ */
+ ASSERT(extent_is_shared(sc) == 0);
+
+ /*
+ * If we are here for a data extent and we have a share_check structure
+ * it means the data extent is not directly shared (does not have
+ * multiple reference items), so we have to check if a path in the fs
+ * tree (going from the root node down to the leaf that has the file
+ * extent item pointing to the data extent) is shared, that is, if any
+ * of the extent buffers in the path is referenced by other trees.
+ */
+ if (sc && ctx->bytenr == sc->data_bytenr) {
+ /*
+ * If our data extent is from a generation more recent than the
+ * last generation used to snapshot the root, then we know that
+ * it can not be shared through subtrees, so we can skip
+ * resolving indirect references, there's no point in
+ * determining the extent buffers for the path from the fs tree
+ * root node down to the leaf that has the file extent item that
+ * points to the data extent.
+ */
+ if (sc->data_extent_gen >
+ btrfs_root_last_snapshot(&sc->root->root_item)) {
+ ret = BACKREF_FOUND_NOT_SHARED;
+ goto out;
+ }
+
+ /*
+ * If we are only determining if a data extent is shared or not
+ * and the corresponding file extent item is located in the same
+ * leaf as the previous file extent item, we can skip resolving
+ * indirect references for a data extent, since the fs tree path
+ * is the same (same leaf, so same path). We skip as long as the
+ * cached result for the leaf is valid and only if there's only
+ * one file extent item pointing to the data extent, because in
+ * the case of multiple file extent items, they may be located
+ * in different leaves and therefore we have multiple paths.
+ */
+ if (sc->ctx->curr_leaf_bytenr == sc->ctx->prev_leaf_bytenr &&
+ sc->self_ref_count == 1) {
+ bool cached;
+ bool is_shared;
+
+ cached = lookup_backref_shared_cache(sc->ctx, sc->root,
+ sc->ctx->curr_leaf_bytenr,
+ 0, &is_shared);
+ if (cached) {
+ if (is_shared)
+ ret = BACKREF_FOUND_SHARED;
+ else
+ ret = BACKREF_FOUND_NOT_SHARED;
+ goto out;
+ }
+ }
+ }
+
btrfs_release_path(path);
- ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0);
+ ret = add_missing_keys(ctx->fs_info, &preftrees, path->skip_locking == 0);
if (ret)
goto out;
WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root));
- ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees,
- extent_item_pos, sc, ignore_offset);
+ ret = resolve_indirect_refs(ctx, path, &preftrees, sc);
if (ret)
goto out;
@@ -1315,25 +1587,22 @@ again:
* e.g. different offsets would not be merged,
* and would retain their original ref->count < 0.
*/
- if (roots && ref->count && ref->root_id && ref->parent == 0) {
- if (sc && sc->root_objectid &&
- ref->root_id != sc->root_objectid) {
- ret = BACKREF_FOUND_SHARED;
- goto out;
- }
-
+ if (ctx->roots && ref->count && ref->root_id && ref->parent == 0) {
/* no parent == root of tree */
- ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
+ ret = ulist_add(ctx->roots, ref->root_id, 0, GFP_NOFS);
if (ret < 0)
goto out;
}
if (ref->count && ref->parent) {
- if (extent_item_pos && !ref->inode_list &&
+ if (!ctx->ignore_extent_item_pos && !ref->inode_list &&
ref->level == 0) {
+ struct btrfs_tree_parent_check check = { 0 };
struct extent_buffer *eb;
- eb = read_tree_block(fs_info, ref->parent, 0,
- 0, ref->level, NULL);
+ check.level = ref->level;
+
+ eb = read_tree_block(ctx->fs_info, ref->parent,
+ &check);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
@@ -1346,21 +1615,27 @@ again:
if (!path->skip_locking)
btrfs_tree_read_lock(eb);
- ret = find_extent_in_eb(eb, bytenr,
- *extent_item_pos, &eie, ignore_offset);
+ ret = find_extent_in_eb(ctx, eb, &eie);
if (!path->skip_locking)
btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
- if (ret < 0)
+ if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP ||
+ ret < 0)
goto out;
ref->inode_list = eie;
+ /*
+ * We transferred the list ownership to the ref,
+ * so set to NULL to avoid a double free in case
+ * an error happens after this.
+ */
+ eie = NULL;
}
- ret = ulist_add_merge_ptr(refs, ref->parent,
+ ret = ulist_add_merge_ptr(ctx->refs, ref->parent,
ref->inode_list,
(void **)&eie, GFP_NOFS);
if (ret < 0)
goto out;
- if (!ret && extent_item_pos) {
+ if (!ret && !ctx->ignore_extent_item_pos) {
/*
* We've recorded that parent, so we must extend
* its inode list here.
@@ -1379,6 +1654,14 @@ again:
eie->next = ref->inode_list;
}
eie = NULL;
+ /*
+ * We have transferred the inode list ownership from
+ * this ref to the ref we added to the 'refs' ulist.
+ * So set this ref's inode list to NULL to avoid
+ * use-after-free when our caller uses it or double
+ * frees in case an error happens before we return.
+ */
+ ref->inode_list = NULL;
}
cond_resched();
}
@@ -1390,52 +1673,36 @@ out:
prelim_release(&preftrees.indirect);
prelim_release(&preftrees.indirect_missing_keys);
- if (ret < 0)
+ if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0)
free_inode_elem_list(eie);
return ret;
}
-static void free_leaf_list(struct ulist *blocks)
-{
- struct ulist_node *node = NULL;
- struct extent_inode_elem *eie;
- struct ulist_iterator uiter;
-
- ULIST_ITER_INIT(&uiter);
- while ((node = ulist_next(blocks, &uiter))) {
- if (!node->aux)
- continue;
- eie = unode_aux_to_inode_list(node);
- free_inode_elem_list(eie);
- node->aux = 0;
- }
-
- ulist_free(blocks);
-}
-
/*
- * Finds all leafs with a reference to the specified combination of bytenr and
- * offset. key_list_head will point to a list of corresponding keys (caller must
- * free each list element). The leafs will be stored in the leafs ulist, which
- * must be freed with ulist_free.
+ * Finds all leaves with a reference to the specified combination of
+ * @ctx->bytenr and @ctx->extent_item_pos. The bytenr of the found leaves are
+ * added to the ulist at @ctx->refs, and that ulist is allocated by this
+ * function. The caller should free the ulist with free_leaf_list() if
+ * @ctx->ignore_extent_item_pos is false, otherwise a fimple ulist_free() is
+ * enough.
*
- * returns 0 on success, <0 on error
+ * Returns 0 on success and < 0 on error. On error @ctx->refs is not allocated.
*/
-int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **leafs,
- const u64 *extent_item_pos, bool ignore_offset)
+int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx)
{
int ret;
- *leafs = ulist_alloc(GFP_NOFS);
- if (!*leafs)
+ ASSERT(ctx->refs == NULL);
+
+ ctx->refs = ulist_alloc(GFP_NOFS);
+ if (!ctx->refs)
return -ENOMEM;
- ret = find_parent_nodes(trans, fs_info, bytenr, time_seq,
- *leafs, NULL, extent_item_pos, NULL, ignore_offset);
- if (ret < 0 && ret != -ENOENT) {
- free_leaf_list(*leafs);
+ ret = find_parent_nodes(ctx, NULL);
+ if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP ||
+ (ret < 0 && ret != -ENOENT)) {
+ free_leaf_list(ctx->refs);
+ ctx->refs = NULL;
return ret;
}
@@ -1443,7 +1710,7 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
}
/*
- * walk all backrefs for a given extent to find all roots that reference this
+ * Walk all backrefs for a given extent to find all roots that reference this
* extent. Walking a backref means finding all extents that reference this
* extent and in turn walk the backrefs of those, too. Naturally this is a
* recursive process, but here it is implemented in an iterative fashion: We
@@ -1451,76 +1718,115 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
* list. In turn, we find all referencing extents for those, further appending
* to the list. The way we iterate the list allows adding more elements after
* the current while iterating. The process stops when we reach the end of the
- * list. Found roots are added to the roots list.
+ * list.
*
- * returns 0 on success, < 0 on error.
+ * Found roots are added to @ctx->roots, which is allocated by this function if
+ * it points to NULL, in which case the caller is responsible for freeing it
+ * after it's not needed anymore.
+ * This function requires @ctx->refs to be NULL, as it uses it for allocating a
+ * ulist to do temporary work, and frees it before returning.
+ *
+ * Returns 0 on success, < 0 on error.
*/
-static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **roots,
- bool ignore_offset)
+static int btrfs_find_all_roots_safe(struct btrfs_backref_walk_ctx *ctx)
{
- struct ulist *tmp;
- struct ulist_node *node = NULL;
+ const u64 orig_bytenr = ctx->bytenr;
+ const bool orig_ignore_extent_item_pos = ctx->ignore_extent_item_pos;
+ bool roots_ulist_allocated = false;
struct ulist_iterator uiter;
- int ret;
+ int ret = 0;
- tmp = ulist_alloc(GFP_NOFS);
- if (!tmp)
- return -ENOMEM;
- *roots = ulist_alloc(GFP_NOFS);
- if (!*roots) {
- ulist_free(tmp);
+ ASSERT(ctx->refs == NULL);
+
+ ctx->refs = ulist_alloc(GFP_NOFS);
+ if (!ctx->refs)
return -ENOMEM;
+
+ if (!ctx->roots) {
+ ctx->roots = ulist_alloc(GFP_NOFS);
+ if (!ctx->roots) {
+ ulist_free(ctx->refs);
+ ctx->refs = NULL;
+ return -ENOMEM;
+ }
+ roots_ulist_allocated = true;
}
+ ctx->ignore_extent_item_pos = true;
+
ULIST_ITER_INIT(&uiter);
while (1) {
- ret = find_parent_nodes(trans, fs_info, bytenr, time_seq,
- tmp, *roots, NULL, NULL, ignore_offset);
+ struct ulist_node *node;
+
+ ret = find_parent_nodes(ctx, NULL);
if (ret < 0 && ret != -ENOENT) {
- ulist_free(tmp);
- ulist_free(*roots);
- *roots = NULL;
- return ret;
+ if (roots_ulist_allocated) {
+ ulist_free(ctx->roots);
+ ctx->roots = NULL;
+ }
+ break;
}
- node = ulist_next(tmp, &uiter);
+ ret = 0;
+ node = ulist_next(ctx->refs, &uiter);
if (!node)
break;
- bytenr = node->val;
+ ctx->bytenr = node->val;
cond_resched();
}
- ulist_free(tmp);
- return 0;
+ ulist_free(ctx->refs);
+ ctx->refs = NULL;
+ ctx->bytenr = orig_bytenr;
+ ctx->ignore_extent_item_pos = orig_ignore_extent_item_pos;
+
+ return ret;
}
-int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **roots,
+int btrfs_find_all_roots(struct btrfs_backref_walk_ctx *ctx,
bool skip_commit_root_sem)
{
int ret;
- if (!trans && !skip_commit_root_sem)
- down_read(&fs_info->commit_root_sem);
- ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr,
- time_seq, roots, false);
- if (!trans && !skip_commit_root_sem)
- up_read(&fs_info->commit_root_sem);
+ if (!ctx->trans && !skip_commit_root_sem)
+ down_read(&ctx->fs_info->commit_root_sem);
+ ret = btrfs_find_all_roots_safe(ctx);
+ if (!ctx->trans && !skip_commit_root_sem)
+ up_read(&ctx->fs_info->commit_root_sem);
return ret;
}
-/**
- * Check if an extent is shared or not
+struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void)
+{
+ struct btrfs_backref_share_check_ctx *ctx;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return NULL;
+
+ ulist_init(&ctx->refs);
+
+ return ctx;
+}
+
+void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx)
+{
+ if (!ctx)
+ return;
+
+ ulist_release(&ctx->refs);
+ kfree(ctx);
+}
+
+/*
+ * Check if a data extent is shared or not.
*
- * @root: root inode belongs to
- * @inum: inode number of the inode whose extent we are checking
- * @bytenr: logical bytenr of the extent we are checking
- * @roots: list of roots this extent is shared among
- * @tmp: temporary list used for iteration
+ * @inode: The inode whose extent we are checking.
+ * @bytenr: Logical bytenr of the extent we are checking.
+ * @extent_gen: Generation of the extent (file extent item) or 0 if it is
+ * not known.
+ * @ctx: A backref sharedness check context.
*
- * btrfs_check_shared uses the backref walking code but will short
+ * btrfs_is_data_extent_shared uses the backref walking code but will short
* circuit as soon as it finds a root or inode that doesn't match the
* one passed in. This provides a significant performance benefit for
* callers (such as fiemap) which want to know whether the extent is
@@ -1531,9 +1837,12 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
*
* Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
*/
-int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
- struct ulist *roots, struct ulist *tmp)
+int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
+ u64 extent_gen,
+ struct btrfs_backref_share_check_ctx *ctx)
{
+ struct btrfs_backref_walk_ctx walk_ctx = { 0 };
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
struct ulist_iterator uiter;
@@ -1541,13 +1850,23 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
int ret = 0;
struct share_check shared = {
- .root_objectid = root->root_key.objectid,
- .inum = inum,
+ .ctx = ctx,
+ .root = root,
+ .inum = btrfs_ino(inode),
+ .data_bytenr = bytenr,
+ .data_extent_gen = extent_gen,
.share_count = 0,
+ .self_ref_count = 0,
+ .have_delayed_delete_refs = false,
};
+ int level;
- ulist_init(roots);
- ulist_init(tmp);
+ for (int i = 0; i < BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; i++) {
+ if (ctx->prev_extents_cache[i].bytenr == bytenr)
+ return ctx->prev_extents_cache[i].is_shared;
+ }
+
+ ulist_init(&ctx->refs);
trans = btrfs_join_transaction_nostart(root);
if (IS_ERR(trans)) {
@@ -1559,28 +1878,88 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
down_read(&fs_info->commit_root_sem);
} else {
btrfs_get_tree_mod_seq(fs_info, &elem);
+ walk_ctx.time_seq = elem.seq;
}
+ walk_ctx.ignore_extent_item_pos = true;
+ walk_ctx.trans = trans;
+ walk_ctx.fs_info = fs_info;
+ walk_ctx.refs = &ctx->refs;
+
+ /* -1 means we are in the bytenr of the data extent. */
+ level = -1;
ULIST_ITER_INIT(&uiter);
+ ctx->use_path_cache = true;
while (1) {
- ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
- roots, NULL, &shared, false);
- if (ret == BACKREF_FOUND_SHARED) {
- /* this is the only condition under which we return 1 */
- ret = 1;
+ bool is_shared;
+ bool cached;
+
+ walk_ctx.bytenr = bytenr;
+ ret = find_parent_nodes(&walk_ctx, &shared);
+ if (ret == BACKREF_FOUND_SHARED ||
+ ret == BACKREF_FOUND_NOT_SHARED) {
+ /* If shared must return 1, otherwise return 0. */
+ ret = (ret == BACKREF_FOUND_SHARED) ? 1 : 0;
+ if (level >= 0)
+ store_backref_shared_cache(ctx, root, bytenr,
+ level, ret == 1);
break;
}
if (ret < 0 && ret != -ENOENT)
break;
ret = 0;
- node = ulist_next(tmp, &uiter);
+
+ /*
+ * If our data extent was not directly shared (without multiple
+ * reference items), than it might have a single reference item
+ * with a count > 1 for the same offset, which means there are 2
+ * (or more) file extent items that point to the data extent -
+ * this happens when a file extent item needs to be split and
+ * then one item gets moved to another leaf due to a b+tree leaf
+ * split when inserting some item. In this case the file extent
+ * items may be located in different leaves and therefore some
+ * of the leaves may be referenced through shared subtrees while
+ * others are not. Since our extent buffer cache only works for
+ * a single path (by far the most common case and simpler to
+ * deal with), we can not use it if we have multiple leaves
+ * (which implies multiple paths).
+ */
+ if (level == -1 && ctx->refs.nnodes > 1)
+ ctx->use_path_cache = false;
+
+ if (level >= 0)
+ store_backref_shared_cache(ctx, root, bytenr,
+ level, false);
+ node = ulist_next(&ctx->refs, &uiter);
if (!node)
break;
bytenr = node->val;
+ level++;
+ cached = lookup_backref_shared_cache(ctx, root, bytenr, level,
+ &is_shared);
+ if (cached) {
+ ret = (is_shared ? 1 : 0);
+ break;
+ }
shared.share_count = 0;
+ shared.have_delayed_delete_refs = false;
cond_resched();
}
+ /*
+ * Cache the sharedness result for the data extent if we know our inode
+ * has more than 1 file extent item that refers to the data extent.
+ */
+ if (ret >= 0 && shared.self_ref_count > 1) {
+ int slot = ctx->prev_extents_cache_slot;
+
+ ctx->prev_extents_cache[slot].bytenr = shared.data_bytenr;
+ ctx->prev_extents_cache[slot].is_shared = (ret == 1);
+
+ slot = (slot + 1) % BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE;
+ ctx->prev_extents_cache_slot = slot;
+ }
+
if (trans) {
btrfs_put_tree_mod_seq(fs_info, &elem);
btrfs_end_transaction(trans);
@@ -1588,8 +1967,9 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
up_read(&fs_info->commit_root_sem);
}
out:
- ulist_release(roots);
- ulist_release(tmp);
+ ulist_release(&ctx->refs);
+ ctx->prev_leaf_bytenr = ctx->curr_leaf_bytenr;
+
return ret;
}
@@ -1936,7 +2316,7 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info,
"ref for %llu resolved, key (%llu EXTEND_DATA %llu), root %llu",
extent_item_objectid, eie->inum,
eie->offset, root);
- ret = iterate(eie->inum, eie->offset, root, ctx);
+ ret = iterate(eie->inum, eie->offset, eie->num_bytes, root, ctx);
if (ret) {
btrfs_debug(fs_info,
"stopping iteration for %llu due to ret=%d",
@@ -1953,82 +2333,128 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info,
* the given parameters.
* when the iterator function returns a non-zero value, iteration stops.
*/
-int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
- u64 extent_item_objectid, u64 extent_item_pos,
- int search_commit_root,
- iterate_extent_inodes_t *iterate, void *ctx,
- bool ignore_offset)
+int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx,
+ bool search_commit_root,
+ iterate_extent_inodes_t *iterate, void *user_ctx)
{
int ret;
- struct btrfs_trans_handle *trans = NULL;
- struct ulist *refs = NULL;
- struct ulist *roots = NULL;
- struct ulist_node *ref_node = NULL;
- struct ulist_node *root_node = NULL;
+ struct ulist *refs;
+ struct ulist_node *ref_node;
struct btrfs_seq_list seq_elem = BTRFS_SEQ_LIST_INIT(seq_elem);
struct ulist_iterator ref_uiter;
- struct ulist_iterator root_uiter;
- btrfs_debug(fs_info, "resolving all inodes for extent %llu",
- extent_item_objectid);
+ btrfs_debug(ctx->fs_info, "resolving all inodes for extent %llu",
+ ctx->bytenr);
+
+ ASSERT(ctx->trans == NULL);
+ ASSERT(ctx->roots == NULL);
if (!search_commit_root) {
- trans = btrfs_attach_transaction(fs_info->tree_root);
+ struct btrfs_trans_handle *trans;
+
+ trans = btrfs_attach_transaction(ctx->fs_info->tree_root);
if (IS_ERR(trans)) {
if (PTR_ERR(trans) != -ENOENT &&
PTR_ERR(trans) != -EROFS)
return PTR_ERR(trans);
trans = NULL;
}
+ ctx->trans = trans;
}
- if (trans)
- btrfs_get_tree_mod_seq(fs_info, &seq_elem);
- else
- down_read(&fs_info->commit_root_sem);
+ if (ctx->trans) {
+ btrfs_get_tree_mod_seq(ctx->fs_info, &seq_elem);
+ ctx->time_seq = seq_elem.seq;
+ } else {
+ down_read(&ctx->fs_info->commit_root_sem);
+ }
- ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
- seq_elem.seq, &refs,
- &extent_item_pos, ignore_offset);
+ ret = btrfs_find_all_leafs(ctx);
if (ret)
goto out;
+ refs = ctx->refs;
+ ctx->refs = NULL;
ULIST_ITER_INIT(&ref_uiter);
while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
- ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val,
- seq_elem.seq, &roots,
- ignore_offset);
+ const u64 leaf_bytenr = ref_node->val;
+ struct ulist_node *root_node;
+ struct ulist_iterator root_uiter;
+ struct extent_inode_elem *inode_list;
+
+ inode_list = (struct extent_inode_elem *)(uintptr_t)ref_node->aux;
+
+ if (ctx->cache_lookup) {
+ const u64 *root_ids;
+ int root_count;
+ bool cached;
+
+ cached = ctx->cache_lookup(leaf_bytenr, ctx->user_ctx,
+ &root_ids, &root_count);
+ if (cached) {
+ for (int i = 0; i < root_count; i++) {
+ ret = iterate_leaf_refs(ctx->fs_info,
+ inode_list,
+ root_ids[i],
+ leaf_bytenr,
+ iterate,
+ user_ctx);
+ if (ret)
+ break;
+ }
+ continue;
+ }
+ }
+
+ if (!ctx->roots) {
+ ctx->roots = ulist_alloc(GFP_NOFS);
+ if (!ctx->roots) {
+ ret = -ENOMEM;
+ break;
+ }
+ }
+
+ ctx->bytenr = leaf_bytenr;
+ ret = btrfs_find_all_roots_safe(ctx);
if (ret)
break;
+
+ if (ctx->cache_store)
+ ctx->cache_store(leaf_bytenr, ctx->roots, ctx->user_ctx);
+
ULIST_ITER_INIT(&root_uiter);
- while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
- btrfs_debug(fs_info,
+ while (!ret && (root_node = ulist_next(ctx->roots, &root_uiter))) {
+ btrfs_debug(ctx->fs_info,
"root %llu references leaf %llu, data list %#llx",
root_node->val, ref_node->val,
ref_node->aux);
- ret = iterate_leaf_refs(fs_info,
- (struct extent_inode_elem *)
- (uintptr_t)ref_node->aux,
- root_node->val,
- extent_item_objectid,
- iterate, ctx);
+ ret = iterate_leaf_refs(ctx->fs_info, inode_list,
+ root_node->val, ctx->bytenr,
+ iterate, user_ctx);
}
- ulist_free(roots);
+ ulist_reinit(ctx->roots);
}
free_leaf_list(refs);
out:
- if (trans) {
- btrfs_put_tree_mod_seq(fs_info, &seq_elem);
- btrfs_end_transaction(trans);
+ if (ctx->trans) {
+ btrfs_put_tree_mod_seq(ctx->fs_info, &seq_elem);
+ btrfs_end_transaction(ctx->trans);
+ ctx->trans = NULL;
} else {
- up_read(&fs_info->commit_root_sem);
+ up_read(&ctx->fs_info->commit_root_sem);
}
+ ulist_free(ctx->roots);
+ ctx->roots = NULL;
+
+ if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP)
+ ret = 0;
+
return ret;
}
-static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
+static int build_ino_list(u64 inum, u64 offset, u64 num_bytes, u64 root, void *ctx)
{
struct btrfs_data_container *inodes = ctx;
const size_t c = 3 * sizeof(u64);
@@ -2052,8 +2478,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
void *ctx, bool ignore_offset)
{
+ struct btrfs_backref_walk_ctx walk_ctx = { 0 };
int ret;
- u64 extent_item_pos;
u64 flags = 0;
struct btrfs_key found_key;
int search_commit_root = path->search_commit_root;
@@ -2065,12 +2491,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
return -EINVAL;
- extent_item_pos = logical - found_key.objectid;
- ret = iterate_extent_inodes(fs_info, found_key.objectid,
- extent_item_pos, search_commit_root,
- build_ino_list, ctx, ignore_offset);
+ walk_ctx.bytenr = found_key.objectid;
+ if (ignore_offset)
+ walk_ctx.ignore_extent_item_pos = true;
+ else
+ walk_ctx.extent_item_pos = logical - found_key.objectid;
+ walk_ctx.fs_info = fs_info;
- return ret;
+ return iterate_extent_inodes(&walk_ctx, search_commit_root,
+ build_ino_list, ctx);
}
static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
@@ -2323,12 +2752,11 @@ void free_ipath(struct inode_fs_paths *ipath)
kfree(ipath);
}
-struct btrfs_backref_iter *btrfs_backref_iter_alloc(
- struct btrfs_fs_info *fs_info, gfp_t gfp_flag)
+struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info)
{
struct btrfs_backref_iter *ret;
- ret = kzalloc(sizeof(*ret), gfp_flag);
+ ret = kzalloc(sizeof(*ret), GFP_NOFS);
if (!ret)
return NULL;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 2759de7d324c..ef6bbea3f456 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -7,18 +7,193 @@
#define BTRFS_BACKREF_H
#include <linux/btrfs.h>
+#include "messages.h"
#include "ulist.h"
#include "disk-io.h"
#include "extent_io.h"
+/*
+ * Used by implementations of iterate_extent_inodes_t (see definition below) to
+ * signal that backref iteration can stop immediately and no error happened.
+ * The value must be non-negative and must not be 0, 1 (which is a common return
+ * value from things like btrfs_search_slot() and used internally in the backref
+ * walking code) and different from BACKREF_FOUND_SHARED and
+ * BACKREF_FOUND_NOT_SHARED
+ */
+#define BTRFS_ITERATE_EXTENT_INODES_STOP 5
+
+/*
+ * Should return 0 if no errors happened and iteration of backrefs should
+ * continue. Can return BTRFS_ITERATE_EXTENT_INODES_STOP or any other non-zero
+ * value to immediately stop iteration and possibly signal an error back to
+ * the caller.
+ */
+typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 num_bytes,
+ u64 root, void *ctx);
+
+/*
+ * Context and arguments for backref walking functions. Some of the fields are
+ * to be filled by the caller of such functions while other are filled by the
+ * functions themselves, as described below.
+ */
+struct btrfs_backref_walk_ctx {
+ /*
+ * The address of the extent for which we are doing backref walking.
+ * Can be either a data extent or a metadata extent.
+ *
+ * Must always be set by the top level caller.
+ */
+ u64 bytenr;
+ /*
+ * Offset relative to the target extent. This is only used for data
+ * extents, and it's meaningful because we can have file extent items
+ * that point only to a section of a data extent ("bookend" extents),
+ * and we want to filter out any that don't point to a section of the
+ * data extent containing the given offset.
+ *
+ * Must always be set by the top level caller.
+ */
+ u64 extent_item_pos;
+ /*
+ * If true and bytenr corresponds to a data extent, then references from
+ * all file extent items that point to the data extent are considered,
+ * @extent_item_pos is ignored.
+ */
+ bool ignore_extent_item_pos;
+ /* A valid transaction handle or NULL. */
+ struct btrfs_trans_handle *trans;
+ /*
+ * The file system's info object, can not be NULL.
+ *
+ * Must always be set by the top level caller.
+ */
+ struct btrfs_fs_info *fs_info;
+ /*
+ * Time sequence acquired from btrfs_get_tree_mod_seq(), in case the
+ * caller joined the tree mod log to get a consistent view of b+trees
+ * while we do backref walking, or BTRFS_SEQ_LAST.
+ * When using BTRFS_SEQ_LAST, delayed refs are not checked and it uses
+ * commit roots when searching b+trees - this is a special case for
+ * qgroups used during a transaction commit.
+ */
+ u64 time_seq;
+ /*
+ * Used to collect the bytenr of metadata extents that point to the
+ * target extent.
+ */
+ struct ulist *refs;
+ /*
+ * List used to collect the IDs of the roots from which the target
+ * extent is accessible. Can be NULL in case the caller does not care
+ * about collecting root IDs.
+ */
+ struct ulist *roots;
+ /*
+ * Used by iterate_extent_inodes() and the main backref walk code
+ * (find_parent_nodes()). Lookup and store functions for an optional
+ * cache which maps the logical address (bytenr) of leaves to an array
+ * of root IDs.
+ */
+ bool (*cache_lookup)(u64 leaf_bytenr, void *user_ctx,
+ const u64 **root_ids_ret, int *root_count_ret);
+ void (*cache_store)(u64 leaf_bytenr, const struct ulist *root_ids,
+ void *user_ctx);
+ /*
+ * If this is not NULL, then the backref walking code will call this
+ * for each indirect data extent reference as soon as it finds one,
+ * before collecting all the remaining backrefs and before resolving
+ * indirect backrefs. This allows for the caller to terminate backref
+ * walking as soon as it finds one backref that matches some specific
+ * criteria. The @cache_lookup and @cache_store callbacks should not
+ * be NULL in order to use this callback.
+ */
+ iterate_extent_inodes_t *indirect_ref_iterator;
+ /*
+ * If this is not NULL, then the backref walking code will call this for
+ * each extent item it's meant to process before it actually starts
+ * processing it. If this returns anything other than 0, then it stops
+ * the backref walking code immediately.
+ */
+ int (*check_extent_item)(u64 bytenr, const struct btrfs_extent_item *ei,
+ const struct extent_buffer *leaf, void *user_ctx);
+ /*
+ * If this is not NULL, then the backref walking code will call this for
+ * each extent data ref it finds (BTRFS_EXTENT_DATA_REF_KEY keys) before
+ * processing that data ref. If this callback return false, then it will
+ * ignore this data ref and it will never resolve the indirect data ref,
+ * saving time searching for leaves in a fs tree with file extent items
+ * matching the data ref.
+ */
+ bool (*skip_data_ref)(u64 root, u64 ino, u64 offset, void *user_ctx);
+ /* Context object to pass to the callbacks defined above. */
+ void *user_ctx;
+};
+
struct inode_fs_paths {
struct btrfs_path *btrfs_path;
struct btrfs_root *fs_root;
struct btrfs_data_container *fspath;
};
-typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
- void *ctx);
+struct btrfs_backref_shared_cache_entry {
+ u64 bytenr;
+ u64 gen;
+ bool is_shared;
+};
+
+#define BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE 8
+
+struct btrfs_backref_share_check_ctx {
+ /* Ulists used during backref walking. */
+ struct ulist refs;
+ /*
+ * The current leaf the caller of btrfs_is_data_extent_shared() is at.
+ * Typically the caller (at the moment only fiemap) tries to determine
+ * the sharedness of data extents point by file extent items from entire
+ * leaves.
+ */
+ u64 curr_leaf_bytenr;
+ /*
+ * The previous leaf the caller was at in the previous call to
+ * btrfs_is_data_extent_shared(). This may be the same as the current
+ * leaf. On the first call it must be 0.
+ */
+ u64 prev_leaf_bytenr;
+ /*
+ * A path from a root to a leaf that has a file extent item pointing to
+ * a given data extent should never exceed the maximum b+tree height.
+ */
+ struct btrfs_backref_shared_cache_entry path_cache_entries[BTRFS_MAX_LEVEL];
+ bool use_path_cache;
+ /*
+ * Cache the sharedness result for the last few extents we have found,
+ * but only for extents for which we have multiple file extent items
+ * that point to them.
+ * It's very common to have several file extent items that point to the
+ * same extent (bytenr) but with different offsets and lengths. This
+ * typically happens for COW writes, partial writes into prealloc
+ * extents, NOCOW writes after snapshoting a root, hole punching or
+ * reflinking within the same file (less common perhaps).
+ * So keep a small cache with the lookup results for the extent pointed
+ * by the last few file extent items. This cache is checked, with a
+ * linear scan, whenever btrfs_is_data_extent_shared() is called, so
+ * it must be small so that it does not negatively affect performance in
+ * case we don't have multiple file extent items that point to the same
+ * data extent.
+ */
+ struct {
+ u64 bytenr;
+ bool is_shared;
+ } prev_extents_cache[BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE];
+ /*
+ * The slot in the prev_extents_cache array that will be used for
+ * storing the sharedness result of a new data extent.
+ */
+ int prev_extents_cache_slot;
+};
+
+struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void);
+void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx);
int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
struct btrfs_path *path, struct btrfs_key *found_key,
@@ -28,11 +203,9 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
struct btrfs_key *key, struct btrfs_extent_item *ei,
u32 item_size, u64 *out_root, u8 *out_level);
-int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
- u64 extent_item_objectid,
- u64 extent_offset, int search_commit_root,
- iterate_extent_inodes_t *iterate, void *ctx,
- bool ignore_offset);
+int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx,
+ bool search_commit_root,
+ iterate_extent_inodes_t *iterate, void *user_ctx);
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
struct btrfs_path *path, void *ctx,
@@ -40,13 +213,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
-int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **leafs,
- const u64 *extent_item_pos, bool ignore_offset);
-int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **roots,
+int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx);
+int btrfs_find_all_roots(struct btrfs_backref_walk_ctx *ctx,
bool skip_commit_root_sem);
char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
u32 name_len, unsigned long name_off,
@@ -62,8 +230,9 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
u64 start_off, struct btrfs_path *path,
struct btrfs_inode_extref **ret_extref,
u64 *found_off);
-int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
- struct ulist *roots, struct ulist *tmp_ulist);
+int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
+ u64 extent_gen,
+ struct btrfs_backref_share_check_ctx *ctx);
int __init btrfs_prelim_ref_init(void);
void __cold btrfs_prelim_ref_exit(void);
@@ -94,8 +263,7 @@ struct btrfs_backref_iter {
u32 end_ptr;
};
-struct btrfs_backref_iter *btrfs_backref_iter_alloc(
- struct btrfs_fs_info *fs_info, gfp_t gfp_flag);
+struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info);
static inline void btrfs_backref_iter_free(struct btrfs_backref_iter *iter)
{
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
new file mode 100644
index 000000000000..b8fb7ef6b520
--- /dev/null
+++ b/fs/btrfs/bio.c
@@ -0,0 +1,381 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ * Copyright (C) 2022 Christoph Hellwig.
+ */
+
+#include <linux/bio.h>
+#include "bio.h"
+#include "ctree.h"
+#include "volumes.h"
+#include "raid56.h"
+#include "async-thread.h"
+#include "check-integrity.h"
+#include "dev-replace.h"
+#include "rcu-string.h"
+#include "zoned.h"
+
+static struct bio_set btrfs_bioset;
+
+/*
+ * Initialize a btrfs_bio structure. This skips the embedded bio itself as it
+ * is already initialized by the block layer.
+ */
+static inline void btrfs_bio_init(struct btrfs_bio *bbio,
+ btrfs_bio_end_io_t end_io, void *private)
+{
+ memset(bbio, 0, offsetof(struct btrfs_bio, bio));
+ bbio->end_io = end_io;
+ bbio->private = private;
+}
+
+/*
+ * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
+ * btrfs, and is used for all I/O submitted through btrfs_submit_bio.
+ *
+ * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
+ * a mempool.
+ */
+struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+ btrfs_bio_end_io_t end_io, void *private)
+{
+ struct bio *bio;
+
+ bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
+ btrfs_bio_init(btrfs_bio(bio), end_io, private);
+ return bio;
+}
+
+struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
+ btrfs_bio_end_io_t end_io, void *private)
+{
+ struct bio *bio;
+ struct btrfs_bio *bbio;
+
+ ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
+
+ bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
+ bbio = btrfs_bio(bio);
+ btrfs_bio_init(bbio, end_io, private);
+
+ bio_trim(bio, offset >> 9, size >> 9);
+ bbio->iter = bio->bi_iter;
+ return bio;
+}
+
+static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
+{
+ if (!dev || !dev->bdev)
+ return;
+ if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
+ return;
+
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE)
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+ if (!(bio->bi_opf & REQ_RAHEAD))
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+ if (bio->bi_opf & REQ_PREFLUSH)
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
+}
+
+static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
+ struct bio *bio)
+{
+ if (bio->bi_opf & REQ_META)
+ return fs_info->endio_meta_workers;
+ return fs_info->endio_workers;
+}
+
+static void btrfs_end_bio_work(struct work_struct *work)
+{
+ struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
+
+ bbio->end_io(bbio);
+}
+
+static void btrfs_simple_end_io(struct bio *bio)
+{
+ struct btrfs_fs_info *fs_info = bio->bi_private;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+
+ btrfs_bio_counter_dec(fs_info);
+
+ if (bio->bi_status)
+ btrfs_log_dev_io_error(bio, bbio->device);
+
+ if (bio_op(bio) == REQ_OP_READ) {
+ INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
+ queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
+ } else {
+ bbio->end_io(bbio);
+ }
+}
+
+static void btrfs_raid56_end_io(struct bio *bio)
+{
+ struct btrfs_io_context *bioc = bio->bi_private;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+
+ btrfs_bio_counter_dec(bioc->fs_info);
+ bbio->mirror_num = bioc->mirror_num;
+ bbio->end_io(bbio);
+
+ btrfs_put_bioc(bioc);
+}
+
+static void btrfs_orig_write_end_io(struct bio *bio)
+{
+ struct btrfs_io_stripe *stripe = bio->bi_private;
+ struct btrfs_io_context *bioc = stripe->bioc;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+
+ btrfs_bio_counter_dec(bioc->fs_info);
+
+ if (bio->bi_status) {
+ atomic_inc(&bioc->error);
+ btrfs_log_dev_io_error(bio, stripe->dev);
+ }
+
+ /*
+ * Only send an error to the higher layers if it is beyond the tolerance
+ * threshold.
+ */
+ if (atomic_read(&bioc->error) > bioc->max_errors)
+ bio->bi_status = BLK_STS_IOERR;
+ else
+ bio->bi_status = BLK_STS_OK;
+
+ bbio->end_io(bbio);
+ btrfs_put_bioc(bioc);
+}
+
+static void btrfs_clone_write_end_io(struct bio *bio)
+{
+ struct btrfs_io_stripe *stripe = bio->bi_private;
+
+ if (bio->bi_status) {
+ atomic_inc(&stripe->bioc->error);
+ btrfs_log_dev_io_error(bio, stripe->dev);
+ }
+
+ /* Pass on control to the original bio this one was cloned from */
+ bio_endio(stripe->bioc->orig_bio);
+ bio_put(bio);
+}
+
+static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
+{
+ if (!dev || !dev->bdev ||
+ test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
+ (btrfs_op(bio) == BTRFS_MAP_WRITE &&
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
+ bio_io_error(bio);
+ return;
+ }
+
+ bio_set_dev(bio, dev->bdev);
+
+ /*
+ * For zone append writing, bi_sector must point the beginning of the
+ * zone
+ */
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+
+ if (btrfs_dev_is_sequential(dev, physical)) {
+ u64 zone_start = round_down(physical,
+ dev->fs_info->zone_size);
+
+ bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
+ } else {
+ bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
+ bio->bi_opf |= REQ_OP_WRITE;
+ }
+ }
+ btrfs_debug_in_rcu(dev->fs_info,
+ "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
+ __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
+ (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
+ dev->devid, bio->bi_iter.bi_size);
+
+ btrfsic_check_bio(bio);
+ submit_bio(bio);
+}
+
+static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
+{
+ struct bio *orig_bio = bioc->orig_bio, *bio;
+
+ ASSERT(bio_op(orig_bio) != REQ_OP_READ);
+
+ /* Reuse the bio embedded into the btrfs_bio for the last mirror */
+ if (dev_nr == bioc->num_stripes - 1) {
+ bio = orig_bio;
+ bio->bi_end_io = btrfs_orig_write_end_io;
+ } else {
+ bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
+ bio_inc_remaining(orig_bio);
+ bio->bi_end_io = btrfs_clone_write_end_io;
+ }
+
+ bio->bi_private = &bioc->stripes[dev_nr];
+ bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
+ bioc->stripes[dev_nr].bioc = bioc;
+ btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
+}
+
+void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num)
+{
+ u64 logical = bio->bi_iter.bi_sector << 9;
+ u64 length = bio->bi_iter.bi_size;
+ u64 map_length = length;
+ struct btrfs_io_context *bioc = NULL;
+ struct btrfs_io_stripe smap;
+ int ret;
+
+ btrfs_bio_counter_inc_blocked(fs_info);
+ ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
+ &bioc, &smap, &mirror_num, 1);
+ if (ret) {
+ btrfs_bio_counter_dec(fs_info);
+ btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
+ return;
+ }
+
+ if (map_length < length) {
+ btrfs_crit(fs_info,
+ "mapping failed logical %llu bio len %llu len %llu",
+ logical, length, map_length);
+ BUG();
+ }
+
+ if (!bioc) {
+ /* Single mirror read/write fast path */
+ btrfs_bio(bio)->mirror_num = mirror_num;
+ btrfs_bio(bio)->device = smap.dev;
+ bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
+ bio->bi_private = fs_info;
+ bio->bi_end_io = btrfs_simple_end_io;
+ btrfs_submit_dev_bio(smap.dev, bio);
+ } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ /* Parity RAID write or read recovery */
+ bio->bi_private = bioc;
+ bio->bi_end_io = btrfs_raid56_end_io;
+ if (bio_op(bio) == REQ_OP_READ)
+ raid56_parity_recover(bio, bioc, mirror_num);
+ else
+ raid56_parity_write(bio, bioc);
+ } else {
+ /* Write to multiple mirrors */
+ int total_devs = bioc->num_stripes;
+ int dev_nr;
+
+ bioc->orig_bio = bio;
+ for (dev_nr = 0; dev_nr < total_devs; dev_nr++)
+ btrfs_submit_mirrored_bio(bioc, dev_nr);
+ }
+}
+
+/*
+ * Submit a repair write.
+ *
+ * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a
+ * RAID setup. Here we only want to write the one bad copy, so we do the
+ * mapping ourselves and submit the bio directly.
+ *
+ * The I/O is issued sychronously to block the repair read completion from
+ * freeing the bio.
+ */
+int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
+ u64 length, u64 logical, struct page *page,
+ unsigned int pg_offset, int mirror_num)
+{
+ struct btrfs_device *dev;
+ struct bio_vec bvec;
+ struct bio bio;
+ u64 map_length = 0;
+ u64 sector;
+ struct btrfs_io_context *bioc = NULL;
+ int ret = 0;
+
+ ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
+ BUG_ON(!mirror_num);
+
+ if (btrfs_repair_one_zone(fs_info, logical))
+ return 0;
+
+ map_length = length;
+
+ /*
+ * Avoid races with device replace and make sure our bioc has devices
+ * associated to its stripes that don't go away while we are doing the
+ * read repair operation.
+ */
+ btrfs_bio_counter_inc_blocked(fs_info);
+ if (btrfs_is_parity_mirror(fs_info, logical, length)) {
+ /*
+ * Note that we don't use BTRFS_MAP_WRITE because it's supposed
+ * to update all raid stripes, but here we just want to correct
+ * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
+ * stripe's dev and sector.
+ */
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
+ &map_length, &bioc, 0);
+ if (ret)
+ goto out_counter_dec;
+ ASSERT(bioc->mirror_num == 1);
+ } else {
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
+ &map_length, &bioc, mirror_num);
+ if (ret)
+ goto out_counter_dec;
+ BUG_ON(mirror_num != bioc->mirror_num);
+ }
+
+ sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
+ dev = bioc->stripes[bioc->mirror_num - 1].dev;
+ btrfs_put_bioc(bioc);
+
+ if (!dev || !dev->bdev ||
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
+ ret = -EIO;
+ goto out_counter_dec;
+ }
+
+ bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
+ bio.bi_iter.bi_sector = sector;
+ __bio_add_page(&bio, page, length, pg_offset);
+
+ btrfsic_check_bio(&bio);
+ ret = submit_bio_wait(&bio);
+ if (ret) {
+ /* try to remap that extent elsewhere? */
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+ goto out_bio_uninit;
+ }
+
+ btrfs_info_rl_in_rcu(fs_info,
+ "read error corrected: ino %llu off %llu (dev %s sector %llu)",
+ ino, start, btrfs_dev_name(dev), sector);
+ ret = 0;
+
+out_bio_uninit:
+ bio_uninit(&bio);
+out_counter_dec:
+ btrfs_bio_counter_dec(fs_info);
+ return ret;
+}
+
+int __init btrfs_bioset_init(void)
+{
+ if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
+ offsetof(struct btrfs_bio, bio),
+ BIOSET_NEED_BVECS))
+ return -ENOMEM;
+ return 0;
+}
+
+void __cold btrfs_bioset_exit(void)
+{
+ bioset_exit(&btrfs_bioset);
+}
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
new file mode 100644
index 000000000000..b12f84b3b341
--- /dev/null
+++ b/fs/btrfs/bio.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ * Copyright (C) 2022 Christoph Hellwig.
+ */
+
+#ifndef BTRFS_BIO_H
+#define BTRFS_BIO_H
+
+#include <linux/bio.h>
+#include <linux/workqueue.h>
+#include "tree-checker.h"
+
+struct btrfs_bio;
+struct btrfs_fs_info;
+
+#define BTRFS_BIO_INLINE_CSUM_SIZE 64
+
+/*
+ * Maximum number of sectors for a single bio to limit the size of the
+ * checksum array. This matches the number of bio_vecs per bio and thus the
+ * I/O size for buffered I/O.
+ */
+#define BTRFS_MAX_BIO_SECTORS (256)
+
+typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
+
+/*
+ * Additional info to pass along bio.
+ *
+ * Mostly for btrfs specific features like csum and mirror_num.
+ */
+struct btrfs_bio {
+ unsigned int mirror_num:7;
+
+ /*
+ * Extra indicator for metadata bios.
+ * For some btrfs bios they use pages without a mapping, thus
+ * we can not rely on page->mapping->host to determine if
+ * it's a metadata bio.
+ */
+ unsigned int is_metadata:1;
+ struct bvec_iter iter;
+
+ /* for direct I/O */
+ u64 file_offset;
+
+ /* @device is for stripe IO submission. */
+ struct btrfs_device *device;
+ union {
+ /* For data checksum verification. */
+ struct {
+ u8 *csum;
+ u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
+ };
+
+ /* For metadata parentness verification. */
+ struct btrfs_tree_parent_check parent_check;
+ };
+
+ /* End I/O information supplied to btrfs_bio_alloc */
+ btrfs_bio_end_io_t end_io;
+ void *private;
+
+ /* For read end I/O handling */
+ struct work_struct end_io_work;
+
+ /*
+ * This member must come last, bio_alloc_bioset will allocate enough
+ * bytes for entire btrfs_bio but relies on bio being last.
+ */
+ struct bio bio;
+};
+
+static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
+{
+ return container_of(bio, struct btrfs_bio, bio);
+}
+
+int __init btrfs_bioset_init(void);
+void __cold btrfs_bioset_exit(void);
+
+struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+ btrfs_bio_end_io_t end_io, void *private);
+struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
+ btrfs_bio_end_io_t end_io, void *private);
+
+
+static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
+{
+ bbio->bio.bi_status = status;
+ bbio->end_io(bbio);
+}
+
+static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
+{
+ if (bbio->is_metadata)
+ return;
+ if (bbio->csum != bbio->csum_inline) {
+ kfree(bbio->csum);
+ bbio->csum = NULL;
+ }
+}
+
+/*
+ * Iterate through a btrfs_bio (@bbio) on a per-sector basis.
+ *
+ * bvl - struct bio_vec
+ * bbio - struct btrfs_bio
+ * iters - struct bvec_iter
+ * bio_offset - unsigned int
+ */
+#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \
+ for ((iter) = (bbio)->iter, (bio_offset) = 0; \
+ (iter).bi_size && \
+ (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \
+ (bio_offset) += fs_info->sectorsize, \
+ bio_advance_iter_single(&(bbio)->bio, &(iter), \
+ (fs_info)->sectorsize))
+
+void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
+ int mirror_num);
+int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
+ u64 length, u64 logical, struct page *page,
+ unsigned int pg_offset, int mirror_num);
+
+#endif
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index e0375ba9d0fe..708d843daa72 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -17,6 +17,21 @@
#include "discard.h"
#include "raid56.h"
#include "zoned.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+
+#ifdef CONFIG_BTRFS_DEBUG
+int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+
+ return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
+ block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+ (btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
+ block_group->flags & BTRFS_BLOCK_GROUP_DATA);
+}
+#endif
/*
* Return target flags in extended format or 0 if restripe for this chunk_type
@@ -284,7 +299,7 @@ struct btrfs_block_group *btrfs_next_block_group(
return cache;
}
-/**
+/*
* Check if we can do a NOCOW write for a given extent.
*
* @fs_info: The filesystem information object.
@@ -325,11 +340,9 @@ struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
return bg;
}
-/**
+/*
* Decrement the number of NOCOW writers in a block group.
*
- * @bg: The block group.
- *
* This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
* and on the block group returned by that call. Typically this is called after
* creating an ordered extent for a NOCOW write, to prevent races with scrub and
@@ -593,8 +606,6 @@ next:
if (need_resched() ||
rwsem_is_contended(&fs_info->commit_root_sem)) {
- if (wakeup)
- caching_ctl->progress = last;
btrfs_release_path(path);
up_read(&fs_info->commit_root_sem);
mutex_unlock(&caching_ctl->mutex);
@@ -618,9 +629,6 @@ next:
key.objectid = last;
key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
-
- if (wakeup)
- caching_ctl->progress = last;
btrfs_release_path(path);
goto next;
}
@@ -655,7 +663,6 @@ next:
total_found += add_new_free_space(block_group, last,
block_group->start + block_group->length);
- caching_ctl->progress = (u64)-1;
out:
btrfs_free_path(path);
@@ -725,8 +732,6 @@ done:
}
#endif
- caching_ctl->progress = (u64)-1;
-
up_read(&fs_info->commit_root_sem);
btrfs_free_excluded_extents(block_group);
mutex_unlock(&caching_ctl->mutex);
@@ -755,7 +760,6 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
mutex_init(&caching_ctl->mutex);
init_waitqueue_head(&caching_ctl->wait);
caching_ctl->block_group = cache;
- caching_ctl->progress = cache->start;
refcount_set(&caching_ctl->count, 2);
btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
@@ -772,7 +776,6 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
WARN_ON(cache->caching_ctl);
cache->caching_ctl = caching_ctl;
cache->cached = BTRFS_CACHE_STARTED;
- cache->has_caching_ctl = 1;
spin_unlock(&cache->lock);
write_lock(&fs_info->block_group_cache_lock);
@@ -988,32 +991,31 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
kobject_put(kobj);
}
- if (block_group->has_caching_ctl)
- caching_ctl = btrfs_get_caching_control(block_group);
if (block_group->cached == BTRFS_CACHE_STARTED)
btrfs_wait_block_group_cache_done(block_group);
- if (block_group->has_caching_ctl) {
- write_lock(&fs_info->block_group_cache_lock);
- if (!caching_ctl) {
- struct btrfs_caching_control *ctl;
-
- list_for_each_entry(ctl,
- &fs_info->caching_block_groups, list)
- if (ctl->block_group == block_group) {
- caching_ctl = ctl;
- refcount_inc(&caching_ctl->count);
- break;
- }
- }
- if (caching_ctl)
- list_del_init(&caching_ctl->list);
- write_unlock(&fs_info->block_group_cache_lock);
- if (caching_ctl) {
- /* Once for the caching bgs list and once for us. */
- btrfs_put_caching_control(caching_ctl);
- btrfs_put_caching_control(caching_ctl);
+
+ write_lock(&fs_info->block_group_cache_lock);
+ caching_ctl = btrfs_get_caching_control(block_group);
+ if (!caching_ctl) {
+ struct btrfs_caching_control *ctl;
+
+ list_for_each_entry(ctl, &fs_info->caching_block_groups, list) {
+ if (ctl->block_group == block_group) {
+ caching_ctl = ctl;
+ refcount_inc(&caching_ctl->count);
+ break;
+ }
}
}
+ if (caching_ctl)
+ list_del_init(&caching_ctl->list);
+ write_unlock(&fs_info->block_group_cache_lock);
+
+ if (caching_ctl) {
+ /* Once for the caching bgs list and once for us. */
+ btrfs_put_caching_control(caching_ctl);
+ btrfs_put_caching_control(caching_ctl);
+ }
spin_lock(&trans->transaction->dirty_bgs_lock);
WARN_ON(!list_empty(&block_group->dirty_list));
@@ -1034,12 +1036,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
< block_group->zone_unusable);
WARN_ON(block_group->space_info->disk_total
< block_group->length * factor);
- WARN_ON(block_group->zone_is_active &&
+ WARN_ON(test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &block_group->runtime_flags) &&
block_group->space_info->active_total_bytes
< block_group->length);
}
block_group->space_info->total_bytes -= block_group->length;
- if (block_group->zone_is_active)
+ if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
block_group->space_info->active_total_bytes -= block_group->length;
block_group->space_info->bytes_readonly -=
(block_group->length - block_group->zone_unusable);
@@ -1069,7 +1072,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
goto out;
spin_lock(&block_group->lock);
- block_group->removed = 1;
+ set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);
+
/*
* At this point trimming or scrub can't start on this block group,
* because we removed the block group from the rbtree
@@ -1304,6 +1308,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
+ if (btrfs_fs_closing(fs_info))
+ return;
+
/*
* Long running balances can keep us blocked here for eternity, so
* simply skip deletion if we're unable to get the mutex.
@@ -1533,6 +1540,30 @@ static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
return true;
}
+static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed)
+{
+ const struct btrfs_space_info *space_info = bg->space_info;
+ const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
+ const u64 new_val = bg->used;
+ const u64 old_val = new_val + bytes_freed;
+ u64 thresh;
+
+ if (reclaim_thresh == 0)
+ return false;
+
+ thresh = mult_perc(bg->length, reclaim_thresh);
+
+ /*
+ * If we were below the threshold before don't reclaim, we are likely a
+ * brand new block group and we don't want to relocate new block groups.
+ */
+ if (old_val < thresh)
+ return false;
+ if (new_val >= thresh)
+ return false;
+ return true;
+}
+
void btrfs_reclaim_bgs_work(struct work_struct *work)
{
struct btrfs_fs_info *fs_info =
@@ -1543,6 +1574,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
+ if (btrfs_fs_closing(fs_info))
+ return;
+
if (!btrfs_should_reclaim(fs_info))
return;
@@ -1597,6 +1631,40 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
up_write(&space_info->groups_sem);
goto next;
}
+ if (bg->used == 0) {
+ /*
+ * It is possible that we trigger relocation on a block
+ * group as its extents are deleted and it first goes
+ * below the threshold, then shortly after goes empty.
+ *
+ * In this case, relocating it does delete it, but has
+ * some overhead in relocation specific metadata, looking
+ * for the non-existent extents and running some extra
+ * transactions, which we can avoid by using one of the
+ * other mechanisms for dealing with empty block groups.
+ */
+ if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ btrfs_mark_bg_unused(bg);
+ spin_unlock(&bg->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+
+ }
+ /*
+ * The block group might no longer meet the reclaim condition by
+ * the time we get around to reclaiming it, so to avoid
+ * reclaiming overly full block_groups, skip reclaiming them.
+ *
+ * Since the decision making process also depends on the amount
+ * being freed, pass in a fake giant value to skip that extra
+ * check, which is more meaningful when adding to the list in
+ * the first place.
+ */
+ if (!should_reclaim_block_group(bg, bg->length)) {
+ spin_unlock(&bg->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
spin_unlock(&bg->lock);
/* Get out fast, in case we're unmounting the filesystem */
@@ -1743,8 +1811,8 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
write_sequnlock(&fs_info->profiles_lock);
}
-/**
- * Map a physical disk address to a list of logical addresses
+/*
+ * Map a physical disk address to a list of logical addresses.
*
* @fs_info: the filesystem
* @chunk_start: logical address of block group
@@ -1890,16 +1958,6 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
return 0;
}
-static void link_block_group(struct btrfs_block_group *cache)
-{
- struct btrfs_space_info *space_info = cache->space_info;
- int index = btrfs_bg_flags_to_raid_index(cache->flags);
-
- down_write(&space_info->groups_sem);
- list_add_tail(&cache->list, &space_info->block_groups[index]);
- up_write(&space_info->groups_sem);
-}
-
static struct btrfs_block_group *btrfs_create_block_group_cache(
struct btrfs_fs_info *fs_info, u64 start)
{
@@ -1937,7 +1995,8 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
atomic_set(&cache->frozen, 0);
mutex_init(&cache->free_space_lock);
- btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
+ cache->full_stripe_locks_root.root = RB_ROOT;
+ mutex_init(&cache->full_stripe_locks_root.lock);
return cache;
}
@@ -2002,7 +2061,6 @@ static int read_one_block_group(struct btrfs_fs_info *info,
int need_clear)
{
struct btrfs_block_group *cache;
- struct btrfs_space_info *space_info;
const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
int ret;
@@ -2014,6 +2072,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
cache->length = key->offset;
cache->used = btrfs_stack_block_group_used(bgi);
+ cache->commit_used = cache->used;
cache->flags = btrfs_stack_block_group_flags(bgi);
cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
@@ -2078,11 +2137,9 @@ static int read_one_block_group(struct btrfs_fs_info *info,
/* Should not have any excluded extents. Just in case, though. */
btrfs_free_excluded_extents(cache);
} else if (cache->length == cache->used) {
- cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
btrfs_free_excluded_extents(cache);
} else if (cache->used == 0) {
- cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
add_new_free_space(cache, cache->start,
cache->start + cache->length);
@@ -2095,14 +2152,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
goto error;
}
trace_btrfs_add_block_group(info, cache, 0);
- btrfs_update_space_info(info, cache->flags, cache->length,
- cache->used, cache->bytes_super,
- cache->zone_unusable, cache->zone_is_active,
- &space_info);
-
- cache->space_info = space_info;
-
- link_block_group(cache);
+ btrfs_add_bg_to_space_info(info, cache);
set_avail_alloc_bits(info, cache->flags);
if (btrfs_chunk_writeable(info, cache->start)) {
@@ -2126,7 +2176,6 @@ error:
static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct btrfs_space_info *space_info;
struct rb_node *node;
int ret = 0;
@@ -2146,7 +2195,6 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
/* Fill dummy cache as FULL */
bg->length = em->len;
bg->flags = map->type;
- bg->last_byte_to_unpin = (u64)-1;
bg->cached = BTRFS_CACHE_FINISHED;
bg->used = em->len;
bg->flags = map->type;
@@ -2167,10 +2215,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
break;
}
- btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
- 0, 0, false, &space_info);
- bg->space_info = space_info;
- link_block_group(bg);
+ btrfs_add_bg_to_space_info(fs_info, bg);
set_avail_alloc_bits(fs_info, bg->flags);
}
@@ -2190,7 +2235,16 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
int need_clear = 0;
u64 cache_gen;
- if (!root)
+ /*
+ * Either no extent root (with ibadroots rescue option) or we have
+ * unsupported RO options. The fs can never be mounted read-write, so no
+ * need to waste time searching block group items.
+ *
+ * This also allows new extent tree related changes to be RO compat,
+ * no need for a full incompat flag.
+ */
+ if (!root || (btrfs_super_compat_ro_flags(info->super_copy) &
+ ~BTRFS_FEATURE_COMPAT_RO_SUPP))
return fill_dummy_bgs(info);
key.objectid = 0;
@@ -2425,7 +2479,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
ret = insert_block_group_item(trans, block_group);
if (ret)
btrfs_abort_transaction(trans, ret);
- if (!block_group->chunk_item_inserted) {
+ if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
+ &block_group->runtime_flags)) {
mutex_lock(&fs_info->chunk_mutex);
ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
mutex_unlock(&fs_info->chunk_mutex);
@@ -2494,12 +2549,11 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
set_free_space_tree_thresholds(cache);
cache->used = bytes_used;
cache->flags = type;
- cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
- cache->needs_free_space = 1;
+ set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags);
ret = btrfs_load_block_group_zone_info(cache, true);
if (ret) {
@@ -2519,14 +2573,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
btrfs_free_excluded_extents(cache);
-#ifdef CONFIG_BTRFS_DEBUG
- if (btrfs_should_fragment_free_space(cache)) {
- u64 new_bytes_used = size - bytes_used;
-
- bytes_used += new_bytes_used >> 1;
- fragment_free_space(cache);
- }
-#endif
/*
* Ensure the corresponding space_info object is created and
* assigned to our block group. We want our bg to be added to the rbtree
@@ -2547,12 +2593,17 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
* the rbtree, update the space info's counters.
*/
trace_btrfs_add_block_group(fs_info, cache, 1);
- btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
- cache->bytes_super, cache->zone_unusable,
- cache->zone_is_active, &cache->space_info);
+ btrfs_add_bg_to_space_info(fs_info, cache);
btrfs_update_global_block_rsv(fs_info);
- link_block_group(cache);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(cache)) {
+ u64 new_bytes_used = size - bytes_used;
+
+ cache->space_info->bytes_used += new_bytes_used >> 1;
+ fragment_free_space(cache);
+ }
+#endif
list_add_tail(&cache->bg_list, &trans->new_bgs);
trans->delayed_ref_updates++;
@@ -2713,6 +2764,25 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_block_group_item bgi;
struct btrfs_key key;
+ u64 old_commit_used;
+ u64 used;
+
+ /*
+ * Block group items update can be triggered out of commit transaction
+ * critical section, thus we need a consistent view of used bytes.
+ * We cannot use cache->used directly outside of the spin lock, as it
+ * may be changed.
+ */
+ spin_lock(&cache->lock);
+ old_commit_used = cache->commit_used;
+ used = cache->used;
+ /* No change in used bytes, can safely skip it. */
+ if (cache->commit_used == used) {
+ spin_unlock(&cache->lock);
+ return 0;
+ }
+ cache->commit_used = used;
+ spin_unlock(&cache->lock);
key.objectid = cache->start;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
@@ -2727,7 +2797,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
- btrfs_set_stack_block_group_used(&bgi, cache->used);
+ btrfs_set_stack_block_group_used(&bgi, used);
btrfs_set_stack_block_group_chunk_objectid(&bgi,
cache->global_root_id);
btrfs_set_stack_block_group_flags(&bgi, cache->flags);
@@ -2735,6 +2805,12 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
fail:
btrfs_release_path(path);
+ /* We didn't update the block group item, need to revert @commit_used. */
+ if (ret < 0) {
+ spin_lock(&cache->lock);
+ cache->commit_used = old_commit_used;
+ spin_unlock(&cache->lock);
+ }
return ret;
}
@@ -2869,7 +2945,7 @@ again:
cache_size *= fs_info->sectorsize;
ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
- cache_size);
+ cache_size, false);
if (ret)
goto out_put;
@@ -3232,31 +3308,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
return ret;
}
-static inline bool should_reclaim_block_group(struct btrfs_block_group *bg,
- u64 bytes_freed)
-{
- const struct btrfs_space_info *space_info = bg->space_info;
- const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
- const u64 new_val = bg->used;
- const u64 old_val = new_val + bytes_freed;
- u64 thresh;
-
- if (reclaim_thresh == 0)
- return false;
-
- thresh = div_factor_fine(bg->length, reclaim_thresh);
-
- /*
- * If we were below the threshold before don't reclaim, we are likely a
- * brand new block group and we don't want to relocate new block groups.
- */
- if (old_val < thresh)
- return false;
- if (new_val >= thresh)
- return false;
- return true;
-}
-
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, bool alloc)
{
@@ -3368,8 +3419,9 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
return ret;
}
-/**
- * btrfs_add_reserved_bytes - update the block_group and space info counters
+/*
+ * Update the block_group and space info counters.
+ *
* @cache: The cache we are manipulating
* @ram_bytes: The number of bytes of file content, and will be same to
* @num_bytes except for the compress path.
@@ -3412,8 +3464,9 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
return ret;
}
-/**
- * btrfs_free_reserved_bytes - update the block_group and space info counters
+/*
+ * Update the block_group and space info counters.
+ *
* @cache: The cache we are manipulating
* @num_bytes: The number of bytes in question
* @delalloc: The blocks are allocated for the delalloc write
@@ -3470,13 +3523,13 @@ static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
*/
if (force == CHUNK_ALLOC_LIMITED) {
thresh = btrfs_super_total_bytes(fs_info->super_copy);
- thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
+ thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1));
if (sinfo->total_bytes - bytes_used < thresh)
return 1;
}
- if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
+ if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80))
return 0;
return 1;
}
@@ -3965,35 +4018,24 @@ void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
struct btrfs_block_group *block_group;
- u64 last = 0;
- while (1) {
- struct inode *inode;
+ block_group = btrfs_lookup_first_block_group(info, 0);
+ while (block_group) {
+ btrfs_wait_block_group_cache_done(block_group);
+ spin_lock(&block_group->lock);
+ if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
+ &block_group->runtime_flags)) {
+ struct inode *inode = block_group->inode;
- block_group = btrfs_lookup_first_block_group(info, last);
- while (block_group) {
- btrfs_wait_block_group_cache_done(block_group);
- spin_lock(&block_group->lock);
- if (block_group->iref)
- break;
+ block_group->inode = NULL;
spin_unlock(&block_group->lock);
- block_group = btrfs_next_block_group(block_group);
- }
- if (!block_group) {
- if (last == 0)
- break;
- last = 0;
- continue;
- }
- inode = block_group->inode;
- block_group->iref = 0;
- block_group->inode = NULL;
- spin_unlock(&block_group->lock);
- ASSERT(block_group->io_ctl.inode == NULL);
- iput(inode);
- last = block_group->start + block_group->length;
- btrfs_put_block_group(block_group);
+ ASSERT(block_group->io_ctl.inode == NULL);
+ iput(inode);
+ } else {
+ spin_unlock(&block_group->lock);
+ }
+ block_group = btrfs_next_block_group(block_group);
}
}
@@ -4129,7 +4171,7 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
spin_lock(&block_group->lock);
cleanup = (atomic_dec_and_test(&block_group->frozen) &&
- block_group->removed);
+ test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags));
spin_unlock(&block_group->lock);
if (cleanup) {
@@ -4150,7 +4192,7 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
* tasks trimming this block group have left 1 entry each one.
* Free them if any.
*/
- __btrfs_remove_free_space_cache(block_group->free_space_ctl);
+ btrfs_remove_free_space_cache(block_group);
}
}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 6b3cdc4cbc41..a02ea76fd6cf 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -46,19 +46,48 @@ enum btrfs_chunk_alloc_enum {
CHUNK_ALLOC_FORCE_FOR_EXTENT,
};
+/* Block group flags set at runtime */
+enum btrfs_block_group_flags {
+ BLOCK_GROUP_FLAG_IREF,
+ BLOCK_GROUP_FLAG_REMOVED,
+ BLOCK_GROUP_FLAG_TO_COPY,
+ BLOCK_GROUP_FLAG_RELOCATING_REPAIR,
+ BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
+ BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
+ /* Does the block group need to be added to the free space tree? */
+ BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
+ /* Indicate that the block group is placed on a sequential zone */
+ BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE,
+};
+
+enum btrfs_caching_type {
+ BTRFS_CACHE_NO,
+ BTRFS_CACHE_STARTED,
+ BTRFS_CACHE_FINISHED,
+ BTRFS_CACHE_ERROR,
+};
+
struct btrfs_caching_control {
struct list_head list;
struct mutex mutex;
wait_queue_head_t wait;
struct btrfs_work work;
struct btrfs_block_group *block_group;
- u64 progress;
refcount_t count;
};
/* Once caching_thread() finds this much free space, it will wake up waiters. */
#define CACHING_CTL_WAKE_UP SZ_2M
+/*
+ * Tree to record all locked full stripes of a RAID5/6 block group
+ */
+struct btrfs_full_stripe_locks_tree {
+ struct rb_root root;
+ struct mutex lock;
+};
+
struct btrfs_block_group {
struct btrfs_fs_info *fs_info;
struct inode *inode;
@@ -75,6 +104,12 @@ struct btrfs_block_group {
u64 global_root_id;
/*
+ * The last committed used bytes of this block group, if the above @used
+ * is still the same as @commit_used, we don't need to update block
+ * group item of this block group.
+ */
+ u64 commit_used;
+ /*
* If the free space extent count exceeds this number, convert the block
* group to bitmaps.
*/
@@ -95,23 +130,15 @@ struct btrfs_block_group {
/* For raid56, this is a full stripe, without parity */
unsigned long full_stripe_len;
+ unsigned long runtime_flags;
unsigned int ro;
- unsigned int iref:1;
- unsigned int has_caching_ctl:1;
- unsigned int removed:1;
- unsigned int to_copy:1;
- unsigned int relocating_repair:1;
- unsigned int chunk_item_inserted:1;
- unsigned int zone_is_active:1;
- unsigned int zoned_data_reloc_ongoing:1;
int disk_cache_state;
/* Cache tracking stuff */
int cached;
struct btrfs_caching_control *caching_ctl;
- u64 last_byte_to_unpin;
struct btrfs_space_info *space_info;
@@ -186,15 +213,6 @@ struct btrfs_block_group {
struct mutex free_space_lock;
/*
- * Does the block group need to be added to the free space tree?
- * Protected by free_space_lock.
- */
- int needs_free_space;
-
- /* Flag indicating this block group is placed on a sequential zone */
- bool seq_zone;
-
- /*
* Number of extents in this block group used for swap files.
* All accesses protected by the spinlock 'lock'.
*/
@@ -234,16 +252,7 @@ static inline bool btrfs_is_block_group_data_only(
}
#ifdef CONFIG_BTRFS_DEBUG
-static inline int btrfs_should_fragment_free_space(
- struct btrfs_block_group *block_group)
-{
- struct btrfs_fs_info *fs_info = block_group->fs_info;
-
- return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
- block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
- (btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
- block_group->flags & BTRFS_BLOCK_GROUP_DATA);
-}
+int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group);
#endif
struct btrfs_block_group *btrfs_lookup_first_block_group(
@@ -305,8 +314,6 @@ void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
-void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
- struct btrfs_caching_control *caching_ctl);
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
struct block_device *bdev, u64 physical, u64 **logical,
int *naddrs, int *stripe_len);
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 06be0644dd37..5367a14d44d2 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -7,6 +7,8 @@
#include "transaction.h"
#include "block-group.h"
#include "disk-io.h"
+#include "fs.h"
+#include "accessors.h"
/*
* HOW DO BLOCK RESERVES WORK
@@ -225,7 +227,7 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
return ret;
}
-int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
+int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent)
{
u64 num_bytes = 0;
int ret = -ENOSPC;
@@ -234,7 +236,7 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
return 0;
spin_lock(&block_rsv->lock);
- num_bytes = div_factor(block_rsv->size, min_factor);
+ num_bytes = mult_perc(block_rsv->size, min_percent);
if (block_rsv->reserved >= num_bytes)
ret = 0;
spin_unlock(&block_rsv->lock);
@@ -286,7 +288,7 @@ u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
*/
if (block_rsv == delayed_rsv)
target = global_rsv;
- else if (block_rsv != global_rsv && !delayed_rsv->full)
+ else if (block_rsv != global_rsv && !btrfs_block_rsv_full(delayed_rsv))
target = delayed_rsv;
if (target && block_rsv->space_info != target->space_info)
@@ -323,31 +325,6 @@ void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
spin_unlock(&block_rsv->lock);
}
-int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *dest, u64 num_bytes,
- int min_factor)
-{
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
- u64 min_bytes;
-
- if (global_rsv->space_info != dest->space_info)
- return -ENOSPC;
-
- spin_lock(&global_rsv->lock);
- min_bytes = div_factor(global_rsv->size, min_factor);
- if (global_rsv->reserved < min_bytes + num_bytes) {
- spin_unlock(&global_rsv->lock);
- return -ENOSPC;
- }
- global_rsv->reserved -= num_bytes;
- if (global_rsv->reserved < global_rsv->size)
- global_rsv->full = false;
- spin_unlock(&global_rsv->lock);
-
- btrfs_block_rsv_add_bytes(dest, num_bytes, true);
- return 0;
-}
-
void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
{
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
@@ -424,6 +401,7 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root)
case BTRFS_CSUM_TREE_OBJECTID:
case BTRFS_EXTENT_TREE_OBJECTID:
case BTRFS_FREE_SPACE_TREE_OBJECTID:
+ case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
root->block_rsv = &fs_info->delayed_refs_rsv;
break;
case BTRFS_ROOT_TREE_OBJECTID:
@@ -551,5 +529,17 @@ try_reserve:
if (!ret)
return global_rsv;
}
+
+ /*
+ * All hope is lost, but of course our reservations are overly
+ * pessimistic, so instead of possibly having an ENOSPC abort here, try
+ * one last time to force a reservation if there's enough actual space
+ * on disk to make the reservation.
+ */
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize,
+ BTRFS_RESERVE_FLUSH_EMERGENCY);
+ if (!ret)
+ return block_rsv;
+
return ERR_PTR(ret);
}
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index 0c183709be00..4cc41c9aaa82 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -4,6 +4,7 @@
#define BTRFS_BLOCK_RSV_H
struct btrfs_trans_handle;
+struct btrfs_root;
enum btrfs_reserve_flush_enum;
/*
@@ -62,7 +63,7 @@ void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u64 num_bytes,
enum btrfs_reserve_flush_enum flush);
-int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor);
+int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent);
int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u64 min_reserved,
enum btrfs_reserve_flush_enum flush);
@@ -70,9 +71,6 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
bool update_size);
int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes);
-int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *dest, u64 num_bytes,
- int min_factor);
void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes, bool update_size);
u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
@@ -92,4 +90,13 @@ static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
btrfs_block_rsv_release(fs_info, block_rsv, 0, NULL);
}
+/*
+ * Fast path to check if the reserve is full, may be carefully used outside of
+ * locks.
+ */
+static inline bool btrfs_block_rsv_full(const struct btrfs_block_rsv *rsv)
+{
+ return data_race(rsv->full);
+}
+
#endif /* BTRFS_BLOCK_RSV_H */
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index b160b8e124e0..195c09e20609 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -65,6 +65,8 @@ enum {
* on the same file.
*/
BTRFS_INODE_VERITY_IN_PROGRESS,
+ /* Set when this inode is a free space inode. */
+ BTRFS_INODE_FREE_SPACE_INODE,
};
/* in memory btrfs inode */
@@ -94,7 +96,8 @@ struct btrfs_inode {
/* special utility tree used to record which mirrors have already been
* tried when checksums fail for a given block
*/
- struct extent_io_tree io_failure_tree;
+ struct rb_root io_failure_tree;
+ spinlock_t io_failure_lock;
/*
* Keep track of where the inode has extent items mapped in order to
@@ -250,11 +253,6 @@ struct btrfs_inode {
struct inode vfs_inode;
};
-static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode)
-{
- return inode->root->fs_info->sectorsize;
-}
-
static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
{
return container_of(inode, struct btrfs_inode, vfs_inode);
@@ -272,13 +270,6 @@ static inline unsigned long btrfs_inode_hash(u64 objectid,
return (unsigned long)h;
}
-static inline void btrfs_insert_inode_hash(struct inode *inode)
-{
- unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root);
-
- __insert_inode_hash(inode, h);
-}
-
#if BITS_PER_LONG == 32
/*
@@ -312,13 +303,7 @@ static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)
{
- struct btrfs_root *root = inode->root;
-
- if (root == root->fs_info->tree_root &&
- btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID)
- return true;
-
- return false;
+ return test_bit(BTRFS_INODE_FREE_SPACE_INODE, &inode->runtime_flags);
}
static inline bool is_data_inode(struct inode *inode)
@@ -426,29 +411,142 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags,
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
-static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
- u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
-{
- struct btrfs_root *root = inode->root;
- const u32 csum_size = root->fs_info->csum_size;
-
- /* Output minus objectid, which is more meaningful */
- if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID)
- btrfs_warn_rl(root->fs_info,
-"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
- root->root_key.objectid, btrfs_ino(inode),
- logical_start,
- CSUM_FMT_VALUE(csum_size, csum),
- CSUM_FMT_VALUE(csum_size, csum_expected),
- mirror_num);
- else
- btrfs_warn_rl(root->fs_info,
-"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
- root->root_key.objectid, btrfs_ino(inode),
- logical_start,
- CSUM_FMT_VALUE(csum_size, csum),
- CSUM_FMT_VALUE(csum_size, csum_expected),
- mirror_num);
-}
+void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num);
+void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio,
+ int mirror_num, enum btrfs_compression_type compress_type);
+void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num);
+blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio);
+blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode,
+ struct bio *bio,
+ u64 dio_file_offset);
+int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
+ u32 pgoff, u8 *csum, const u8 * const csum_expected);
+int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page, u32 pgoff);
+unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page,
+ u64 start, u64 end);
+noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
+ u64 *orig_start, u64 *orig_block_len,
+ u64 *ram_bytes, bool nowait, bool strict);
+
+void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode);
+struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
+int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index);
+int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir, struct btrfs_inode *inode,
+ const struct fscrypt_str *name);
+int btrfs_add_link(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
+ const struct fscrypt_str *name, int add_backref, u64 index);
+int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry);
+int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
+ int front);
+
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
+ bool in_reclaim_context);
+int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
+ unsigned int extra_bits,
+ struct extent_state **cached_state);
+
+struct btrfs_new_inode_args {
+ /* Input */
+ struct inode *dir;
+ struct dentry *dentry;
+ struct inode *inode;
+ bool orphan;
+ bool subvol;
+
+ /* Output from btrfs_new_inode_prepare(), input to btrfs_create_new_inode(). */
+ struct posix_acl *default_acl;
+ struct posix_acl *acl;
+ struct fscrypt_name fname;
+};
+
+int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
+ unsigned int *trans_num_items);
+int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_new_inode_args *args);
+void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args);
+struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns,
+ struct inode *dir);
+ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
+ u32 bits);
+void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
+ struct extent_state *state, u32 bits);
+void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new,
+ struct extent_state *other);
+void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
+ struct extent_state *orig, u64 split);
+void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
+vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
+void btrfs_evict_inode(struct inode *inode);
+struct inode *btrfs_alloc_inode(struct super_block *sb);
+void btrfs_destroy_inode(struct inode *inode);
+void btrfs_free_inode(struct inode *inode);
+int btrfs_drop_inode(struct inode *inode);
+int __init btrfs_init_cachep(void);
+void __cold btrfs_destroy_cachep(void);
+struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
+ struct btrfs_root *root, struct btrfs_path *path);
+struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root);
+struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
+ struct page *page, size_t pg_offset,
+ u64 start, u64 end);
+int btrfs_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_inode *inode);
+int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_inode *inode);
+int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode);
+int btrfs_orphan_cleanup(struct btrfs_root *root);
+int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size);
+void btrfs_add_delayed_iput(struct btrfs_inode *inode);
+void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
+int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info);
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+ u64 start, u64 num_bytes, u64 min_size,
+ loff_t actual_len, u64 *alloc_hint);
+int btrfs_prealloc_file_range_trans(struct inode *inode,
+ struct btrfs_trans_handle *trans, int mode,
+ u64 start, u64 num_bytes, u64 min_size,
+ loff_t actual_len, u64 *alloc_hint);
+int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written, struct writeback_control *wbc);
+int btrfs_writepage_cow_fixup(struct page *page);
+void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
+ struct page *page, u64 start,
+ u64 end, bool uptodate);
+int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
+ int compress_type);
+int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
+ u64 file_offset, u64 disk_bytenr,
+ u64 disk_io_size,
+ struct page **pages);
+ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
+ struct btrfs_ioctl_encoded_io_args *encoded);
+ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded);
+
+ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
+ size_t done_before);
+struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
+ size_t done_before);
+
+extern const struct dentry_operations btrfs_dentry_operations;
+
+/* Inode locking type flags, by default the exclusive lock is taken. */
+enum btrfs_ilock_type {
+ ENUM_BIT(BTRFS_ILOCK_SHARED),
+ ENUM_BIT(BTRFS_ILOCK_TRY),
+ ENUM_BIT(BTRFS_ILOCK_MMAP),
+};
+
+int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags);
+void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags);
+void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes,
+ const u64 del_bytes);
+void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end);
#endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 98c6e5feab19..82e49d985019 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -82,6 +82,7 @@
#include <linux/mm.h>
#include <linux/string.h>
#include <crypto/hash.h>
+#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -92,6 +93,7 @@
#include "check-integrity.h"
#include "rcu-string.h"
#include "compression.h"
+#include "accessors.h"
#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
@@ -755,7 +757,7 @@ static int btrfsic_process_superblock_dev_mirror(
btrfs_info_in_rcu(fs_info,
"new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)",
superblock_bdev,
- rcu_str_deref(device->name), dev_bytenr,
+ btrfs_dev_name(device), dev_bytenr,
dev_state->bdev, dev_bytenr,
superblock_mirror_num);
list_add(&superblock_tmp->all_blocks_node,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index e84d22c5c6a8..5122ca79f7ea 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -8,6 +8,7 @@
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
+#include <linux/pagevec.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/time.h>
@@ -15,22 +16,26 @@
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
+#include <linux/psi.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/log2.h>
#include <crypto/hash.h>
#include "misc.h"
#include "ctree.h"
+#include "fs.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "volumes.h"
+#include "bio.h"
#include "ordered-data.h"
#include "compression.h"
#include "extent_io.h"
#include "extent_map.h"
#include "subpage.h"
#include "zoned.h"
+#include "file-item.h"
+#include "super.h"
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
@@ -114,7 +119,7 @@ static int compression_decompress_bio(struct list_head *ws,
}
static int compression_decompress(int type, struct list_head *ws,
- unsigned char *data_in, struct page *dest_page,
+ const u8 *data_in, struct page *dest_page,
unsigned long start_byte, size_t srclen, size_t destlen)
{
switch (type) {
@@ -152,9 +157,7 @@ static void finish_compressed_bio_read(struct compressed_bio *cb)
}
/* Do io completion on the original bio */
- if (cb->status != BLK_STS_OK)
- cb->orig_bio->bi_status = cb->status;
- bio_endio(cb->orig_bio);
+ btrfs_bio_end_io(btrfs_bio(cb->orig_bio), cb->status);
/* Finally free the cb struct */
kfree(cb->compressed_pages);
@@ -166,16 +169,15 @@ static void finish_compressed_bio_read(struct compressed_bio *cb)
* before decompressing it into the original bio and freeing the uncompressed
* pages.
*/
-static void end_compressed_bio_read(struct bio *bio)
+static void end_compressed_bio_read(struct btrfs_bio *bbio)
{
- struct compressed_bio *cb = bio->bi_private;
+ struct compressed_bio *cb = bbio->private;
struct inode *inode = cb->inode;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_inode *bi = BTRFS_I(inode);
bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) &&
!test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
- blk_status_t status = bio->bi_status;
- struct btrfs_bio *bbio = btrfs_bio(bio);
+ blk_status_t status = bbio->bio.bi_status;
struct bvec_iter iter;
struct bio_vec bv;
u32 offset;
@@ -184,18 +186,17 @@ static void end_compressed_bio_read(struct bio *bio)
u64 start = bbio->file_offset + offset;
if (!status &&
- (!csum || !btrfs_check_data_csum(inode, bbio, offset,
+ (!csum || !btrfs_check_data_csum(bi, bbio, offset,
bv.bv_page, bv.bv_offset))) {
- clean_io_failure(fs_info, &bi->io_failure_tree,
- &bi->io_tree, start, bv.bv_page,
- btrfs_ino(bi), bv.bv_offset);
+ btrfs_clean_io_failure(bi, start, bv.bv_page,
+ bv.bv_offset);
} else {
int ret;
refcount_inc(&cb->pending_ios);
- ret = btrfs_repair_one_sector(inode, bbio, offset,
+ ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset,
bv.bv_page, bv.bv_offset,
- btrfs_submit_data_read_bio);
+ true);
if (ret) {
refcount_dec(&cb->pending_ios);
status = errno_to_blk_status(ret);
@@ -209,7 +210,7 @@ static void end_compressed_bio_read(struct bio *bio)
if (refcount_dec_and_test(&cb->pending_ios))
finish_compressed_bio_read(cb);
btrfs_bio_free_csum(bbio);
- bio_put(bio);
+ bio_put(&bbio->bio);
}
/*
@@ -222,8 +223,7 @@ static noinline void end_compressed_writeback(struct inode *inode,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
unsigned long index = cb->start >> PAGE_SHIFT;
unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
- struct page *pages[16];
- unsigned long nr_pages = end_index - index + 1;
+ struct folio_batch fbatch;
const int errno = blk_status_to_errno(cb->status);
int i;
int ret;
@@ -231,24 +231,23 @@ static noinline void end_compressed_writeback(struct inode *inode,
if (errno)
mapping_set_error(inode->i_mapping, errno);
- while (nr_pages > 0) {
- ret = find_get_pages_contig(inode->i_mapping, index,
- min_t(unsigned long,
- nr_pages, ARRAY_SIZE(pages)), pages);
- if (ret == 0) {
- nr_pages -= 1;
- index += 1;
- continue;
- }
+ folio_batch_init(&fbatch);
+ while (index <= end_index) {
+ ret = filemap_get_folios(inode->i_mapping, &index, end_index,
+ &fbatch);
+
+ if (ret == 0)
+ return;
+
for (i = 0; i < ret; i++) {
+ struct folio *folio = fbatch.folios[i];
+
if (errno)
- SetPageError(pages[i]);
- btrfs_page_clamp_clear_writeback(fs_info, pages[i],
+ folio_set_error(folio);
+ btrfs_page_clamp_clear_writeback(fs_info, &folio->page,
cb->start, cb->len);
- put_page(pages[i]);
}
- nr_pages -= ret;
- index += ret;
+ folio_batch_release(&fbatch);
}
/* the inode may be gone now */
}
@@ -301,20 +300,20 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work)
* This also calls the writeback end hooks for the file pages so that metadata
* and checksums can be updated in the file.
*/
-static void end_compressed_bio_write(struct bio *bio)
+static void end_compressed_bio_write(struct btrfs_bio *bbio)
{
- struct compressed_bio *cb = bio->bi_private;
+ struct compressed_bio *cb = bbio->private;
- if (bio->bi_status)
- cb->status = bio->bi_status;
+ if (bbio->bio.bi_status)
+ cb->status = bbio->bio.bi_status;
if (refcount_dec_and_test(&cb->pending_ios)) {
struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
- btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+ btrfs_record_physical_zoned(cb->inode, cb->start, &bbio->bio);
queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
}
- bio_put(bio);
+ bio_put(&bbio->bio);
}
/*
@@ -335,7 +334,8 @@ static void end_compressed_bio_write(struct bio *bio)
static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr,
- blk_opf_t opf, bio_end_io_t endio_func,
+ blk_opf_t opf,
+ btrfs_bio_end_io_t endio_func,
u64 *next_stripe_start)
{
struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
@@ -344,12 +344,8 @@ static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_byte
struct bio *bio;
int ret;
- bio = btrfs_bio_alloc(BIO_MAX_VECS);
-
+ bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, endio_func, cb);
bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
- bio->bi_opf = opf;
- bio->bi_private = cb;
- bio->bi_end_io = endio_func;
em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize);
if (IS_ERR(em)) {
@@ -478,8 +474,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
if (!skip_sum) {
ret = btrfs_csum_one_bio(inode, bio, start, true);
if (ret) {
- bio->bi_status = ret;
- bio_endio(bio);
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
break;
}
}
@@ -519,7 +514,8 @@ static u64 bio_end_offset(struct bio *bio)
*/
static noinline int add_ra_bio_pages(struct inode *inode,
u64 compressed_end,
- struct compressed_bio *cb)
+ struct compressed_bio *cb,
+ int *memstall, unsigned long *pflags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
unsigned long end_index;
@@ -588,6 +584,11 @@ static noinline int add_ra_bio_pages(struct inode *inode,
continue;
}
+ if (!*memstall && PageWorkingset(page)) {
+ psi_memstall_enter(pflags);
+ *memstall = 1;
+ }
+
ret = set_page_extent_mapped(page);
if (ret < 0) {
unlock_page(page);
@@ -596,7 +597,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
}
page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
- lock_extent(tree, cur, page_end);
+ lock_extent(tree, cur, page_end, NULL);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
read_unlock(&em_tree->lock);
@@ -610,7 +611,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
(cur + fs_info->sectorsize > extent_map_end(em)) ||
(em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
free_extent_map(em);
- unlock_extent(tree, cur, page_end);
+ unlock_extent(tree, cur, page_end, NULL);
unlock_page(page);
put_page(page);
break;
@@ -630,7 +631,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
add_size = min(em->start + em->len, page_end + 1) - cur;
ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur));
if (ret != add_size) {
- unlock_extent(tree, cur, page_end);
+ unlock_extent(tree, cur, page_end, NULL);
unlock_page(page);
put_page(page);
break;
@@ -674,6 +675,8 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
u64 em_len;
u64 em_start;
struct extent_map *em;
+ unsigned long pflags;
+ int memstall = 0;
blk_status_t ret;
int ret2;
int i;
@@ -729,7 +732,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
goto fail;
}
- add_ra_bio_pages(inode, em_start + em_len, cb);
+ add_ra_bio_pages(inode, em_start + em_len, cb, &memstall, &pflags);
/* include any pages we added in add_ra-bio_pages */
cb->len = bio->bi_iter.bi_size;
@@ -799,8 +802,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL);
if (ret) {
- comp_bio->bi_status = ret;
- bio_endio(comp_bio);
+ btrfs_bio_end_io(btrfs_bio(comp_bio), ret);
break;
}
@@ -810,6 +812,9 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
}
}
+ if (memstall)
+ psi_memstall_leave(&pflags);
+
if (refcount_dec_and_test(&cb->pending_ios))
finish_compressed_bio_read(cb);
return;
@@ -826,8 +831,7 @@ fail:
kfree(cb);
out:
free_extent_map(em);
- bio->bi_status = ret;
- bio_endio(bio);
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
return;
}
@@ -1228,7 +1232,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
* single page, and we want to read a single page out of it.
* start_byte tells us the offset into the compressed data we're interested in
*/
-int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
unsigned long start_byte, size_t srclen, size_t destlen)
{
struct list_head *workspace;
@@ -1242,12 +1246,13 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
return ret;
}
-void __init btrfs_init_compress(void)
+int __init btrfs_init_compress(void)
{
btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
zstd_init_workspace_manager();
+ return 0;
}
void __cold btrfs_exit_compress(void)
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 1aa02903de69..6209d40a1e08 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -6,6 +6,7 @@
#ifndef BTRFS_COMPRESSION_H
#define BTRFS_COMPRESSION_H
+#include <linux/blk_types.h>
#include <linux/sizes.h>
struct btrfs_inode;
@@ -77,7 +78,7 @@ static inline unsigned int btrfs_compress_level(unsigned int type_level)
return ((type_level & 0xF0) >> 4);
}
-void __init btrfs_init_compress(void);
+int __init btrfs_init_compress(void);
void __cold btrfs_exit_compress(void);
int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
@@ -85,7 +86,7 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
unsigned long *out_pages,
unsigned long *total_in,
unsigned long *total_out);
-int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
unsigned long start_byte, size_t srclen, size_t destlen);
int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
struct compressed_bio *cb, u32 decompressed);
@@ -149,7 +150,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
u64 start, struct page **pages, unsigned long *out_pages,
unsigned long *total_in, unsigned long *total_out);
int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
-int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+int zlib_decompress(struct list_head *ws, const u8 *data_in,
struct page *dest_page, unsigned long start_byte, size_t srclen,
size_t destlen);
struct list_head *zlib_alloc_workspace(unsigned int level);
@@ -160,7 +161,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
u64 start, struct page **pages, unsigned long *out_pages,
unsigned long *total_in, unsigned long *total_out);
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
-int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+int lzo_decompress(struct list_head *ws, const u8 *data_in,
struct page *dest_page, unsigned long start_byte, size_t srclen,
size_t destlen);
struct list_head *lzo_alloc_workspace(unsigned int level);
@@ -170,7 +171,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
u64 start, struct page **pages, unsigned long *out_pages,
unsigned long *total_in, unsigned long *total_out);
int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
-int zstd_decompress(struct list_head *ws, unsigned char *data_in,
+int zstd_decompress(struct list_head *ws, const u8 *data_in,
struct page *dest_page, unsigned long start_byte, size_t srclen,
size_t destlen);
void zstd_init_workspace_manager(void);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ebfa35fe1c38..4754c9101a4c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -8,6 +8,7 @@
#include <linux/rbtree.h>
#include <linux/mm.h>
#include <linux/error-injection.h>
+#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -17,6 +18,13 @@
#include "qgroup.h"
#include "tree-mod-log.h"
#include "tree-checker.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "relocation.h"
+#include "file-item.h"
+
+static struct kmem_cache *btrfs_path_cachep;
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
@@ -44,6 +52,104 @@ static const struct btrfs_csums {
.driver = "blake2b-256" },
};
+/*
+ * The leaf data grows from end-to-front in the node. this returns the address
+ * of the start of the last item, which is the stop of the leaf data stack.
+ */
+static unsigned int leaf_data_end(const struct extent_buffer *leaf)
+{
+ u32 nr = btrfs_header_nritems(leaf);
+
+ if (nr == 0)
+ return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
+ return btrfs_item_offset(leaf, nr - 1);
+}
+
+/*
+ * Move data in a @leaf (using memmove, safe for overlapping ranges).
+ *
+ * @leaf: leaf that we're doing a memmove on
+ * @dst_offset: item data offset we're moving to
+ * @src_offset: item data offset were' moving from
+ * @len: length of the data we're moving
+ *
+ * Wrapper around memmove_extent_buffer() that takes into account the header on
+ * the leaf. The btrfs_item offset's start directly after the header, so we
+ * have to adjust any offsets to account for the header in the leaf. This
+ * handles that math to simplify the callers.
+ */
+static inline void memmove_leaf_data(const struct extent_buffer *leaf,
+ unsigned long dst_offset,
+ unsigned long src_offset,
+ unsigned long len)
+{
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset,
+ btrfs_item_nr_offset(leaf, 0) + src_offset, len);
+}
+
+/*
+ * Copy item data from @src into @dst at the given @offset.
+ *
+ * @dst: destination leaf that we're copying into
+ * @src: source leaf that we're copying from
+ * @dst_offset: item data offset we're copying to
+ * @src_offset: item data offset were' copying from
+ * @len: length of the data we're copying
+ *
+ * Wrapper around copy_extent_buffer() that takes into account the header on
+ * the leaf. The btrfs_item offset's start directly after the header, so we
+ * have to adjust any offsets to account for the header in the leaf. This
+ * handles that math to simplify the callers.
+ */
+static inline void copy_leaf_data(const struct extent_buffer *dst,
+ const struct extent_buffer *src,
+ unsigned long dst_offset,
+ unsigned long src_offset, unsigned long len)
+{
+ copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset,
+ btrfs_item_nr_offset(src, 0) + src_offset, len);
+}
+
+/*
+ * Move items in a @leaf (using memmove).
+ *
+ * @dst: destination leaf for the items
+ * @dst_item: the item nr we're copying into
+ * @src_item: the item nr we're copying from
+ * @nr_items: the number of items to copy
+ *
+ * Wrapper around memmove_extent_buffer() that does the math to get the
+ * appropriate offsets into the leaf from the item numbers.
+ */
+static inline void memmove_leaf_items(const struct extent_buffer *leaf,
+ int dst_item, int src_item, int nr_items)
+{
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item),
+ btrfs_item_nr_offset(leaf, src_item),
+ nr_items * sizeof(struct btrfs_item));
+}
+
+/*
+ * Copy items from @src into @dst at the given @offset.
+ *
+ * @dst: destination leaf for the items
+ * @src: source leaf for the items
+ * @dst_item: the item nr we're copying into
+ * @src_item: the item nr we're copying from
+ * @nr_items: the number of items to copy
+ *
+ * Wrapper around copy_extent_buffer() that does the math to get the
+ * appropriate offsets into the leaf from the item numbers.
+ */
+static inline void copy_leaf_items(const struct extent_buffer *dst,
+ const struct extent_buffer *src,
+ int dst_item, int src_item, int nr_items)
+{
+ copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item),
+ btrfs_item_nr_offset(src, src_item),
+ nr_items * sizeof(struct btrfs_item));
+}
+
int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
u16 t = btrfs_super_csum_type(s);
@@ -78,6 +184,8 @@ size_t __attribute_const__ btrfs_get_num_csums(void)
struct btrfs_path *btrfs_alloc_path(void)
{
+ might_sleep();
+
return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}
@@ -114,6 +222,22 @@ noinline void btrfs_release_path(struct btrfs_path *p)
}
/*
+ * We want the transaction abort to print stack trace only for errors where the
+ * cause could be a bug, eg. due to ENOSPC, and not for common errors that are
+ * caused by external factors.
+ */
+bool __cold abort_should_print_stack(int errno)
+{
+ switch (errno) {
+ case -EIO:
+ case -EROFS:
+ case -ENOMEM:
+ return false;
+ }
+ return true;
+}
+
+/*
* safely gets a reference on the root node of a tree. A lock
* is not taken, so a concurrent writer may put a different node
* at the root of the tree. See btrfs_lock_root_node for the
@@ -471,7 +595,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
} else {
WARN_ON(trans->transid != btrfs_header_generation(parent));
btrfs_tree_mod_log_insert_key(parent, parent_slot,
- BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ BTRFS_MOD_LOG_KEY_REPLACE);
btrfs_set_node_blockptr(parent, parent_slot,
cow->start);
btrfs_set_node_ptr_generation(parent, parent_slot,
@@ -834,19 +958,22 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
int slot)
{
int level = btrfs_header_level(parent);
+ struct btrfs_tree_parent_check check = { 0 };
struct extent_buffer *eb;
- struct btrfs_key first_key;
if (slot < 0 || slot >= btrfs_header_nritems(parent))
return ERR_PTR(-ENOENT);
BUG_ON(level == 0);
- btrfs_node_key_to_cpu(parent, &first_key, slot);
+ check.level = level - 1;
+ check.transid = btrfs_node_ptr_generation(parent, slot);
+ check.owner_root = btrfs_header_owner(parent);
+ check.has_first_key = true;
+ btrfs_node_key_to_cpu(parent, &check.first_key, slot);
+
eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
- btrfs_header_owner(parent),
- btrfs_node_ptr_generation(parent, slot),
- level - 1, &first_key);
+ &check);
if (IS_ERR(eb))
return eb;
if (!extent_buffer_uptodate(eb)) {
@@ -1000,7 +1127,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
- BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ BTRFS_MOD_LOG_KEY_REPLACE);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &right_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@@ -1046,7 +1173,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
struct btrfs_disk_key mid_key;
btrfs_node_key(mid, &mid_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot,
- BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ BTRFS_MOD_LOG_KEY_REPLACE);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &mid_key, pslot);
btrfs_mark_buffer_dirty(parent);
@@ -1148,7 +1275,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
orig_slot += left_nr;
btrfs_node_key(mid, &disk_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot,
- BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ BTRFS_MOD_LOG_KEY_REPLACE);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &disk_key, pslot);
btrfs_mark_buffer_dirty(parent);
@@ -1202,7 +1329,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
btrfs_node_key(right, &disk_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
- BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ BTRFS_MOD_LOG_KEY_REPLACE);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &disk_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@@ -1405,10 +1532,10 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
const struct btrfs_key *key)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_tree_parent_check check = { 0 };
u64 blocknr;
u64 gen;
struct extent_buffer *tmp;
- struct btrfs_key first_key;
int ret;
int parent_level;
bool unlock_up;
@@ -1417,7 +1544,11 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
blocknr = btrfs_node_blockptr(*eb_ret, slot);
gen = btrfs_node_ptr_generation(*eb_ret, slot);
parent_level = btrfs_header_level(*eb_ret);
- btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
+ btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
+ check.has_first_key = true;
+ check.level = parent_level - 1;
+ check.transid = gen;
+ check.owner_root = root->root_key.objectid;
/*
* If we need to read an extent buffer from disk and we are holding locks
@@ -1439,7 +1570,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
* parents (shared tree blocks).
*/
if (btrfs_verify_level_key(tmp,
- parent_level - 1, &first_key, gen)) {
+ parent_level - 1, &check.first_key, gen)) {
free_extent_buffer(tmp);
return -EUCLEAN;
}
@@ -1447,11 +1578,16 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
return 0;
}
+ if (p->nowait) {
+ free_extent_buffer(tmp);
+ return -EAGAIN;
+ }
+
if (unlock_up)
btrfs_unlock_up_safe(p, level + 1);
/* now we're allowed to do a blocking uptodate check */
- ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key);
+ ret = btrfs_read_extent_buffer(tmp, &check);
if (ret) {
free_extent_buffer(tmp);
btrfs_release_path(p);
@@ -1467,6 +1603,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
ret = -EAGAIN;
goto out;
+ } else if (p->nowait) {
+ return -EAGAIN;
}
if (unlock_up) {
@@ -1479,8 +1617,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
if (p->reada != READA_NONE)
reada_for_search(fs_info, p, level, slot, key->objectid);
- tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
- gen, parent_level - 1, &first_key);
+ tmp = read_tree_block(fs_info, blocknr, &check);
if (IS_ERR(tmp)) {
btrfs_release_path(p);
return PTR_ERR(tmp);
@@ -1634,7 +1771,13 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
* We don't know the level of the root node until we actually
* have it read locked
*/
- b = btrfs_read_lock_root_node(root);
+ if (p->nowait) {
+ b = btrfs_try_read_lock_root_node(root);
+ if (IS_ERR(b))
+ return b;
+ } else {
+ b = btrfs_read_lock_root_node(root);
+ }
level = btrfs_header_level(b);
if (level > write_lock_level)
goto out;
@@ -1905,11 +2048,20 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int min_write_lock_level;
int prev_cmp;
+ might_sleep();
+
lowest_level = p->lowest_level;
WARN_ON(lowest_level && ins_len > 0);
WARN_ON(p->nodes[0] != NULL);
BUG_ON(!cow && ins_len);
+ /*
+ * For now only allow nowait for read only operations. There's no
+ * strict reason why we can't, we just only need it for reads so it's
+ * only implemented for reads.
+ */
+ ASSERT(!p->nowait || !cow);
+
if (ins_len < 0) {
lowest_unlock = 2;
@@ -1936,7 +2088,12 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (p->need_commit_sem) {
ASSERT(p->search_commit_root);
- down_read(&fs_info->commit_root_sem);
+ if (p->nowait) {
+ if (!down_read_trylock(&fs_info->commit_root_sem))
+ return -EAGAIN;
+ } else {
+ down_read(&fs_info->commit_root_sem);
+ }
}
again:
@@ -2082,7 +2239,15 @@ cow_done:
btrfs_tree_lock(b);
p->locks[level] = BTRFS_WRITE_LOCK;
} else {
- btrfs_tree_read_lock(b);
+ if (p->nowait) {
+ if (!btrfs_try_tree_read_lock(b)) {
+ free_extent_buffer(b);
+ ret = -EAGAIN;
+ goto done;
+ }
+ } else {
+ btrfs_tree_read_lock(b);
+ }
p->locks[level] = BTRFS_READ_LOCK;
}
p->nodes[level] = b;
@@ -2131,6 +2296,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
lowest_level = p->lowest_level;
WARN_ON(p->nodes[0] != NULL);
+ ASSERT(!p->nowait);
if (p->search_commit_root) {
BUG_ON(time_seq);
@@ -2307,7 +2473,7 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
return ret;
}
-/**
+/*
* Search for a valid slot for the given path.
*
* @root: The root node of the tree.
@@ -2366,7 +2532,7 @@ static void fixup_low_keys(struct btrfs_path *path,
break;
t = path->nodes[i];
ret = btrfs_tree_mod_log_insert_key(t, tslot,
- BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC);
+ BTRFS_MOD_LOG_KEY_REPLACE);
BUG_ON(ret < 0);
btrfs_set_node_key(t, key, tslot);
btrfs_mark_buffer_dirty(path->nodes[i]);
@@ -2535,8 +2701,8 @@ static int push_node_left(struct btrfs_trans_handle *trans,
return ret;
}
copy_extent_buffer(dst, src,
- btrfs_node_key_ptr_offset(dst_nritems),
- btrfs_node_key_ptr_offset(0),
+ btrfs_node_key_ptr_offset(dst, dst_nritems),
+ btrfs_node_key_ptr_offset(src, 0),
push_items * sizeof(struct btrfs_key_ptr));
if (push_items < src_nritems) {
@@ -2544,8 +2710,8 @@ static int push_node_left(struct btrfs_trans_handle *trans,
* Don't call btrfs_tree_mod_log_insert_move() here, key removal
* was already fully logged by btrfs_tree_mod_log_eb_copy() above.
*/
- memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
- btrfs_node_key_ptr_offset(push_items),
+ memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0),
+ btrfs_node_key_ptr_offset(src, push_items),
(src_nritems - push_items) *
sizeof(struct btrfs_key_ptr));
}
@@ -2605,8 +2771,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
}
ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
BUG_ON(ret < 0);
- memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
- btrfs_node_key_ptr_offset(0),
+ memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items),
+ btrfs_node_key_ptr_offset(dst, 0),
(dst_nritems) *
sizeof(struct btrfs_key_ptr));
@@ -2617,8 +2783,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
return ret;
}
copy_extent_buffer(dst, src,
- btrfs_node_key_ptr_offset(0),
- btrfs_node_key_ptr_offset(src_nritems - push_items),
+ btrfs_node_key_ptr_offset(dst, 0),
+ btrfs_node_key_ptr_offset(src, src_nritems - push_items),
push_items * sizeof(struct btrfs_key_ptr));
btrfs_set_header_nritems(src, src_nritems - push_items);
@@ -2721,13 +2887,13 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
BUG_ON(ret < 0);
}
memmove_extent_buffer(lower,
- btrfs_node_key_ptr_offset(slot + 1),
- btrfs_node_key_ptr_offset(slot),
+ btrfs_node_key_ptr_offset(lower, slot + 1),
+ btrfs_node_key_ptr_offset(lower, slot),
(nritems - slot) * sizeof(struct btrfs_key_ptr));
}
if (level) {
ret = btrfs_tree_mod_log_insert_key(lower, slot,
- BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
+ BTRFS_MOD_LOG_KEY_ADD);
BUG_ON(ret < 0);
}
btrfs_set_node_key(lower, key, slot);
@@ -2804,8 +2970,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
return ret;
}
copy_extent_buffer(split, c,
- btrfs_node_key_ptr_offset(0),
- btrfs_node_key_ptr_offset(mid),
+ btrfs_node_key_ptr_offset(split, 0),
+ btrfs_node_key_ptr_offset(c, mid),
(c_nritems - mid) * sizeof(struct btrfs_key_ptr));
btrfs_set_header_nritems(split, c_nritems - mid);
btrfs_set_header_nritems(c, mid);
@@ -2945,25 +3111,17 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
/* make room in the right data area */
data_end = leaf_data_end(right);
- memmove_extent_buffer(right,
- BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
- BTRFS_LEAF_DATA_OFFSET + data_end,
- BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
+ memmove_leaf_data(right, data_end - push_space, data_end,
+ BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
/* copy from the left data area */
- copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
- BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
- BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
- push_space);
+ copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
+ leaf_data_end(left), push_space);
- memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
- btrfs_item_nr_offset(0),
- right_nritems * sizeof(struct btrfs_item));
+ memmove_leaf_items(right, push_items, 0, right_nritems);
/* copy the items from left to right */
- copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
- btrfs_item_nr_offset(left_nritems - push_items),
- push_items * sizeof(struct btrfs_item));
+ copy_leaf_items(right, left, 0, left_nritems - push_items, push_items);
/* update the item pointers */
btrfs_init_map_token(&token, right);
@@ -3155,19 +3313,13 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
WARN_ON(!empty && push_items == btrfs_header_nritems(right));
/* push data from right to left */
- copy_extent_buffer(left, right,
- btrfs_item_nr_offset(btrfs_header_nritems(left)),
- btrfs_item_nr_offset(0),
- push_items * sizeof(struct btrfs_item));
+ copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
btrfs_item_offset(right, push_items - 1);
- copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
- leaf_data_end(left) - push_space,
- BTRFS_LEAF_DATA_OFFSET +
- btrfs_item_offset(right, push_items - 1),
- push_space);
+ copy_leaf_data(left, right, leaf_data_end(left) - push_space,
+ btrfs_item_offset(right, push_items - 1), push_space);
old_left_nritems = btrfs_header_nritems(left);
BUG_ON(old_left_nritems <= 0);
@@ -3190,15 +3342,12 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
if (push_items < right_nritems) {
push_space = btrfs_item_offset(right, push_items - 1) -
leaf_data_end(right);
- memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
- BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
- BTRFS_LEAF_DATA_OFFSET +
- leaf_data_end(right), push_space);
+ memmove_leaf_data(right,
+ BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
+ leaf_data_end(right), push_space);
- memmove_extent_buffer(right, btrfs_item_nr_offset(0),
- btrfs_item_nr_offset(push_items),
- (btrfs_header_nritems(right) - push_items) *
- sizeof(struct btrfs_item));
+ memmove_leaf_items(right, 0, push_items,
+ btrfs_header_nritems(right) - push_items);
}
btrfs_init_map_token(&token, right);
@@ -3330,14 +3479,10 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(right, nritems);
data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
- copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
- btrfs_item_nr_offset(mid),
- nritems * sizeof(struct btrfs_item));
+ copy_leaf_items(right, l, 0, mid, nritems);
- copy_extent_buffer(right, l,
- BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
- data_copy_size, BTRFS_LEAF_DATA_OFFSET +
- leaf_data_end(l), data_copy_size);
+ copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size,
+ leaf_data_end(l), data_copy_size);
rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
@@ -3707,9 +3852,7 @@ static noinline int split_item(struct btrfs_path *path,
nritems = btrfs_header_nritems(leaf);
if (slot != nritems) {
/* shift the items */
- memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
- btrfs_item_nr_offset(slot),
- (nritems - slot) * sizeof(struct btrfs_item));
+ memmove_leaf_items(leaf, slot + 1, slot, nritems - slot);
}
btrfs_cpu_key_to_disk(&disk_key, new_key);
@@ -3820,9 +3963,8 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
/* shift the data */
if (from_end) {
- memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
- data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
- data_end, old_data_start + new_size - data_end);
+ memmove_leaf_data(leaf, data_end + size_diff, data_end,
+ old_data_start + new_size - data_end);
} else {
struct btrfs_disk_key disk_key;
u64 offset;
@@ -3847,9 +3989,8 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
}
}
- memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
- data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
- data_end, old_data_start - data_end);
+ memmove_leaf_data(leaf, data_end + size_diff, data_end,
+ old_data_start - data_end);
offset = btrfs_disk_key_offset(&disk_key);
btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
@@ -3914,9 +4055,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
}
/* shift the data */
- memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
- data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
- data_end, old_data - data_end);
+ memmove_leaf_data(leaf, data_end - data_size, data_end,
+ old_data - data_end);
data_end = old_data;
old_size = btrfs_item_size(leaf, slot);
@@ -3929,14 +4069,15 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
}
}
-/**
- * setup_items_for_insert - Helper called before inserting one or more items
- * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
- * in a function that doesn't call btrfs_search_slot
+/*
+ * Make space in the node before inserting one or more items.
*
* @root: root we are inserting items to
* @path: points to the leaf/slot where we are going to insert new items
* @batch: information about the batch of items to insert
+ *
+ * Main purpose is to save stack depth by doing the bulk of the work in a
+ * function that doesn't call btrfs_search_slot
*/
static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
const struct btrfs_item_batch *batch)
@@ -3999,15 +4140,11 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
ioff - batch->total_data_size);
}
/* shift the items */
- memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr),
- btrfs_item_nr_offset(slot),
- (nritems - slot) * sizeof(struct btrfs_item));
+ memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot);
/* shift the data */
- memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
- data_end - batch->total_data_size,
- BTRFS_LEAF_DATA_OFFSET + data_end,
- old_data - data_end);
+ memmove_leaf_data(leaf, data_end - batch->total_data_size,
+ data_end, old_data - data_end);
data_end = old_data;
}
@@ -4161,13 +4298,13 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
BUG_ON(ret < 0);
}
memmove_extent_buffer(parent,
- btrfs_node_key_ptr_offset(slot),
- btrfs_node_key_ptr_offset(slot + 1),
+ btrfs_node_key_ptr_offset(parent, slot),
+ btrfs_node_key_ptr_offset(parent, slot + 1),
sizeof(struct btrfs_key_ptr) *
(nritems - slot - 1));
} else if (level) {
ret = btrfs_tree_mod_log_insert_key(parent, slot,
- BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
+ BTRFS_MOD_LOG_KEY_REMOVE);
BUG_ON(ret < 0);
}
@@ -4242,10 +4379,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
for (i = 0; i < nr; i++)
dsize += btrfs_item_size(leaf, slot + i);
- memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
- data_end + dsize,
- BTRFS_LEAF_DATA_OFFSET + data_end,
- last_off - data_end);
+ memmove_leaf_data(leaf, data_end + dsize, data_end,
+ last_off - data_end);
btrfs_init_map_token(&token, leaf);
for (i = slot + nr; i < nritems; i++) {
@@ -4255,10 +4390,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_set_token_item_offset(&token, i, ioff + dsize);
}
- memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
- btrfs_item_nr_offset(slot + nr),
- sizeof(struct btrfs_item) *
- (nritems - slot - nr));
+ memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr);
}
btrfs_set_header_nritems(leaf, nritems - nr);
nritems -= nr;
@@ -4432,6 +4564,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
int ret = 1;
int keep_locks = path->keep_locks;
+ ASSERT(!path->nowait);
path->keep_locks = 1;
again:
cur = btrfs_read_lock_root_node(root);
@@ -4612,6 +4745,13 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
int ret;
int i;
+ /*
+ * The nowait semantics are used only for write paths, where we don't
+ * use the tree mod log and sequence numbers.
+ */
+ if (time_seq)
+ ASSERT(!path->nowait);
+
nritems = btrfs_header_nritems(path->nodes[0]);
if (nritems == 0)
return 1;
@@ -4630,7 +4770,14 @@ again:
if (path->need_commit_sem) {
path->need_commit_sem = 0;
need_commit_sem = true;
- down_read(&fs_info->commit_root_sem);
+ if (path->nowait) {
+ if (!down_read_trylock(&fs_info->commit_root_sem)) {
+ ret = -EAGAIN;
+ goto done;
+ }
+ } else {
+ down_read(&fs_info->commit_root_sem);
+ }
}
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
}
@@ -4706,7 +4853,7 @@ again:
next = c;
ret = read_block_for_search(root, path, &next, level,
slot, &key);
- if (ret == -EAGAIN)
+ if (ret == -EAGAIN && !path->nowait)
goto again;
if (ret < 0) {
@@ -4716,6 +4863,10 @@ again:
if (!path->skip_locking) {
ret = btrfs_try_tree_read_lock(next);
+ if (!ret && path->nowait) {
+ ret = -EAGAIN;
+ goto done;
+ }
if (!ret && time_seq) {
/*
* If we don't get the lock, we may be racing
@@ -4746,7 +4897,7 @@ again:
ret = read_block_for_search(root, path, &next, level,
0, &key);
- if (ret == -EAGAIN)
+ if (ret == -EAGAIN && !path->nowait)
goto again;
if (ret < 0) {
@@ -4754,8 +4905,16 @@ again:
goto done;
}
- if (!path->skip_locking)
- btrfs_tree_read_lock(next);
+ if (!path->skip_locking) {
+ if (path->nowait) {
+ if (!btrfs_try_tree_read_lock(next)) {
+ ret = -EAGAIN;
+ goto done;
+ }
+ } else {
+ btrfs_tree_read_lock(next);
+ }
+ }
}
ret = 0;
done:
@@ -4773,6 +4932,14 @@ done:
return ret;
}
+int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
+{
+ path->slots[0]++;
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
+ return btrfs_next_old_leaf(root, path, time_seq);
+ return 0;
+}
+
/*
* this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
* searching until it gets past min_objectid or finds an item of 'type'
@@ -4856,3 +5023,18 @@ int btrfs_previous_extent_item(struct btrfs_root *root,
}
return 1;
}
+
+int __init btrfs_ctree_init(void)
+{
+ btrfs_path_cachep = kmem_cache_create("btrfs_path",
+ sizeof(struct btrfs_path), 0,
+ SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_path_cachep)
+ return -ENOMEM;
+ return 0;
+}
+
+void __cold btrfs_ctree_exit(void)
+{
+ kmem_cache_destroy(btrfs_path_cachep);
+}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index df8c99c99df9..6965703a81b6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -28,12 +28,15 @@
#include <linux/refcount.h>
#include <linux/crc32c.h>
#include <linux/iomap.h>
+#include <linux/fscrypt.h>
#include "extent-io-tree.h"
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
#include "block-rsv.h"
#include "locking.h"
+#include "misc.h"
+#include "fs.h"
struct btrfs_trans_handle;
struct btrfs_transaction;
@@ -41,353 +44,15 @@ struct btrfs_pending_snapshot;
struct btrfs_delayed_ref_root;
struct btrfs_space_info;
struct btrfs_block_group;
-extern struct kmem_cache *btrfs_trans_handle_cachep;
-extern struct kmem_cache *btrfs_bit_radix_cachep;
-extern struct kmem_cache *btrfs_path_cachep;
-extern struct kmem_cache *btrfs_free_space_cachep;
-extern struct kmem_cache *btrfs_free_space_bitmap_cachep;
struct btrfs_ordered_sum;
struct btrfs_ref;
struct btrfs_bio;
struct btrfs_ioctl_encoded_io_args;
-
-#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
-
-/*
- * Maximum number of mirrors that can be available for all profiles counting
- * the target device of dev-replace as one. During an active device replace
- * procedure, the target device of the copy operation is a mirror for the
- * filesystem data as well that can be used to read data in order to repair
- * read errors on other disks.
- *
- * Current value is derived from RAID1C4 with 4 copies.
- */
-#define BTRFS_MAX_MIRRORS (4 + 1)
-
-#define BTRFS_MAX_LEVEL 8
-
-#define BTRFS_OLDEST_GENERATION 0ULL
-
-/*
- * we can actually store much bigger names, but lets not confuse the rest
- * of linux
- */
-#define BTRFS_NAME_LEN 255
-
-/*
- * Theoretical limit is larger, but we keep this down to a sane
- * value. That should limit greatly the possibility of collisions on
- * inode ref items.
- */
-#define BTRFS_LINK_MAX 65535U
-
-#define BTRFS_EMPTY_DIR_SIZE 0
-
-/* ioprio of readahead is set to idle */
-#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
-
-#define BTRFS_DIRTY_METADATA_THRESH SZ_32M
-
-/*
- * Use large batch size to reduce overhead of metadata updates. On the reader
- * side, we only read it when we are close to ENOSPC and the read overhead is
- * mostly related to the number of CPUs, so it is OK to use arbitrary large
- * value here.
- */
-#define BTRFS_TOTAL_BYTES_PINNED_BATCH SZ_128M
-
-#define BTRFS_MAX_EXTENT_SIZE SZ_128M
-
-/*
- * Deltas are an effective way to populate global statistics. Give macro names
- * to make it clear what we're doing. An example is discard_extents in
- * btrfs_free_space_ctl.
- */
-#define BTRFS_STAT_NR_ENTRIES 2
-#define BTRFS_STAT_CURR 0
-#define BTRFS_STAT_PREV 1
-
-static inline unsigned long btrfs_chunk_item_size(int num_stripes)
-{
- BUG_ON(num_stripes == 0);
- return sizeof(struct btrfs_chunk) +
- sizeof(struct btrfs_stripe) * (num_stripes - 1);
-}
-
-/*
- * Runtime (in-memory) states of filesystem
- */
-enum {
- /* Global indicator of serious filesystem errors */
- BTRFS_FS_STATE_ERROR,
- /*
- * Filesystem is being remounted, allow to skip some operations, like
- * defrag
- */
- BTRFS_FS_STATE_REMOUNTING,
- /* Filesystem in RO mode */
- BTRFS_FS_STATE_RO,
- /* Track if a transaction abort has been reported on this filesystem */
- BTRFS_FS_STATE_TRANS_ABORTED,
- /*
- * Bio operations should be blocked on this filesystem because a source
- * or target device is being destroyed as part of a device replace
- */
- BTRFS_FS_STATE_DEV_REPLACING,
- /* The btrfs_fs_info created for self-tests */
- BTRFS_FS_STATE_DUMMY_FS_INFO,
-
- BTRFS_FS_STATE_NO_CSUMS,
-
- /* Indicates there was an error cleaning up a log tree. */
- BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
-
- BTRFS_FS_STATE_COUNT
-};
-
-#define BTRFS_BACKREF_REV_MAX 256
-#define BTRFS_BACKREF_REV_SHIFT 56
-#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \
- BTRFS_BACKREF_REV_SHIFT)
-
-#define BTRFS_OLD_BACKREF_REV 0
-#define BTRFS_MIXED_BACKREF_REV 1
-
-/*
- * every tree block (leaf or node) starts with this header.
- */
-struct btrfs_header {
- /* these first four must match the super block */
- u8 csum[BTRFS_CSUM_SIZE];
- u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
- __le64 bytenr; /* which block this node is supposed to live in */
- __le64 flags;
-
- /* allowed to be different from the super from here on down */
- u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
- __le64 generation;
- __le64 owner;
- __le32 nritems;
- u8 level;
-} __attribute__ ((__packed__));
-
-/*
- * this is a very generous portion of the super block, giving us
- * room to translate 14 chunks with 3 stripes each.
- */
-#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
-
-/*
- * just in case we somehow lose the roots and are not able to mount,
- * we store an array of the roots from previous transactions
- * in the super.
- */
-#define BTRFS_NUM_BACKUP_ROOTS 4
-struct btrfs_root_backup {
- __le64 tree_root;
- __le64 tree_root_gen;
-
- __le64 chunk_root;
- __le64 chunk_root_gen;
-
- __le64 extent_root;
- __le64 extent_root_gen;
-
- __le64 fs_root;
- __le64 fs_root_gen;
-
- __le64 dev_root;
- __le64 dev_root_gen;
-
- __le64 csum_root;
- __le64 csum_root_gen;
-
- __le64 total_bytes;
- __le64 bytes_used;
- __le64 num_devices;
- /* future */
- __le64 unused_64[4];
-
- u8 tree_root_level;
- u8 chunk_root_level;
- u8 extent_root_level;
- u8 fs_root_level;
- u8 dev_root_level;
- u8 csum_root_level;
- /* future and to align */
- u8 unused_8[10];
-} __attribute__ ((__packed__));
-
-#define BTRFS_SUPER_INFO_OFFSET SZ_64K
-#define BTRFS_SUPER_INFO_SIZE 4096
-
-/*
- * The reserved space at the beginning of each device.
- * It covers the primary super block and leaves space for potential use by other
- * tools like bootloaders or to lower potential damage of accidental overwrite.
- */
-#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M)
-
-/*
- * the super block basically lists the main trees of the FS
- * it currently lacks any block count etc etc
- */
-struct btrfs_super_block {
- /* the first 4 fields must match struct btrfs_header */
- u8 csum[BTRFS_CSUM_SIZE];
- /* FS specific UUID, visible to user */
- u8 fsid[BTRFS_FSID_SIZE];
- __le64 bytenr; /* this block number */
- __le64 flags;
-
- /* allowed to be different from the btrfs_header from here own down */
- __le64 magic;
- __le64 generation;
- __le64 root;
- __le64 chunk_root;
- __le64 log_root;
-
- /*
- * This member has never been utilized since the very beginning, thus
- * it's always 0 regardless of kernel version. We always use
- * generation + 1 to read log tree root. So here we mark it deprecated.
- */
- __le64 __unused_log_root_transid;
- __le64 total_bytes;
- __le64 bytes_used;
- __le64 root_dir_objectid;
- __le64 num_devices;
- __le32 sectorsize;
- __le32 nodesize;
- __le32 __unused_leafsize;
- __le32 stripesize;
- __le32 sys_chunk_array_size;
- __le64 chunk_root_generation;
- __le64 compat_flags;
- __le64 compat_ro_flags;
- __le64 incompat_flags;
- __le16 csum_type;
- u8 root_level;
- u8 chunk_root_level;
- u8 log_root_level;
- struct btrfs_dev_item dev_item;
-
- char label[BTRFS_LABEL_SIZE];
-
- __le64 cache_generation;
- __le64 uuid_tree_generation;
-
- /* the UUID written into btree blocks */
- u8 metadata_uuid[BTRFS_FSID_SIZE];
-
- /* Extent tree v2 */
- __le64 block_group_root;
- __le64 block_group_root_generation;
- u8 block_group_root_level;
-
- /* future expansion */
- u8 reserved8[7];
- __le64 reserved[25];
- u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
- struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
-
- /* Padded to 4096 bytes */
- u8 padding[565];
-} __attribute__ ((__packed__));
-static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
-
-/*
- * Compat flags that we support. If any incompat flags are set other than the
- * ones specified below then we will fail to mount
- */
-#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
-#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL
-#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL
-
-#define BTRFS_FEATURE_COMPAT_RO_SUPP \
- (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \
- BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \
- BTRFS_FEATURE_COMPAT_RO_VERITY)
-
-#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
-#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
-
-#ifdef CONFIG_BTRFS_DEBUG
-/*
- * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG
- */
-#define BTRFS_FEATURE_INCOMPAT_SUPP \
- (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
- BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
- BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
- BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
- BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
- BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \
- BTRFS_FEATURE_INCOMPAT_RAID56 | \
- BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
- BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
- BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
- BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
- BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
- BTRFS_FEATURE_INCOMPAT_ZONED | \
- BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)
-#else
-#define BTRFS_FEATURE_INCOMPAT_SUPP \
- (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
- BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
- BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
- BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
- BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
- BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \
- BTRFS_FEATURE_INCOMPAT_RAID56 | \
- BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
- BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
- BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
- BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
- BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
- BTRFS_FEATURE_INCOMPAT_ZONED)
-#endif
-
-#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
- (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
-#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL
-
-/*
- * A leaf is full of items. offset and size tell us where to find
- * the item in the leaf (relative to the start of the data area)
- */
-struct btrfs_item {
- struct btrfs_disk_key key;
- __le32 offset;
- __le32 size;
-} __attribute__ ((__packed__));
-
-/*
- * leaves have an item area and a data area:
- * [item0, item1....itemN] [free space] [dataN...data1, data0]
- *
- * The data is separate from the items to get the keys closer together
- * during searches.
- */
-struct btrfs_leaf {
- struct btrfs_header header;
- struct btrfs_item items[];
-} __attribute__ ((__packed__));
-
-/*
- * all non-leaf blocks are nodes, they hold only keys and pointers to
- * other blocks
- */
-struct btrfs_key_ptr {
- struct btrfs_disk_key key;
- __le64 blockptr;
- __le64 generation;
-} __attribute__ ((__packed__));
-
-struct btrfs_node {
- struct btrfs_header header;
- struct btrfs_key_ptr ptrs[];
-} __attribute__ ((__packed__));
+struct btrfs_device;
+struct btrfs_fs_devices;
+struct btrfs_balance_control;
+struct btrfs_delayed_root;
+struct reloc_control;
/* Read ahead values for struct btrfs_path.reada */
enum {
@@ -443,676 +108,11 @@ struct btrfs_path {
* header (ie. sizeof(struct btrfs_item) is not included).
*/
unsigned int search_for_extension:1;
-};
-#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
- sizeof(struct btrfs_item))
-struct btrfs_dev_replace {
- u64 replace_state; /* see #define above */
- time64_t time_started; /* seconds since 1-Jan-1970 */
- time64_t time_stopped; /* seconds since 1-Jan-1970 */
- atomic64_t num_write_errors;
- atomic64_t num_uncorrectable_read_errors;
-
- u64 cursor_left;
- u64 committed_cursor_left;
- u64 cursor_left_last_write_of_item;
- u64 cursor_right;
-
- u64 cont_reading_from_srcdev_mode; /* see #define above */
-
- int is_valid;
- int item_needs_writeback;
- struct btrfs_device *srcdev;
- struct btrfs_device *tgtdev;
-
- struct mutex lock_finishing_cancel_unmount;
- struct rw_semaphore rwsem;
-
- struct btrfs_scrub_progress scrub_progress;
-
- struct percpu_counter bio_counter;
- wait_queue_head_t replace_wait;
-};
-
-/*
- * free clusters are used to claim free space in relatively large chunks,
- * allowing us to do less seeky writes. They are used for all metadata
- * allocations. In ssd_spread mode they are also used for data allocations.
- */
-struct btrfs_free_cluster {
- spinlock_t lock;
- spinlock_t refill_lock;
- struct rb_root root;
-
- /* largest extent in this cluster */
- u64 max_size;
-
- /* first extent starting offset */
- u64 window_start;
-
- /* We did a full search and couldn't create a cluster */
- bool fragmented;
-
- struct btrfs_block_group *block_group;
- /*
- * when a cluster is allocated from a block group, we put the
- * cluster onto a list in the block group so that it can
- * be freed before the block group is freed.
- */
- struct list_head block_group_list;
-};
-
-enum btrfs_caching_type {
- BTRFS_CACHE_NO,
- BTRFS_CACHE_STARTED,
- BTRFS_CACHE_FINISHED,
- BTRFS_CACHE_ERROR,
-};
-
-/*
- * Tree to record all locked full stripes of a RAID5/6 block group
- */
-struct btrfs_full_stripe_locks_tree {
- struct rb_root root;
- struct mutex lock;
-};
-
-/* Discard control. */
-/*
- * Async discard uses multiple lists to differentiate the discard filter
- * parameters. Index 0 is for completely free block groups where we need to
- * ensure the entire block group is trimmed without being lossy. Indices
- * afterwards represent monotonically decreasing discard filter sizes to
- * prioritize what should be discarded next.
- */
-#define BTRFS_NR_DISCARD_LISTS 3
-#define BTRFS_DISCARD_INDEX_UNUSED 0
-#define BTRFS_DISCARD_INDEX_START 1
-
-struct btrfs_discard_ctl {
- struct workqueue_struct *discard_workers;
- struct delayed_work work;
- spinlock_t lock;
- struct btrfs_block_group *block_group;
- struct list_head discard_list[BTRFS_NR_DISCARD_LISTS];
- u64 prev_discard;
- u64 prev_discard_time;
- atomic_t discardable_extents;
- atomic64_t discardable_bytes;
- u64 max_discard_size;
- u64 delay_ms;
- u32 iops_limit;
- u32 kbps_limit;
- u64 discard_extent_bytes;
- u64 discard_bitmap_bytes;
- atomic64_t discard_bytes_saved;
-};
-
-void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
-
-/* fs_info */
-struct reloc_control;
-struct btrfs_device;
-struct btrfs_fs_devices;
-struct btrfs_balance_control;
-struct btrfs_delayed_root;
-
-/*
- * Block group or device which contains an active swapfile. Used for preventing
- * unsafe operations while a swapfile is active.
- *
- * These are sorted on (ptr, inode) (note that a block group or device can
- * contain more than one swapfile). We compare the pointer values because we
- * don't actually care what the object is, we just need a quick check whether
- * the object exists in the rbtree.
- */
-struct btrfs_swapfile_pin {
- struct rb_node node;
- void *ptr;
- struct inode *inode;
- /*
- * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr
- * points to a struct btrfs_device.
- */
- bool is_block_group;
- /*
- * Only used when 'is_block_group' is true and it is the number of
- * extents used by a swapfile for this block group ('ptr' field).
- */
- int bg_extent_count;
-};
-
-bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
-
-enum {
- BTRFS_FS_CLOSING_START,
- BTRFS_FS_CLOSING_DONE,
- BTRFS_FS_LOG_RECOVERING,
- BTRFS_FS_OPEN,
- BTRFS_FS_QUOTA_ENABLED,
- BTRFS_FS_UPDATE_UUID_TREE_GEN,
- BTRFS_FS_CREATING_FREE_SPACE_TREE,
- BTRFS_FS_BTREE_ERR,
- BTRFS_FS_LOG1_ERR,
- BTRFS_FS_LOG2_ERR,
- BTRFS_FS_QUOTA_OVERRIDE,
- /* Used to record internally whether fs has been frozen */
- BTRFS_FS_FROZEN,
- /*
- * Indicate that balance has been set up from the ioctl and is in the
- * main phase. The fs_info::balance_ctl is initialized.
- */
- BTRFS_FS_BALANCE_RUNNING,
-
- /*
- * Indicate that relocation of a chunk has started, it's set per chunk
- * and is toggled between chunks.
- */
- BTRFS_FS_RELOC_RUNNING,
-
- /* Indicate that the cleaner thread is awake and doing something. */
- BTRFS_FS_CLEANER_RUNNING,
-
- /*
- * The checksumming has an optimized version and is considered fast,
- * so we don't need to offload checksums to workqueues.
- */
- BTRFS_FS_CSUM_IMPL_FAST,
-
- /* Indicate that the discard workqueue can service discards. */
- BTRFS_FS_DISCARD_RUNNING,
-
- /* Indicate that we need to cleanup space cache v1 */
- BTRFS_FS_CLEANUP_SPACE_CACHE_V1,
-
- /* Indicate that we can't trust the free space tree for caching yet */
- BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED,
-
- /* Indicate whether there are any tree modification log users */
- BTRFS_FS_TREE_MOD_LOG_USERS,
-
- /* Indicate that we want the transaction kthread to commit right now. */
- BTRFS_FS_COMMIT_TRANS,
-
- /* Indicate we have half completed snapshot deletions pending. */
- BTRFS_FS_UNFINISHED_DROPS,
-
- /* Indicate we have to finish a zone to do next allocation. */
- BTRFS_FS_NEED_ZONE_FINISH,
-
-#if BITS_PER_LONG == 32
- /* Indicate if we have error/warn message printed on 32bit systems */
- BTRFS_FS_32BIT_ERROR,
- BTRFS_FS_32BIT_WARN,
-#endif
+ /* Stop search if any locks need to be taken (for read) */
+ unsigned int nowait:1;
};
/*
- * Exclusive operations (device replace, resize, device add/remove, balance)
- */
-enum btrfs_exclusive_operation {
- BTRFS_EXCLOP_NONE,
- BTRFS_EXCLOP_BALANCE_PAUSED,
- BTRFS_EXCLOP_BALANCE,
- BTRFS_EXCLOP_DEV_ADD,
- BTRFS_EXCLOP_DEV_REMOVE,
- BTRFS_EXCLOP_DEV_REPLACE,
- BTRFS_EXCLOP_RESIZE,
- BTRFS_EXCLOP_SWAP_ACTIVATE,
-};
-
-/* Store data about transaction commits, exported via sysfs. */
-struct btrfs_commit_stats {
- /* Total number of commits */
- u64 commit_count;
- /* The maximum commit duration so far in ns */
- u64 max_commit_dur;
- /* The last commit duration in ns */
- u64 last_commit_dur;
- /* The total commit duration in ns */
- u64 total_commit_dur;
-};
-
-struct btrfs_fs_info {
- u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
- unsigned long flags;
- struct btrfs_root *tree_root;
- struct btrfs_root *chunk_root;
- struct btrfs_root *dev_root;
- struct btrfs_root *fs_root;
- struct btrfs_root *quota_root;
- struct btrfs_root *uuid_root;
- struct btrfs_root *data_reloc_root;
- struct btrfs_root *block_group_root;
-
- /* the log root tree is a directory of all the other log roots */
- struct btrfs_root *log_root_tree;
-
- /* The tree that holds the global roots (csum, extent, etc) */
- rwlock_t global_root_lock;
- struct rb_root global_root_tree;
-
- spinlock_t fs_roots_radix_lock;
- struct radix_tree_root fs_roots_radix;
-
- /* block group cache stuff */
- rwlock_t block_group_cache_lock;
- struct rb_root_cached block_group_cache_tree;
-
- /* keep track of unallocated space */
- atomic64_t free_chunk_space;
-
- /* Track ranges which are used by log trees blocks/logged data extents */
- struct extent_io_tree excluded_extents;
-
- /* logical->physical extent mapping */
- struct extent_map_tree mapping_tree;
-
- /*
- * block reservation for extent, checksum, root tree and
- * delayed dir index item
- */
- struct btrfs_block_rsv global_block_rsv;
- /* block reservation for metadata operations */
- struct btrfs_block_rsv trans_block_rsv;
- /* block reservation for chunk tree */
- struct btrfs_block_rsv chunk_block_rsv;
- /* block reservation for delayed operations */
- struct btrfs_block_rsv delayed_block_rsv;
- /* block reservation for delayed refs */
- struct btrfs_block_rsv delayed_refs_rsv;
-
- struct btrfs_block_rsv empty_block_rsv;
-
- u64 generation;
- u64 last_trans_committed;
- /*
- * Generation of the last transaction used for block group relocation
- * since the filesystem was last mounted (or 0 if none happened yet).
- * Must be written and read while holding btrfs_fs_info::commit_root_sem.
- */
- u64 last_reloc_trans;
- u64 avg_delayed_ref_runtime;
-
- /*
- * this is updated to the current trans every time a full commit
- * is required instead of the faster short fsync log commits
- */
- u64 last_trans_log_full_commit;
- unsigned long mount_opt;
- /*
- * Track requests for actions that need to be done during transaction
- * commit (like for some mount options).
- */
- unsigned long pending_changes;
- unsigned long compress_type:4;
- unsigned int compress_level;
- u32 commit_interval;
- /*
- * It is a suggestive number, the read side is safe even it gets a
- * wrong number because we will write out the data into a regular
- * extent. The write side(mount/remount) is under ->s_umount lock,
- * so it is also safe.
- */
- u64 max_inline;
-
- struct btrfs_transaction *running_transaction;
- wait_queue_head_t transaction_throttle;
- wait_queue_head_t transaction_wait;
- wait_queue_head_t transaction_blocked_wait;
- wait_queue_head_t async_submit_wait;
-
- /*
- * Used to protect the incompat_flags, compat_flags, compat_ro_flags
- * when they are updated.
- *
- * Because we do not clear the flags for ever, so we needn't use
- * the lock on the read side.
- *
- * We also needn't use the lock when we mount the fs, because
- * there is no other task which will update the flag.
- */
- spinlock_t super_lock;
- struct btrfs_super_block *super_copy;
- struct btrfs_super_block *super_for_commit;
- struct super_block *sb;
- struct inode *btree_inode;
- struct mutex tree_log_mutex;
- struct mutex transaction_kthread_mutex;
- struct mutex cleaner_mutex;
- struct mutex chunk_mutex;
-
- /*
- * this is taken to make sure we don't set block groups ro after
- * the free space cache has been allocated on them
- */
- struct mutex ro_block_group_mutex;
-
- /* this is used during read/modify/write to make sure
- * no two ios are trying to mod the same stripe at the same
- * time
- */
- struct btrfs_stripe_hash_table *stripe_hash_table;
-
- /*
- * this protects the ordered operations list only while we are
- * processing all of the entries on it. This way we make
- * sure the commit code doesn't find the list temporarily empty
- * because another function happens to be doing non-waiting preflush
- * before jumping into the main commit.
- */
- struct mutex ordered_operations_mutex;
-
- struct rw_semaphore commit_root_sem;
-
- struct rw_semaphore cleanup_work_sem;
-
- struct rw_semaphore subvol_sem;
-
- spinlock_t trans_lock;
- /*
- * the reloc mutex goes with the trans lock, it is taken
- * during commit to protect us from the relocation code
- */
- struct mutex reloc_mutex;
-
- struct list_head trans_list;
- struct list_head dead_roots;
- struct list_head caching_block_groups;
-
- spinlock_t delayed_iput_lock;
- struct list_head delayed_iputs;
- atomic_t nr_delayed_iputs;
- wait_queue_head_t delayed_iputs_wait;
-
- atomic64_t tree_mod_seq;
-
- /* this protects tree_mod_log and tree_mod_seq_list */
- rwlock_t tree_mod_log_lock;
- struct rb_root tree_mod_log;
- struct list_head tree_mod_seq_list;
-
- atomic_t async_delalloc_pages;
-
- /*
- * this is used to protect the following list -- ordered_roots.
- */
- spinlock_t ordered_root_lock;
-
- /*
- * all fs/file tree roots in which there are data=ordered extents
- * pending writeback are added into this list.
- *
- * these can span multiple transactions and basically include
- * every dirty data page that isn't from nodatacow
- */
- struct list_head ordered_roots;
-
- struct mutex delalloc_root_mutex;
- spinlock_t delalloc_root_lock;
- /* all fs/file tree roots that have delalloc inodes. */
- struct list_head delalloc_roots;
-
- /*
- * there is a pool of worker threads for checksumming during writes
- * and a pool for checksumming after reads. This is because readers
- * can run with FS locks held, and the writers may be waiting for
- * those locks. We don't want ordering in the pending list to cause
- * deadlocks, and so the two are serviced separately.
- *
- * A third pool does submit_bio to avoid deadlocking with the other
- * two
- */
- struct btrfs_workqueue *workers;
- struct btrfs_workqueue *hipri_workers;
- struct btrfs_workqueue *delalloc_workers;
- struct btrfs_workqueue *flush_workers;
- struct workqueue_struct *endio_workers;
- struct workqueue_struct *endio_meta_workers;
- struct workqueue_struct *endio_raid56_workers;
- struct workqueue_struct *rmw_workers;
- struct workqueue_struct *compressed_write_workers;
- struct btrfs_workqueue *endio_write_workers;
- struct btrfs_workqueue *endio_freespace_worker;
- struct btrfs_workqueue *caching_workers;
-
- /*
- * fixup workers take dirty pages that didn't properly go through
- * the cow mechanism and make them safe to write. It happens
- * for the sys_munmap function call path
- */
- struct btrfs_workqueue *fixup_workers;
- struct btrfs_workqueue *delayed_workers;
-
- struct task_struct *transaction_kthread;
- struct task_struct *cleaner_kthread;
- u32 thread_pool_size;
-
- struct kobject *space_info_kobj;
- struct kobject *qgroups_kobj;
-
- /* used to keep from writing metadata until there is a nice batch */
- struct percpu_counter dirty_metadata_bytes;
- struct percpu_counter delalloc_bytes;
- struct percpu_counter ordered_bytes;
- s32 dirty_metadata_batch;
- s32 delalloc_batch;
-
- struct list_head dirty_cowonly_roots;
-
- struct btrfs_fs_devices *fs_devices;
-
- /*
- * The space_info list is effectively read only after initial
- * setup. It is populated at mount time and cleaned up after
- * all block groups are removed. RCU is used to protect it.
- */
- struct list_head space_info;
-
- struct btrfs_space_info *data_sinfo;
-
- struct reloc_control *reloc_ctl;
-
- /* data_alloc_cluster is only used in ssd_spread mode */
- struct btrfs_free_cluster data_alloc_cluster;
-
- /* all metadata allocations go through this cluster */
- struct btrfs_free_cluster meta_alloc_cluster;
-
- /* auto defrag inodes go here */
- spinlock_t defrag_inodes_lock;
- struct rb_root defrag_inodes;
- atomic_t defrag_running;
-
- /* Used to protect avail_{data, metadata, system}_alloc_bits */
- seqlock_t profiles_lock;
- /*
- * these three are in extended format (availability of single
- * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
- * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
- */
- u64 avail_data_alloc_bits;
- u64 avail_metadata_alloc_bits;
- u64 avail_system_alloc_bits;
-
- /* restriper state */
- spinlock_t balance_lock;
- struct mutex balance_mutex;
- atomic_t balance_pause_req;
- atomic_t balance_cancel_req;
- struct btrfs_balance_control *balance_ctl;
- wait_queue_head_t balance_wait_q;
-
- /* Cancellation requests for chunk relocation */
- atomic_t reloc_cancel_req;
-
- u32 data_chunk_allocations;
- u32 metadata_ratio;
-
- void *bdev_holder;
-
- /* private scrub information */
- struct mutex scrub_lock;
- atomic_t scrubs_running;
- atomic_t scrub_pause_req;
- atomic_t scrubs_paused;
- atomic_t scrub_cancel_req;
- wait_queue_head_t scrub_pause_wait;
- /*
- * The worker pointers are NULL iff the refcount is 0, ie. scrub is not
- * running.
- */
- refcount_t scrub_workers_refcnt;
- struct workqueue_struct *scrub_workers;
- struct workqueue_struct *scrub_wr_completion_workers;
- struct workqueue_struct *scrub_parity_workers;
- struct btrfs_subpage_info *subpage_info;
-
- struct btrfs_discard_ctl discard_ctl;
-
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- u32 check_integrity_print_mask;
-#endif
- /* is qgroup tracking in a consistent state? */
- u64 qgroup_flags;
-
- /* holds configuration and tracking. Protected by qgroup_lock */
- struct rb_root qgroup_tree;
- spinlock_t qgroup_lock;
-
- /*
- * used to avoid frequently calling ulist_alloc()/ulist_free()
- * when doing qgroup accounting, it must be protected by qgroup_lock.
- */
- struct ulist *qgroup_ulist;
-
- /*
- * Protect user change for quota operations. If a transaction is needed,
- * it must be started before locking this lock.
- */
- struct mutex qgroup_ioctl_lock;
-
- /* list of dirty qgroups to be written at next commit */
- struct list_head dirty_qgroups;
-
- /* used by qgroup for an efficient tree traversal */
- u64 qgroup_seq;
-
- /* qgroup rescan items */
- struct mutex qgroup_rescan_lock; /* protects the progress item */
- struct btrfs_key qgroup_rescan_progress;
- struct btrfs_workqueue *qgroup_rescan_workers;
- struct completion qgroup_rescan_completion;
- struct btrfs_work qgroup_rescan_work;
- bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */
-
- /* filesystem state */
- unsigned long fs_state;
-
- struct btrfs_delayed_root *delayed_root;
-
- /* Extent buffer radix tree */
- spinlock_t buffer_lock;
- /* Entries are eb->start / sectorsize */
- struct radix_tree_root buffer_radix;
-
- /* next backup root to be overwritten */
- int backup_root_index;
-
- /* device replace state */
- struct btrfs_dev_replace dev_replace;
-
- struct semaphore uuid_tree_rescan_sem;
-
- /* Used to reclaim the metadata space in the background. */
- struct work_struct async_reclaim_work;
- struct work_struct async_data_reclaim_work;
- struct work_struct preempt_reclaim_work;
-
- /* Reclaim partially filled block groups in the background */
- struct work_struct reclaim_bgs_work;
- struct list_head reclaim_bgs;
- int bg_reclaim_threshold;
-
- spinlock_t unused_bgs_lock;
- struct list_head unused_bgs;
- struct mutex unused_bg_unpin_mutex;
- /* Protect block groups that are going to be deleted */
- struct mutex reclaim_bgs_lock;
-
- /* Cached block sizes */
- u32 nodesize;
- u32 sectorsize;
- /* ilog2 of sectorsize, use to avoid 64bit division */
- u32 sectorsize_bits;
- u32 csum_size;
- u32 csums_per_leaf;
- u32 stripesize;
-
- /*
- * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular
- * filesystem, on zoned it depends on the device constraints.
- */
- u64 max_extent_size;
-
- /* Block groups and devices containing active swapfiles. */
- spinlock_t swapfile_pins_lock;
- struct rb_root swapfile_pins;
-
- struct crypto_shash *csum_shash;
-
- /* Type of exclusive operation running, protected by super_lock */
- enum btrfs_exclusive_operation exclusive_operation;
-
- /*
- * Zone size > 0 when in ZONED mode, otherwise it's used for a check
- * if the mode is enabled
- */
- u64 zone_size;
-
- /* Max size to emit ZONE_APPEND write command */
- u64 max_zone_append_size;
- struct mutex zoned_meta_io_lock;
- spinlock_t treelog_bg_lock;
- u64 treelog_bg;
-
- /*
- * Start of the dedicated data relocation block group, protected by
- * relocation_bg_lock.
- */
- spinlock_t relocation_bg_lock;
- u64 data_reloc_bg;
- struct mutex zoned_data_reloc_io_lock;
-
- u64 nr_global_roots;
-
- spinlock_t zone_active_bgs_lock;
- struct list_head zone_active_bgs;
-
- /* Updates are not protected by any lock */
- struct btrfs_commit_stats commit_stats;
-
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- spinlock_t ref_verify_lock;
- struct rb_root block_tree;
-#endif
-
-#ifdef CONFIG_BTRFS_DEBUG
- struct kobject *debug_kobj;
- struct kobject *discard_debug_kobj;
- struct list_head allocated_roots;
-
- spinlock_t eb_leak_lock;
- struct list_head allocated_ebs;
-#endif
-};
-
-static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
-{
- return sb->s_fs_info;
-}
-
-/*
* The state of btrfs root
*/
enum {
@@ -1174,11 +174,6 @@ enum {
BTRFS_ROOT_RESET_LOCKDEP_CLASS,
};
-static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
-{
- clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
-}
-
/*
* Record swapped tree blocks of a subvolume tree for delayed subtree trace
* code. For detail check comment in fs/btrfs/qgroup.c.
@@ -1340,6 +335,23 @@ struct btrfs_root {
#endif
};
+static inline bool btrfs_root_readonly(const struct btrfs_root *root)
+{
+ /* Byte-swap the constant at compile time, root_item::flags is LE */
+ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0;
+}
+
+static inline bool btrfs_root_dead(const struct btrfs_root *root)
+{
+ /* Byte-swap the constant at compile time, root_item::flags is LE */
+ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0;
+}
+
+static inline u64 btrfs_root_id(const struct btrfs_root *root)
+{
+ return root->root_key.objectid;
+}
+
/*
* Structure that conveys information about an extent that is going to replace
* all the extents in a file range.
@@ -1431,17 +443,14 @@ struct btrfs_drop_extents_args {
struct btrfs_file_private {
void *filldir_buf;
+ struct extent_state *llseek_cached_state;
};
-
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
{
-
return info->nodesize - sizeof(struct btrfs_header);
}
-#define BTRFS_LEAF_DATA_OFFSET offsetof(struct btrfs_leaf, items)
-
static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info)
{
return BTRFS_LEAF_DATA_SIZE(info) - sizeof(struct btrfs_item);
@@ -1452,1272 +461,14 @@ static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_fs_info *info)
return BTRFS_LEAF_DATA_SIZE(info) / sizeof(struct btrfs_key_ptr);
}
-#define BTRFS_FILE_EXTENT_INLINE_DATA_START \
- (offsetof(struct btrfs_file_extent_item, disk_bytenr))
-static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_fs_info *info)
-{
- return BTRFS_MAX_ITEM_SIZE(info) -
- BTRFS_FILE_EXTENT_INLINE_DATA_START;
-}
-
static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
{
return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item);
}
-/*
- * Flags for mount options.
- *
- * Note: don't forget to add new options to btrfs_show_options()
- */
-enum {
- BTRFS_MOUNT_NODATASUM = (1UL << 0),
- BTRFS_MOUNT_NODATACOW = (1UL << 1),
- BTRFS_MOUNT_NOBARRIER = (1UL << 2),
- BTRFS_MOUNT_SSD = (1UL << 3),
- BTRFS_MOUNT_DEGRADED = (1UL << 4),
- BTRFS_MOUNT_COMPRESS = (1UL << 5),
- BTRFS_MOUNT_NOTREELOG = (1UL << 6),
- BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7),
- BTRFS_MOUNT_SSD_SPREAD = (1UL << 8),
- BTRFS_MOUNT_NOSSD = (1UL << 9),
- BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10),
- BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11),
- BTRFS_MOUNT_SPACE_CACHE = (1UL << 12),
- BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13),
- BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14),
- BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15),
- BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16),
- BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17),
- BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18),
- BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19),
- BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20),
- BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21),
- BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22),
- BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23),
- BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24),
- BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25),
- BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26),
- BTRFS_MOUNT_REF_VERIFY = (1UL << 27),
- BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28),
- BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29),
- BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30),
-};
-
-#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
-#define BTRFS_DEFAULT_MAX_INLINE (2048)
-
-#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
-#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
-#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt)
-#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \
- BTRFS_MOUNT_##opt)
-
-#define btrfs_set_and_info(fs_info, opt, fmt, args...) \
-do { \
- if (!btrfs_test_opt(fs_info, opt)) \
- btrfs_info(fs_info, fmt, ##args); \
- btrfs_set_opt(fs_info->mount_opt, opt); \
-} while (0)
-
-#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \
-do { \
- if (btrfs_test_opt(fs_info, opt)) \
- btrfs_info(fs_info, fmt, ##args); \
- btrfs_clear_opt(fs_info->mount_opt, opt); \
-} while (0)
-
-/*
- * Requests for changes that need to be done during transaction commit.
- *
- * Internal mount options that are used for special handling of the real
- * mount options (eg. cannot be set during remount and have to be set during
- * transaction commit)
- */
-
-#define BTRFS_PENDING_COMMIT (0)
-
-#define btrfs_test_pending(info, opt) \
- test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
-#define btrfs_set_pending(info, opt) \
- set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
-#define btrfs_clear_pending(info, opt) \
- clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
-
-/*
- * Helpers for setting pending mount option changes.
- *
- * Expects corresponding macros
- * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name
- */
-#define btrfs_set_pending_and_info(info, opt, fmt, args...) \
-do { \
- if (!btrfs_raw_test_opt((info)->mount_opt, opt)) { \
- btrfs_info((info), fmt, ##args); \
- btrfs_set_pending((info), SET_##opt); \
- btrfs_clear_pending((info), CLEAR_##opt); \
- } \
-} while(0)
-
-#define btrfs_clear_pending_and_info(info, opt, fmt, args...) \
-do { \
- if (btrfs_raw_test_opt((info)->mount_opt, opt)) { \
- btrfs_info((info), fmt, ##args); \
- btrfs_set_pending((info), CLEAR_##opt); \
- btrfs_clear_pending((info), SET_##opt); \
- } \
-} while(0)
-
-/*
- * Inode flags
- */
-#define BTRFS_INODE_NODATASUM (1U << 0)
-#define BTRFS_INODE_NODATACOW (1U << 1)
-#define BTRFS_INODE_READONLY (1U << 2)
-#define BTRFS_INODE_NOCOMPRESS (1U << 3)
-#define BTRFS_INODE_PREALLOC (1U << 4)
-#define BTRFS_INODE_SYNC (1U << 5)
-#define BTRFS_INODE_IMMUTABLE (1U << 6)
-#define BTRFS_INODE_APPEND (1U << 7)
-#define BTRFS_INODE_NODUMP (1U << 8)
-#define BTRFS_INODE_NOATIME (1U << 9)
-#define BTRFS_INODE_DIRSYNC (1U << 10)
-#define BTRFS_INODE_COMPRESS (1U << 11)
-
-#define BTRFS_INODE_ROOT_ITEM_INIT (1U << 31)
-
-#define BTRFS_INODE_FLAG_MASK \
- (BTRFS_INODE_NODATASUM | \
- BTRFS_INODE_NODATACOW | \
- BTRFS_INODE_READONLY | \
- BTRFS_INODE_NOCOMPRESS | \
- BTRFS_INODE_PREALLOC | \
- BTRFS_INODE_SYNC | \
- BTRFS_INODE_IMMUTABLE | \
- BTRFS_INODE_APPEND | \
- BTRFS_INODE_NODUMP | \
- BTRFS_INODE_NOATIME | \
- BTRFS_INODE_DIRSYNC | \
- BTRFS_INODE_COMPRESS | \
- BTRFS_INODE_ROOT_ITEM_INIT)
-
-#define BTRFS_INODE_RO_VERITY (1U << 0)
-
-#define BTRFS_INODE_RO_FLAG_MASK (BTRFS_INODE_RO_VERITY)
-
-struct btrfs_map_token {
- struct extent_buffer *eb;
- char *kaddr;
- unsigned long offset;
-};
-
#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \
((bytes) >> (fs_info)->sectorsize_bits)
-static inline void btrfs_init_map_token(struct btrfs_map_token *token,
- struct extent_buffer *eb)
-{
- token->eb = eb;
- token->kaddr = page_address(eb->pages[0]);
- token->offset = 0;
-}
-
-/* some macros to generate set/get functions for the struct fields. This
- * assumes there is a lefoo_to_cpu for every type, so lets make a simple
- * one for u8:
- */
-#define le8_to_cpu(v) (v)
-#define cpu_to_le8(v) (v)
-#define __le8 u8
-
-static inline u8 get_unaligned_le8(const void *p)
-{
- return *(u8 *)p;
-}
-
-static inline void put_unaligned_le8(u8 val, void *p)
-{
- *(u8 *)p = val;
-}
-
-#define read_eb_member(eb, ptr, type, member, result) (\
- read_extent_buffer(eb, (char *)(result), \
- ((unsigned long)(ptr)) + \
- offsetof(type, member), \
- sizeof(((type *)0)->member)))
-
-#define write_eb_member(eb, ptr, type, member, result) (\
- write_extent_buffer(eb, (char *)(result), \
- ((unsigned long)(ptr)) + \
- offsetof(type, member), \
- sizeof(((type *)0)->member)))
-
-#define DECLARE_BTRFS_SETGET_BITS(bits) \
-u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
- const void *ptr, unsigned long off); \
-void btrfs_set_token_##bits(struct btrfs_map_token *token, \
- const void *ptr, unsigned long off, \
- u##bits val); \
-u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
- const void *ptr, unsigned long off); \
-void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
- unsigned long off, u##bits val);
-
-DECLARE_BTRFS_SETGET_BITS(8)
-DECLARE_BTRFS_SETGET_BITS(16)
-DECLARE_BTRFS_SETGET_BITS(32)
-DECLARE_BTRFS_SETGET_BITS(64)
-
-#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
-static inline u##bits btrfs_##name(const struct extent_buffer *eb, \
- const type *s) \
-{ \
- static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
- return btrfs_get_##bits(eb, s, offsetof(type, member)); \
-} \
-static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \
- u##bits val) \
-{ \
- static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
- btrfs_set_##bits(eb, s, offsetof(type, member), val); \
-} \
-static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \
- const type *s) \
-{ \
- static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
- return btrfs_get_token_##bits(token, s, offsetof(type, member));\
-} \
-static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
- type *s, u##bits val) \
-{ \
- static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
- btrfs_set_token_##bits(token, s, offsetof(type, member), val); \
-}
-
-#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
-static inline u##bits btrfs_##name(const struct extent_buffer *eb) \
-{ \
- const type *p = page_address(eb->pages[0]) + \
- offset_in_page(eb->start); \
- return get_unaligned_le##bits(&p->member); \
-} \
-static inline void btrfs_set_##name(const struct extent_buffer *eb, \
- u##bits val) \
-{ \
- type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \
- put_unaligned_le##bits(val, &p->member); \
-}
-
-#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
-static inline u##bits btrfs_##name(const type *s) \
-{ \
- return get_unaligned_le##bits(&s->member); \
-} \
-static inline void btrfs_set_##name(type *s, u##bits val) \
-{ \
- put_unaligned_le##bits(val, &s->member); \
-}
-
-static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb,
- struct btrfs_dev_item *s)
-{
- static_assert(sizeof(u64) ==
- sizeof(((struct btrfs_dev_item *)0))->total_bytes);
- return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item,
- total_bytes));
-}
-static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb,
- struct btrfs_dev_item *s,
- u64 val)
-{
- static_assert(sizeof(u64) ==
- sizeof(((struct btrfs_dev_item *)0))->total_bytes);
- WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize));
- btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val);
-}
-
-
-BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
-BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
-BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
-BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
-BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item,
- start_offset, 64);
-BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
-BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
-BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
-BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8);
-BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8);
-BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64);
-
-BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
- total_bytes, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item,
- bytes_used, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item,
- io_align, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item,
- io_width, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item,
- sector_size, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item,
- dev_group, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item,
- seek_speed, 8);
-BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
- bandwidth, 8);
-BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item,
- generation, 64);
-
-static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d)
-{
- return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid);
-}
-
-static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d)
-{
- return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid);
-}
-
-BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
-BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
-BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
-BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
-BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
-BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
-BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
-BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
-BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16);
-BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
-BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
-
-static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s)
-{
- return (char *)s + offsetof(struct btrfs_stripe, dev_uuid);
-}
-
-BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk,
- stripe_len, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk,
- io_align, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk,
- io_width, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk,
- sector_size, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk,
- num_stripes, 16);
-BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk,
- sub_stripes, 16);
-BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);
-
-static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c,
- int nr)
-{
- unsigned long offset = (unsigned long)c;
- offset += offsetof(struct btrfs_chunk, stripe);
- offset += nr * sizeof(struct btrfs_stripe);
- return (struct btrfs_stripe *)offset;
-}
-
-static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr)
-{
- return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr));
-}
-
-static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb,
- struct btrfs_chunk *c, int nr)
-{
- return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
-}
-
-static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb,
- struct btrfs_chunk *c, int nr)
-{
- return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
-}
-
-/* struct btrfs_block_group_item */
-BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item,
- used, 64);
-BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item,
- used, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid,
- struct btrfs_block_group_item, chunk_objectid, 64);
-
-BTRFS_SETGET_FUNCS(block_group_chunk_objectid,
- struct btrfs_block_group_item, chunk_objectid, 64);
-BTRFS_SETGET_FUNCS(block_group_flags,
- struct btrfs_block_group_item, flags, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags,
- struct btrfs_block_group_item, flags, 64);
-
-/* struct btrfs_free_space_info */
-BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info,
- extent_count, 32);
-BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32);
-
-/* struct btrfs_inode_ref */
-BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
-BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
-
-/* struct btrfs_inode_extref */
-BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref,
- parent_objectid, 64);
-BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref,
- name_len, 16);
-BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64);
-
-/* struct btrfs_inode_item */
-BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
-BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
-BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
-BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
-BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64);
-BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
-BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32);
-BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
-BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
-BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
-BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
-BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item,
- generation, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item,
- sequence, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item,
- transid, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item,
- nbytes, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item,
- block_group, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
-BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
-BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
-
-/* struct btrfs_dev_extent */
-BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
- chunk_tree, 64);
-BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
- chunk_objectid, 64);
-BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
- chunk_offset, 64);
-BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
-BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64);
-BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item,
- generation, 64);
-BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64);
-
-BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8);
-
-static inline void btrfs_tree_block_key(const struct extent_buffer *eb,
- struct btrfs_tree_block_info *item,
- struct btrfs_disk_key *key)
-{
- read_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
-}
-
-static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb,
- struct btrfs_tree_block_info *item,
- struct btrfs_disk_key *key)
-{
- write_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
-}
-
-BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref,
- root, 64);
-BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref,
- objectid, 64);
-BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref,
- offset, 64);
-BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref,
- count, 32);
-
-BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref,
- count, 32);
-
-BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref,
- type, 8);
-BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref,
- offset, 64);
-
-static inline u32 btrfs_extent_inline_ref_size(int type)
-{
- if (type == BTRFS_TREE_BLOCK_REF_KEY ||
- type == BTRFS_SHARED_BLOCK_REF_KEY)
- return sizeof(struct btrfs_extent_inline_ref);
- if (type == BTRFS_SHARED_DATA_REF_KEY)
- return sizeof(struct btrfs_shared_data_ref) +
- sizeof(struct btrfs_extent_inline_ref);
- if (type == BTRFS_EXTENT_DATA_REF_KEY)
- return sizeof(struct btrfs_extent_data_ref) +
- offsetof(struct btrfs_extent_inline_ref, offset);
- return 0;
-}
-
-/* struct btrfs_node */
-BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
-BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr,
- blockptr, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr,
- generation, 64);
-
-static inline u64 btrfs_node_blockptr(const struct extent_buffer *eb, int nr)
-{
- unsigned long ptr;
- ptr = offsetof(struct btrfs_node, ptrs) +
- sizeof(struct btrfs_key_ptr) * nr;
- return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr);
-}
-
-static inline void btrfs_set_node_blockptr(const struct extent_buffer *eb,
- int nr, u64 val)
-{
- unsigned long ptr;
- ptr = offsetof(struct btrfs_node, ptrs) +
- sizeof(struct btrfs_key_ptr) * nr;
- btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val);
-}
-
-static inline u64 btrfs_node_ptr_generation(const struct extent_buffer *eb, int nr)
-{
- unsigned long ptr;
- ptr = offsetof(struct btrfs_node, ptrs) +
- sizeof(struct btrfs_key_ptr) * nr;
- return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr);
-}
-
-static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb,
- int nr, u64 val)
-{
- unsigned long ptr;
- ptr = offsetof(struct btrfs_node, ptrs) +
- sizeof(struct btrfs_key_ptr) * nr;
- btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val);
-}
-
-static inline unsigned long btrfs_node_key_ptr_offset(int nr)
-{
- return offsetof(struct btrfs_node, ptrs) +
- sizeof(struct btrfs_key_ptr) * nr;
-}
-
-void btrfs_node_key(const struct extent_buffer *eb,
- struct btrfs_disk_key *disk_key, int nr);
-
-static inline void btrfs_set_node_key(const struct extent_buffer *eb,
- struct btrfs_disk_key *disk_key, int nr)
-{
- unsigned long ptr;
- ptr = btrfs_node_key_ptr_offset(nr);
- write_eb_member(eb, (struct btrfs_key_ptr *)ptr,
- struct btrfs_key_ptr, key, disk_key);
-}
-
-/* struct btrfs_item */
-BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32);
-BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32);
-
-static inline unsigned long btrfs_item_nr_offset(int nr)
-{
- return offsetof(struct btrfs_leaf, items) +
- sizeof(struct btrfs_item) * nr;
-}
-
-static inline struct btrfs_item *btrfs_item_nr(int nr)
-{
- return (struct btrfs_item *)btrfs_item_nr_offset(nr);
-}
-
-#define BTRFS_ITEM_SETGET_FUNCS(member) \
-static inline u32 btrfs_item_##member(const struct extent_buffer *eb, \
- int slot) \
-{ \
- return btrfs_raw_item_##member(eb, btrfs_item_nr(slot)); \
-} \
-static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \
- int slot, u32 val) \
-{ \
- btrfs_set_raw_item_##member(eb, btrfs_item_nr(slot), val); \
-} \
-static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \
- int slot) \
-{ \
- struct btrfs_item *item = btrfs_item_nr(slot); \
- return btrfs_token_raw_item_##member(token, item); \
-} \
-static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \
- int slot, u32 val) \
-{ \
- struct btrfs_item *item = btrfs_item_nr(slot); \
- btrfs_set_token_raw_item_##member(token, item, val); \
-}
-
-BTRFS_ITEM_SETGET_FUNCS(offset)
-BTRFS_ITEM_SETGET_FUNCS(size);
-
-static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr)
-{
- return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr);
-}
-
-static inline void btrfs_item_key(const struct extent_buffer *eb,
- struct btrfs_disk_key *disk_key, int nr)
-{
- struct btrfs_item *item = btrfs_item_nr(nr);
- read_eb_member(eb, item, struct btrfs_item, key, disk_key);
-}
-
-static inline void btrfs_set_item_key(struct extent_buffer *eb,
- struct btrfs_disk_key *disk_key, int nr)
-{
- struct btrfs_item *item = btrfs_item_nr(nr);
- write_eb_member(eb, item, struct btrfs_item, key, disk_key);
-}
-
-BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64);
-
-/*
- * struct btrfs_root_ref
- */
-BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64);
-BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64);
-BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16);
-
-/* struct btrfs_dir_item */
-BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
-BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
-BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
-BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8);
-BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item,
- data_len, 16);
-BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item,
- name_len, 16);
-BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item,
- transid, 64);
-
-static inline void btrfs_dir_item_key(const struct extent_buffer *eb,
- const struct btrfs_dir_item *item,
- struct btrfs_disk_key *key)
-{
- read_eb_member(eb, item, struct btrfs_dir_item, location, key);
-}
-
-static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
- struct btrfs_dir_item *item,
- const struct btrfs_disk_key *key)
-{
- write_eb_member(eb, item, struct btrfs_dir_item, location, key);
-}
-
-BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header,
- num_entries, 64);
-BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header,
- num_bitmaps, 64);
-BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header,
- generation, 64);
-
-static inline void btrfs_free_space_key(const struct extent_buffer *eb,
- const struct btrfs_free_space_header *h,
- struct btrfs_disk_key *key)
-{
- read_eb_member(eb, h, struct btrfs_free_space_header, location, key);
-}
-
-static inline void btrfs_set_free_space_key(struct extent_buffer *eb,
- struct btrfs_free_space_header *h,
- const struct btrfs_disk_key *key)
-{
- write_eb_member(eb, h, struct btrfs_free_space_header, location, key);
-}
-
-/* struct btrfs_disk_key */
-BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
- objectid, 64);
-BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
-BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
-
-#ifdef __LITTLE_ENDIAN
-
-/*
- * Optimized helpers for little-endian architectures where CPU and on-disk
- * structures have the same endianness and we can skip conversions.
- */
-
-static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key,
- const struct btrfs_disk_key *disk_key)
-{
- memcpy(cpu_key, disk_key, sizeof(struct btrfs_key));
-}
-
-static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key,
- const struct btrfs_key *cpu_key)
-{
- memcpy(disk_key, cpu_key, sizeof(struct btrfs_key));
-}
-
-static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb,
- struct btrfs_key *cpu_key, int nr)
-{
- struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
-
- btrfs_node_key(eb, disk_key, nr);
-}
-
-static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb,
- struct btrfs_key *cpu_key, int nr)
-{
- struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
-
- btrfs_item_key(eb, disk_key, nr);
-}
-
-static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
- const struct btrfs_dir_item *item,
- struct btrfs_key *cpu_key)
-{
- struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
-
- btrfs_dir_item_key(eb, item, disk_key);
-}
-
-#else
-
-static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
- const struct btrfs_disk_key *disk)
-{
- cpu->offset = le64_to_cpu(disk->offset);
- cpu->type = disk->type;
- cpu->objectid = le64_to_cpu(disk->objectid);
-}
-
-static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
- const struct btrfs_key *cpu)
-{
- disk->offset = cpu_to_le64(cpu->offset);
- disk->type = cpu->type;
- disk->objectid = cpu_to_le64(cpu->objectid);
-}
-
-static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb,
- struct btrfs_key *key, int nr)
-{
- struct btrfs_disk_key disk_key;
- btrfs_node_key(eb, &disk_key, nr);
- btrfs_disk_key_to_cpu(key, &disk_key);
-}
-
-static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb,
- struct btrfs_key *key, int nr)
-{
- struct btrfs_disk_key disk_key;
- btrfs_item_key(eb, &disk_key, nr);
- btrfs_disk_key_to_cpu(key, &disk_key);
-}
-
-static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
- const struct btrfs_dir_item *item,
- struct btrfs_key *key)
-{
- struct btrfs_disk_key disk_key;
- btrfs_dir_item_key(eb, item, &disk_key);
- btrfs_disk_key_to_cpu(key, &disk_key);
-}
-
-#endif
-
-/* struct btrfs_header */
-BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
-BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
- generation, 64);
-BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
-BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
-BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64);
-BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
-BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header,
- generation, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header,
- nritems, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64);
-
-static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag)
-{
- return (btrfs_header_flags(eb) & flag) == flag;
-}
-
-static inline void btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
-{
- u64 flags = btrfs_header_flags(eb);
- btrfs_set_header_flags(eb, flags | flag);
-}
-
-static inline void btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
-{
- u64 flags = btrfs_header_flags(eb);
- btrfs_set_header_flags(eb, flags & ~flag);
-}
-
-static inline int btrfs_header_backref_rev(const struct extent_buffer *eb)
-{
- u64 flags = btrfs_header_flags(eb);
- return flags >> BTRFS_BACKREF_REV_SHIFT;
-}
-
-static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb,
- int rev)
-{
- u64 flags = btrfs_header_flags(eb);
- flags &= ~BTRFS_BACKREF_REV_MASK;
- flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT;
- btrfs_set_header_flags(eb, flags);
-}
-
-static inline int btrfs_is_leaf(const struct extent_buffer *eb)
-{
- return btrfs_header_level(eb) == 0;
-}
-
-/* struct btrfs_root_item */
-BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item,
- generation, 64);
-BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
-BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64);
-BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
-
-BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item,
- generation, 64);
-BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
-BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8);
-BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
-BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
-BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
-BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64);
-BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
-BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
-BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
- last_snapshot, 64);
-BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item,
- generation_v2, 64);
-BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item,
- ctransid, 64);
-BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item,
- otransid, 64);
-BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item,
- stransid, 64);
-BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item,
- rtransid, 64);
-
-static inline bool btrfs_root_readonly(const struct btrfs_root *root)
-{
- /* Byte-swap the constant at compile time, root_item::flags is LE */
- return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0;
-}
-
-static inline bool btrfs_root_dead(const struct btrfs_root *root)
-{
- /* Byte-swap the constant at compile time, root_item::flags is LE */
- return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0;
-}
-
-static inline u64 btrfs_root_id(const struct btrfs_root *root)
-{
- return root->root_key.objectid;
-}
-
-/* struct btrfs_root_backup */
-BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
- tree_root, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
- tree_root_gen, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
- tree_root_level, 8);
-
-BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
- chunk_root, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
- chunk_root_gen, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
- chunk_root_level, 8);
-
-BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
- extent_root, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
- extent_root_gen, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
- extent_root_level, 8);
-
-BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
- fs_root, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
- fs_root_gen, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
- fs_root_level, 8);
-
-BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
- dev_root, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
- dev_root_gen, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
- dev_root_level, 8);
-
-BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
- csum_root, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
- csum_root_gen, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
- csum_root_level, 8);
-BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
- total_bytes, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
- bytes_used, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
- num_devices, 64);
-
-/*
- * For extent tree v2 we overload the extent root with the block group root, as
- * we will have multiple extent roots.
- */
-BTRFS_SETGET_STACK_FUNCS(backup_block_group_root, struct btrfs_root_backup,
- extent_root, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_gen, struct btrfs_root_backup,
- extent_root_gen, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_level,
- struct btrfs_root_backup, extent_root_level, 8);
-
-/* struct btrfs_balance_item */
-BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
-
-static inline void btrfs_balance_data(const struct extent_buffer *eb,
- const struct btrfs_balance_item *bi,
- struct btrfs_disk_balance_args *ba)
-{
- read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
-}
-
-static inline void btrfs_set_balance_data(struct extent_buffer *eb,
- struct btrfs_balance_item *bi,
- const struct btrfs_disk_balance_args *ba)
-{
- write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
-}
-
-static inline void btrfs_balance_meta(const struct extent_buffer *eb,
- const struct btrfs_balance_item *bi,
- struct btrfs_disk_balance_args *ba)
-{
- read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
-}
-
-static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
- struct btrfs_balance_item *bi,
- const struct btrfs_disk_balance_args *ba)
-{
- write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
-}
-
-static inline void btrfs_balance_sys(const struct extent_buffer *eb,
- const struct btrfs_balance_item *bi,
- struct btrfs_disk_balance_args *ba)
-{
- read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
-}
-
-static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
- struct btrfs_balance_item *bi,
- const struct btrfs_disk_balance_args *ba)
-{
- write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
-}
-
-static inline void
-btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
- const struct btrfs_disk_balance_args *disk)
-{
- memset(cpu, 0, sizeof(*cpu));
-
- cpu->profiles = le64_to_cpu(disk->profiles);
- cpu->usage = le64_to_cpu(disk->usage);
- cpu->devid = le64_to_cpu(disk->devid);
- cpu->pstart = le64_to_cpu(disk->pstart);
- cpu->pend = le64_to_cpu(disk->pend);
- cpu->vstart = le64_to_cpu(disk->vstart);
- cpu->vend = le64_to_cpu(disk->vend);
- cpu->target = le64_to_cpu(disk->target);
- cpu->flags = le64_to_cpu(disk->flags);
- cpu->limit = le64_to_cpu(disk->limit);
- cpu->stripes_min = le32_to_cpu(disk->stripes_min);
- cpu->stripes_max = le32_to_cpu(disk->stripes_max);
-}
-
-static inline void
-btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
- const struct btrfs_balance_args *cpu)
-{
- memset(disk, 0, sizeof(*disk));
-
- disk->profiles = cpu_to_le64(cpu->profiles);
- disk->usage = cpu_to_le64(cpu->usage);
- disk->devid = cpu_to_le64(cpu->devid);
- disk->pstart = cpu_to_le64(cpu->pstart);
- disk->pend = cpu_to_le64(cpu->pend);
- disk->vstart = cpu_to_le64(cpu->vstart);
- disk->vend = cpu_to_le64(cpu->vend);
- disk->target = cpu_to_le64(cpu->target);
- disk->flags = cpu_to_le64(cpu->flags);
- disk->limit = cpu_to_le64(cpu->limit);
- disk->stripes_min = cpu_to_le32(cpu->stripes_min);
- disk->stripes_max = cpu_to_le32(cpu->stripes_max);
-}
-
-/* struct btrfs_super_block */
-BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
-BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
-BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
- generation, 64);
-BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
-BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
- struct btrfs_super_block, sys_chunk_array_size, 32);
-BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation,
- struct btrfs_super_block, chunk_root_generation, 64);
-BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
- root_level, 8);
-BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
- chunk_root, 64);
-BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
- chunk_root_level, 8);
-BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
- log_root, 64);
-BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
- log_root_level, 8);
-BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
- total_bytes, 64);
-BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
- bytes_used, 64);
-BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
- sectorsize, 32);
-BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
- nodesize, 32);
-BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
- stripesize, 32);
-BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
- root_dir_objectid, 64);
-BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
- num_devices, 64);
-BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
- compat_flags, 64);
-BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
- compat_ro_flags, 64);
-BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
- incompat_flags, 64);
-BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
- csum_type, 16);
-BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
- cache_generation, 64);
-BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64);
-BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
- uuid_tree_generation, 64);
-BTRFS_SETGET_STACK_FUNCS(super_block_group_root, struct btrfs_super_block,
- block_group_root, 64);
-BTRFS_SETGET_STACK_FUNCS(super_block_group_root_generation,
- struct btrfs_super_block,
- block_group_root_generation, 64);
-BTRFS_SETGET_STACK_FUNCS(super_block_group_root_level, struct btrfs_super_block,
- block_group_root_level, 8);
-
-int btrfs_super_csum_size(const struct btrfs_super_block *s);
-const char *btrfs_super_csum_name(u16 csum_type);
-const char *btrfs_super_csum_driver(u16 csum_type);
-size_t __attribute_const__ btrfs_get_num_csums(void);
-
-
-/*
- * The leaf data grows from end-to-front in the node.
- * this returns the address of the start of the last item,
- * which is the stop of the leaf data stack
- */
-static inline unsigned int leaf_data_end(const struct extent_buffer *leaf)
-{
- u32 nr = btrfs_header_nritems(leaf);
-
- if (nr == 0)
- return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
- return btrfs_item_offset(leaf, nr - 1);
-}
-
-/* struct btrfs_file_extent_item */
-BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item,
- type, 8);
-BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr,
- struct btrfs_file_extent_item, disk_bytenr, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset,
- struct btrfs_file_extent_item, offset, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation,
- struct btrfs_file_extent_item, generation, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes,
- struct btrfs_file_extent_item, num_bytes, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes,
- struct btrfs_file_extent_item, ram_bytes, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes,
- struct btrfs_file_extent_item, disk_num_bytes, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
- struct btrfs_file_extent_item, compression, 8);
-
-static inline unsigned long
-btrfs_file_extent_inline_start(const struct btrfs_file_extent_item *e)
-{
- return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START;
-}
-
-static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
-{
- return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize;
-}
-
-BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
-BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
- disk_bytenr, 64);
-BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
- generation, 64);
-BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item,
- disk_num_bytes, 64);
-BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
- offset, 64);
-BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
- num_bytes, 64);
-BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
- ram_bytes, 64);
-BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
- compression, 8);
-BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
- encryption, 8);
-BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
- other_encoding, 16);
-
-/*
- * this returns the number of bytes used by the item on disk, minus the
- * size of any extent headers. If a file is compressed on disk, this is
- * the compressed size
- */
-static inline u32 btrfs_file_extent_inline_item_len(
- const struct extent_buffer *eb,
- int nr)
-{
- return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
-}
-
-/* btrfs_qgroup_status_item */
-BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item,
- generation, 64);
-BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item,
- version, 64);
-BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
- flags, 64);
-BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item,
- rescan, 64);
-
-/* btrfs_qgroup_info_item */
-BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
- generation, 64);
-BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64);
-BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item,
- rfer_cmpr, 64);
-BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64);
-BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item,
- excl_cmpr, 64);
-
-BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation,
- struct btrfs_qgroup_info_item, generation, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item,
- rfer, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr,
- struct btrfs_qgroup_info_item, rfer_cmpr, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item,
- excl, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr,
- struct btrfs_qgroup_info_item, excl_cmpr, 64);
-
-/* btrfs_qgroup_limit_item */
-BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item,
- flags, 64);
-BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item,
- max_rfer, 64);
-BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item,
- max_excl, 64);
-BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
- rsv_rfer, 64);
-BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
- rsv_excl, 64);
-
-/* btrfs_dev_replace_item */
-BTRFS_SETGET_FUNCS(dev_replace_src_devid,
- struct btrfs_dev_replace_item, src_devid, 64);
-BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode,
- struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode,
- 64);
-BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item,
- replace_state, 64);
-BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item,
- time_started, 64);
-BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item,
- time_stopped, 64);
-BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item,
- num_write_errors, 64);
-BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors,
- struct btrfs_dev_replace_item, num_uncorrectable_read_errors,
- 64);
-BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item,
- cursor_left, 64);
-BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item,
- cursor_right, 64);
-
-BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid,
- struct btrfs_dev_replace_item, src_devid, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode,
- struct btrfs_dev_replace_item,
- cont_reading_from_srcdev_mode, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state,
- struct btrfs_dev_replace_item, replace_state, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started,
- struct btrfs_dev_replace_item, time_started, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped,
- struct btrfs_dev_replace_item, time_stopped, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors,
- struct btrfs_dev_replace_item, num_write_errors, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors,
- struct btrfs_dev_replace_item,
- num_uncorrectable_read_errors, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left,
- struct btrfs_dev_replace_item, cursor_left, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
- struct btrfs_dev_replace_item, cursor_right, 64);
-
-/* helper function to cast into the data area of the leaf. */
-#define btrfs_item_ptr(leaf, slot, type) \
- ((type *)(BTRFS_LEAF_DATA_OFFSET + \
- btrfs_item_offset(leaf, slot)))
-
-#define btrfs_item_ptr_offset(leaf, slot) \
- ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \
- btrfs_item_offset(leaf, slot)))
-
static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length)
{
return crc32c(crc, address, length);
@@ -2747,202 +498,15 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
return mapping_gfp_constraint(mapping, ~__GFP_FS);
}
-/* extent-tree.c */
-
-enum btrfs_inline_ref_type {
- BTRFS_REF_TYPE_INVALID,
- BTRFS_REF_TYPE_BLOCK,
- BTRFS_REF_TYPE_DATA,
- BTRFS_REF_TYPE_ANY,
-};
-
-int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
- struct btrfs_extent_inline_ref *iref,
- enum btrfs_inline_ref_type is_data);
-u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
-
-static inline u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums,
- u64 offset)
-{
- u64 offset_in_sectors = offset >> fs_info->sectorsize_bits;
-
- return csums + offset_in_sectors * fs_info->csum_size;
-}
-
-/*
- * Take the number of bytes to be checksummed and figure out how many leaves
- * it would require to store the csums for that many bytes.
- */
-static inline u64 btrfs_csum_bytes_to_leaves(
- const struct btrfs_fs_info *fs_info, u64 csum_bytes)
-{
- const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits;
-
- return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf);
-}
-
-/*
- * Use this if we would be adding new items, as we could split nodes as we cow
- * down the tree.
- */
-static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info,
- unsigned num_items)
-{
- return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
-}
-
-/*
- * Doing a truncate or a modification won't result in new nodes or leaves, just
- * what we need for COW.
- */
-static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info,
- unsigned num_items)
-{
- return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
-}
-
-int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
- u64 start, u64 num_bytes);
-void btrfs_free_excluded_extents(struct btrfs_block_group *cache);
-int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- unsigned long count);
-void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_delayed_ref_head *head);
-int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
-int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 offset, int metadata, u64 *refs, u64 *flags);
-int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
- int reserved);
-int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes);
-int btrfs_exclude_logged_extents(struct extent_buffer *eb);
-int btrfs_cross_ref_exist(struct btrfs_root *root,
- u64 objectid, u64 offset, u64 bytenr, bool strict,
- struct btrfs_path *path);
-struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 parent, u64 root_objectid,
- const struct btrfs_disk_key *key,
- int level, u64 hint,
- u64 empty_size,
- enum btrfs_lock_nesting nest);
-void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
- u64 root_id,
- struct extent_buffer *buf,
- u64 parent, int last_ref);
-int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 owner,
- u64 offset, u64 ram_bytes,
- struct btrfs_key *ins);
-int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
- u64 root_objectid, u64 owner, u64 offset,
- struct btrfs_key *ins);
-int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes,
- u64 min_alloc_size, u64 empty_size, u64 hint_byte,
- struct btrfs_key *ins, int is_data, int delalloc);
-int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref);
-int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref);
-int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
- struct extent_buffer *eb, u64 flags, int level);
-int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
-
-int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
- u64 start, u64 len, int delalloc);
-int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
- u64 len);
-int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
-int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
- struct btrfs_ref *generic_ref);
-
-void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
-
-/*
- * Different levels for to flush space when doing space reservations.
- *
- * The higher the level, the more methods we try to reclaim space.
- */
-enum btrfs_reserve_flush_enum {
- /* If we are in the transaction, we can't flush anything.*/
- BTRFS_RESERVE_NO_FLUSH,
-
- /*
- * Flush space by:
- * - Running delayed inode items
- * - Allocating a new chunk
- */
- BTRFS_RESERVE_FLUSH_LIMIT,
-
- /*
- * Flush space by:
- * - Running delayed inode items
- * - Running delayed refs
- * - Running delalloc and waiting for ordered extents
- * - Allocating a new chunk
- */
- BTRFS_RESERVE_FLUSH_EVICT,
-
- /*
- * Flush space by above mentioned methods and by:
- * - Running delayed iputs
- * - Committing transaction
- *
- * Can be interrupted by a fatal signal.
- */
- BTRFS_RESERVE_FLUSH_DATA,
- BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE,
- BTRFS_RESERVE_FLUSH_ALL,
-
- /*
- * Pretty much the same as FLUSH_ALL, but can also steal space from
- * global rsv.
- *
- * Can be interrupted by a fatal signal.
- */
- BTRFS_RESERVE_FLUSH_ALL_STEAL,
-};
-
-enum btrfs_flush_state {
- FLUSH_DELAYED_ITEMS_NR = 1,
- FLUSH_DELAYED_ITEMS = 2,
- FLUSH_DELAYED_REFS_NR = 3,
- FLUSH_DELAYED_REFS = 4,
- FLUSH_DELALLOC = 5,
- FLUSH_DELALLOC_WAIT = 6,
- FLUSH_DELALLOC_FULL = 7,
- ALLOC_CHUNK = 8,
- ALLOC_CHUNK_FORCE = 9,
- RUN_DELAYED_IPUTS = 10,
- COMMIT_TRANS = 11,
-};
-
-int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
- struct btrfs_block_rsv *rsv,
- int nitems, bool use_global_rsv);
-void btrfs_subvolume_release_metadata(struct btrfs_root *root,
- struct btrfs_block_rsv *rsv);
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
-
-int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
- u64 disk_num_bytes, bool noflush);
-u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 start, u64 end);
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 num_bytes, u64 *actual_bytes);
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
-int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
-int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
-int btrfs_start_write_no_snapshotting(struct btrfs_root *root);
-void btrfs_end_write_no_snapshotting(struct btrfs_root *root);
-void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
-
/* ctree.c */
+int __init btrfs_ctree_init(void);
+void __cold btrfs_ctree_exit(void);
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
int *slot);
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2);
@@ -3103,14 +667,7 @@ int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
(path)->slots[0]++ \
)
-static inline int btrfs_next_old_item(struct btrfs_root *root,
- struct btrfs_path *p, u64 time_seq)
-{
- ++p->slots[0];
- if (p->slots[0] >= btrfs_header_nritems(p->nodes[0]))
- return btrfs_next_old_leaf(root, p, time_seq);
- return 0;
-}
+int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq);
/*
* Search the tree again to find a leaf with greater keys.
@@ -3128,878 +685,6 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
return btrfs_next_old_item(root, p, 0);
}
int btrfs_leaf_free_space(struct extent_buffer *leaf);
-int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
- int for_reloc);
-int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *node,
- struct extent_buffer *parent);
-static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
-{
- /*
- * Do it this way so we only ever do one test_bit in the normal case.
- */
- if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) {
- if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags))
- return 2;
- return 1;
- }
- return 0;
-}
-
-/*
- * If we remount the fs to be R/O or umount the fs, the cleaner needn't do
- * anything except sleeping. This function is used to check the status of
- * the fs.
- * We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount,
- * since setting and checking for SB_RDONLY in the superblock's flags is not
- * atomic.
- */
-static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info)
-{
- return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) ||
- btrfs_fs_closing(fs_info);
-}
-
-static inline void btrfs_set_sb_rdonly(struct super_block *sb)
-{
- sb->s_flags |= SB_RDONLY;
- set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state);
-}
-
-static inline void btrfs_clear_sb_rdonly(struct super_block *sb)
-{
- sb->s_flags &= ~SB_RDONLY;
- clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state);
-}
-
-/* root-item.c */
-int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
- u64 ref_id, u64 dirid, u64 sequence, const char *name,
- int name_len);
-int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
- u64 ref_id, u64 dirid, u64 *sequence, const char *name,
- int name_len);
-int btrfs_del_root(struct btrfs_trans_handle *trans,
- const struct btrfs_key *key);
-int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- const struct btrfs_key *key,
- struct btrfs_root_item *item);
-int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_key *key,
- struct btrfs_root_item *item);
-int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key,
- struct btrfs_path *path, struct btrfs_root_item *root_item,
- struct btrfs_key *root_key);
-int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info);
-void btrfs_set_root_node(struct btrfs_root_item *item,
- struct extent_buffer *node);
-void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
-void btrfs_update_root_times(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
-
-/* uuid-tree.c */
-int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
- u64 subid);
-int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
- u64 subid);
-int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info);
-
-/* dir-item.c */
-int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
- const char *name, int name_len);
-int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
- int name_len, struct btrfs_inode *dir,
- struct btrfs_key *location, u8 type, u64 index);
-struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, u64 dir,
- const char *name, int name_len,
- int mod);
-struct btrfs_dir_item *
-btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, u64 dir,
- u64 index, const char *name, int name_len,
- int mod);
-struct btrfs_dir_item *
-btrfs_search_dir_index_item(struct btrfs_root *root,
- struct btrfs_path *path, u64 dirid,
- const char *name, int name_len);
-int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_dir_item *di);
-int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, u64 objectid,
- const char *name, u16 name_len,
- const void *data, u16 data_len);
-struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, u64 dir,
- const char *name, u16 name_len,
- int mod);
-struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
- const char *name,
- int name_len);
-
-/* orphan.c */
-int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 offset);
-int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 offset);
-int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
-
-/* file-item.c */
-int btrfs_del_csums(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr, u64 len);
-blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst);
-int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 objectid, u64 pos,
- u64 disk_offset, u64 disk_num_bytes,
- u64 num_bytes, u64 offset, u64 ram_bytes,
- u8 compression, u8 encryption, u16 other_encoding);
-int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, u64 objectid,
- u64 bytenr, int mod);
-int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_ordered_sum *sums);
-blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
- u64 offset, bool one_ordered);
-int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
- struct list_head *list, int search_commit);
-void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
- const struct btrfs_path *path,
- struct btrfs_file_extent_item *fi,
- const bool new_inline,
- struct extent_map *em);
-int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
- u64 len);
-int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
- u64 len);
-void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size);
-u64 btrfs_file_extent_end(const struct btrfs_path *path);
-
-/* inode.c */
-void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num);
-void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
- int mirror_num, enum btrfs_compression_type compress_type);
-int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
- u32 pgoff, u8 *csum, const u8 * const csum_expected);
-int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
- u32 bio_offset, struct page *page, u32 pgoff);
-unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
- u32 bio_offset, struct page *page,
- u64 start, u64 end);
-int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
- u32 bio_offset, struct page *page, u32 pgoff);
-struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
- u64 start, u64 len);
-noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
- u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes, bool strict);
-
-void __btrfs_del_delalloc_inode(struct btrfs_root *root,
- struct btrfs_inode *inode);
-struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
-int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index);
-int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
- struct btrfs_inode *dir, struct btrfs_inode *inode,
- const char *name, int name_len);
-int btrfs_add_link(struct btrfs_trans_handle *trans,
- struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
- const char *name, int name_len, int add_backref, u64 index);
-int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry);
-int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
- int front);
-
-int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context);
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
- bool in_reclaim_context);
-int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
- unsigned int extra_bits,
- struct extent_state **cached_state);
-struct btrfs_new_inode_args {
- /* Input */
- struct inode *dir;
- struct dentry *dentry;
- struct inode *inode;
- bool orphan;
- bool subvol;
-
- /*
- * Output from btrfs_new_inode_prepare(), input to
- * btrfs_create_new_inode().
- */
- struct posix_acl *default_acl;
- struct posix_acl *acl;
-};
-int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
- unsigned int *trans_num_items);
-int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
- struct btrfs_new_inode_args *args);
-void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args);
-struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns,
- struct inode *dir);
- void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
- u32 bits);
-void btrfs_clear_delalloc_extent(struct inode *inode,
- struct extent_state *state, u32 bits);
-void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
- struct extent_state *other);
-void btrfs_split_delalloc_extent(struct inode *inode,
- struct extent_state *orig, u64 split);
-void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
-vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
-void btrfs_evict_inode(struct inode *inode);
-int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
-struct inode *btrfs_alloc_inode(struct super_block *sb);
-void btrfs_destroy_inode(struct inode *inode);
-void btrfs_free_inode(struct inode *inode);
-int btrfs_drop_inode(struct inode *inode);
-int __init btrfs_init_cachep(void);
-void __cold btrfs_destroy_cachep(void);
-struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
- struct btrfs_root *root, struct btrfs_path *path);
-struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root);
-struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 end);
-int btrfs_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode);
-int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode);
-int btrfs_orphan_add(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode);
-int btrfs_orphan_cleanup(struct btrfs_root *root);
-int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size);
-void btrfs_add_delayed_iput(struct inode *inode);
-void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
-int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info);
-int btrfs_prealloc_file_range(struct inode *inode, int mode,
- u64 start, u64 num_bytes, u64 min_size,
- loff_t actual_len, u64 *alloc_hint);
-int btrfs_prealloc_file_range_trans(struct inode *inode,
- struct btrfs_trans_handle *trans, int mode,
- u64 start, u64 num_bytes, u64 min_size,
- loff_t actual_len, u64 *alloc_hint);
-int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
- u64 start, u64 end, int *page_started, unsigned long *nr_written,
- struct writeback_control *wbc);
-int btrfs_writepage_cow_fixup(struct page *page);
-void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
- struct page *page, u64 start,
- u64 end, bool uptodate);
-int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
- int compress_type);
-int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
- u64 file_offset, u64 disk_bytenr,
- u64 disk_io_size,
- struct page **pages);
-ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
- struct btrfs_ioctl_encoded_io_args *encoded);
-ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
- const struct btrfs_ioctl_encoded_io_args *encoded);
-
-ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before);
-
-extern const struct dentry_operations btrfs_dentry_operations;
-
-/* Inode locking type flags, by default the exclusive lock is taken */
-#define BTRFS_ILOCK_SHARED (1U << 0)
-#define BTRFS_ILOCK_TRY (1U << 1)
-#define BTRFS_ILOCK_MMAP (1U << 2)
-
-int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags);
-void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags);
-void btrfs_update_inode_bytes(struct btrfs_inode *inode,
- const u64 add_bytes,
- const u64 del_bytes);
-void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end);
-
-/* ioctl.c */
-long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
-long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
-int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
-int btrfs_fileattr_set(struct user_namespace *mnt_userns,
- struct dentry *dentry, struct fileattr *fa);
-int btrfs_ioctl_get_supported_features(void __user *arg);
-void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
-int __pure btrfs_is_empty_uuid(u8 *uuid);
-int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
- struct btrfs_ioctl_defrag_range_args *range,
- u64 newer_than, unsigned long max_to_defrag);
-void btrfs_get_block_group_info(struct list_head *groups_list,
- struct btrfs_ioctl_space_info *space);
-void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
- struct btrfs_ioctl_balance_args *bargs);
-bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
- enum btrfs_exclusive_operation type);
-bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
- enum btrfs_exclusive_operation type);
-void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
-void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
-void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
- enum btrfs_exclusive_operation op);
-
-
-/* file.c */
-int __init btrfs_auto_defrag_init(void);
-void __cold btrfs_auto_defrag_exit(void);
-int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, u32 extent_thresh);
-int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
-void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
-int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
-void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
- int skip_pinned);
-extern const struct file_operations btrfs_file_operations;
-int btrfs_drop_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
- struct btrfs_drop_extents_args *args);
-int btrfs_replace_file_extents(struct btrfs_inode *inode,
- struct btrfs_path *path, const u64 start,
- const u64 end,
- struct btrfs_replace_extent_info *extent_info,
- struct btrfs_trans_handle **trans_out);
-int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, u64 start, u64 end);
-ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
- const struct btrfs_ioctl_encoded_io_args *encoded);
-int btrfs_release_file(struct inode *inode, struct file *file);
-int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
- size_t num_pages, loff_t pos, size_t write_bytes,
- struct extent_state **cached, bool noreserve);
-int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
-int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
- size_t *write_bytes);
-void btrfs_check_nocow_unlock(struct btrfs_inode *inode);
-
-/* tree-defrag.c */
-int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
-
-/* super.c */
-int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
- unsigned long new_flags);
-int btrfs_sync_fs(struct super_block *sb, int wait);
-char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
- u64 subvol_objectid);
-
-static inline __printf(2, 3) __cold
-void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
-{
-}
-
-#ifdef CONFIG_PRINTK_INDEX
-
-#define btrfs_printk(fs_info, fmt, args...) \
-do { \
- printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \
- _btrfs_printk(fs_info, fmt, ##args); \
-} while (0)
-
-__printf(2, 3)
-__cold
-void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
-
-#elif defined(CONFIG_PRINTK)
-
-#define btrfs_printk(fs_info, fmt, args...) \
- _btrfs_printk(fs_info, fmt, ##args)
-
-__printf(2, 3)
-__cold
-void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
-
-#else
-
-#define btrfs_printk(fs_info, fmt, args...) \
- btrfs_no_printk(fs_info, fmt, ##args)
-#endif
-
-#define btrfs_emerg(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_EMERG fmt, ##args)
-#define btrfs_alert(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_ALERT fmt, ##args)
-#define btrfs_crit(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_CRIT fmt, ##args)
-#define btrfs_err(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_ERR fmt, ##args)
-#define btrfs_warn(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_WARNING fmt, ##args)
-#define btrfs_notice(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_NOTICE fmt, ##args)
-#define btrfs_info(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_INFO fmt, ##args)
-
-/*
- * Wrappers that use printk_in_rcu
- */
-#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args)
-#define btrfs_alert_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args)
-#define btrfs_crit_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args)
-#define btrfs_err_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args)
-#define btrfs_warn_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args)
-#define btrfs_notice_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
-#define btrfs_info_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args)
-
-/*
- * Wrappers that use a ratelimited printk_in_rcu
- */
-#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args)
-#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args)
-#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args)
-#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args)
-#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args)
-#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
-#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
-
-/*
- * Wrappers that use a ratelimited printk
- */
-#define btrfs_emerg_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args)
-#define btrfs_alert_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args)
-#define btrfs_crit_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args)
-#define btrfs_err_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args)
-#define btrfs_warn_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args)
-#define btrfs_notice_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args)
-#define btrfs_info_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args)
-
-#if defined(CONFIG_DYNAMIC_DEBUG)
-#define btrfs_debug(fs_info, fmt, args...) \
- _dynamic_func_call_no_desc(fmt, btrfs_printk, \
- fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
- _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \
- fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
- _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \
- fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl(fs_info, fmt, args...) \
- _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \
- fs_info, KERN_DEBUG fmt, ##args)
-#elif defined(DEBUG)
-#define btrfs_debug(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args)
-#else
-#define btrfs_debug(fs_info, fmt, args...) \
- btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
- btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl(fs_info, fmt, args...) \
- btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
-#endif
-
-#define btrfs_printk_in_rcu(fs_info, fmt, args...) \
-do { \
- rcu_read_lock(); \
- btrfs_printk(fs_info, fmt, ##args); \
- rcu_read_unlock(); \
-} while (0)
-
-#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \
-do { \
- rcu_read_lock(); \
- btrfs_no_printk(fs_info, fmt, ##args); \
- rcu_read_unlock(); \
-} while (0)
-
-#define btrfs_printk_ratelimited(fs_info, fmt, args...) \
-do { \
- static DEFINE_RATELIMIT_STATE(_rs, \
- DEFAULT_RATELIMIT_INTERVAL, \
- DEFAULT_RATELIMIT_BURST); \
- if (__ratelimit(&_rs)) \
- btrfs_printk(fs_info, fmt, ##args); \
-} while (0)
-
-#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \
-do { \
- rcu_read_lock(); \
- btrfs_printk_ratelimited(fs_info, fmt, ##args); \
- rcu_read_unlock(); \
-} while (0)
-
-#ifdef CONFIG_BTRFS_ASSERT
-__cold __noreturn
-static inline void assertfail(const char *expr, const char *file, int line)
-{
- pr_err("assertion failed: %s, in %s:%d\n", expr, file, line);
- BUG();
-}
-
-#define ASSERT(expr) \
- (likely(expr) ? (void)0 : assertfail(#expr, __FILE__, __LINE__))
-
-#else
-static inline void assertfail(const char *expr, const char* file, int line) { }
-#define ASSERT(expr) (void)(expr)
-#endif
-
-#if BITS_PER_LONG == 32
-#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT)
-/*
- * The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical
- * addresses of extents.
- *
- * For 4K page size it's about 10T, for 64K it's 160T.
- */
-#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8)
-void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info);
-void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info);
-#endif
-
-/*
- * Get the correct offset inside the page of extent buffer.
- *
- * @eb: target extent buffer
- * @start: offset inside the extent buffer
- *
- * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases.
- */
-static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb,
- unsigned long offset)
-{
- /*
- * For sectorsize == PAGE_SIZE case, eb->start will always be aligned
- * to PAGE_SIZE, thus adding it won't cause any difference.
- *
- * For sectorsize < PAGE_SIZE, we must only read the data that belongs
- * to the eb, thus we have to take the eb->start into consideration.
- */
- return offset_in_page(offset + eb->start);
-}
-
-static inline unsigned long get_eb_page_index(unsigned long offset)
-{
- /*
- * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough.
- *
- * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE,
- * and have ensured that all tree blocks are contained in one page,
- * thus we always get index == 0.
- */
- return offset >> PAGE_SHIFT;
-}
-
-/*
- * Use that for functions that are conditionally exported for sanity tests but
- * otherwise static
- */
-#ifndef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-#define EXPORT_FOR_TESTS static
-#else
-#define EXPORT_FOR_TESTS
-#endif
-
-__cold
-static inline void btrfs_print_v0_err(struct btrfs_fs_info *fs_info)
-{
- btrfs_err(fs_info,
-"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel");
-}
-
-__printf(5, 6)
-__cold
-void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...);
-
-const char * __attribute_const__ btrfs_decode_error(int errno);
-
-__cold
-void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
- const char *function,
- unsigned int line, int errno);
-
-/*
- * Call btrfs_abort_transaction as early as possible when an error condition is
- * detected, that way the exact line number is reported.
- */
-#define btrfs_abort_transaction(trans, errno) \
-do { \
- /* Report first abort since mount */ \
- if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
- &((trans)->fs_info->fs_state))) { \
- if ((errno) != -EIO && (errno) != -EROFS) { \
- WARN(1, KERN_DEBUG \
- "BTRFS: Transaction aborted (error %d)\n", \
- (errno)); \
- } else { \
- btrfs_debug((trans)->fs_info, \
- "Transaction aborted (error %d)", \
- (errno)); \
- } \
- } \
- __btrfs_abort_transaction((trans), __func__, \
- __LINE__, (errno)); \
-} while (0)
-
-#ifdef CONFIG_PRINTK_INDEX
-
-#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
-do { \
- printk_index_subsys_emit( \
- "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \
- KERN_CRIT, fmt); \
- __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
- (errno), fmt, ##args); \
-} while (0)
-
-#else
-
-#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
- __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
- (errno), fmt, ##args)
-
-#endif
-
-#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \
- &(fs_info)->fs_state)))
-#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \
- (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \
- &(fs_info)->fs_state)))
-
-__printf(5, 6)
-__cold
-void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...);
-/*
- * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
- * will panic(). Otherwise we BUG() here.
- */
-#define btrfs_panic(fs_info, errno, fmt, args...) \
-do { \
- __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
- BUG(); \
-} while (0)
-
-
-/* compatibility and incompatibility defines */
-
-#define btrfs_set_fs_incompat(__fs_info, opt) \
- __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \
- #opt)
-
-static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
- u64 flag, const char* name)
-{
- struct btrfs_super_block *disk_super;
- u64 features;
-
- disk_super = fs_info->super_copy;
- features = btrfs_super_incompat_flags(disk_super);
- if (!(features & flag)) {
- spin_lock(&fs_info->super_lock);
- features = btrfs_super_incompat_flags(disk_super);
- if (!(features & flag)) {
- features |= flag;
- btrfs_set_super_incompat_flags(disk_super, features);
- btrfs_info(fs_info,
- "setting incompat feature flag for %s (0x%llx)",
- name, flag);
- }
- spin_unlock(&fs_info->super_lock);
- }
-}
-
-#define btrfs_clear_fs_incompat(__fs_info, opt) \
- __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \
- #opt)
-
-static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info,
- u64 flag, const char* name)
-{
- struct btrfs_super_block *disk_super;
- u64 features;
-
- disk_super = fs_info->super_copy;
- features = btrfs_super_incompat_flags(disk_super);
- if (features & flag) {
- spin_lock(&fs_info->super_lock);
- features = btrfs_super_incompat_flags(disk_super);
- if (features & flag) {
- features &= ~flag;
- btrfs_set_super_incompat_flags(disk_super, features);
- btrfs_info(fs_info,
- "clearing incompat feature flag for %s (0x%llx)",
- name, flag);
- }
- spin_unlock(&fs_info->super_lock);
- }
-}
-
-#define btrfs_fs_incompat(fs_info, opt) \
- __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
-
-static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
-{
- struct btrfs_super_block *disk_super;
- disk_super = fs_info->super_copy;
- return !!(btrfs_super_incompat_flags(disk_super) & flag);
-}
-
-#define btrfs_set_fs_compat_ro(__fs_info, opt) \
- __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \
- #opt)
-
-static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info,
- u64 flag, const char *name)
-{
- struct btrfs_super_block *disk_super;
- u64 features;
-
- disk_super = fs_info->super_copy;
- features = btrfs_super_compat_ro_flags(disk_super);
- if (!(features & flag)) {
- spin_lock(&fs_info->super_lock);
- features = btrfs_super_compat_ro_flags(disk_super);
- if (!(features & flag)) {
- features |= flag;
- btrfs_set_super_compat_ro_flags(disk_super, features);
- btrfs_info(fs_info,
- "setting compat-ro feature flag for %s (0x%llx)",
- name, flag);
- }
- spin_unlock(&fs_info->super_lock);
- }
-}
-
-#define btrfs_clear_fs_compat_ro(__fs_info, opt) \
- __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \
- #opt)
-
-static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info,
- u64 flag, const char *name)
-{
- struct btrfs_super_block *disk_super;
- u64 features;
-
- disk_super = fs_info->super_copy;
- features = btrfs_super_compat_ro_flags(disk_super);
- if (features & flag) {
- spin_lock(&fs_info->super_lock);
- features = btrfs_super_compat_ro_flags(disk_super);
- if (features & flag) {
- features &= ~flag;
- btrfs_set_super_compat_ro_flags(disk_super, features);
- btrfs_info(fs_info,
- "clearing compat-ro feature flag for %s (0x%llx)",
- name, flag);
- }
- spin_unlock(&fs_info->super_lock);
- }
-}
-
-#define btrfs_fs_compat_ro(fs_info, opt) \
- __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
-
-static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
-{
- struct btrfs_super_block *disk_super;
- disk_super = fs_info->super_copy;
- return !!(btrfs_super_compat_ro_flags(disk_super) & flag);
-}
-
-/* acl.c */
-#ifdef CONFIG_BTRFS_FS_POSIX_ACL
-struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu);
-int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
- struct posix_acl *acl, int type);
-int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
- struct posix_acl *acl, int type);
-#else
-#define btrfs_get_acl NULL
-#define btrfs_set_acl NULL
-static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans,
- struct inode *inode, struct posix_acl *acl,
- int type)
-{
- return -EOPNOTSUPP;
-}
-#endif
-
-/* relocation.c */
-int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start);
-int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
-int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
-int btrfs_recover_relocation(struct btrfs_fs_info *fs_info);
-int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len);
-int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *buf,
- struct extent_buffer *cow);
-void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
- u64 *bytes_to_reserve);
-int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending);
-int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info);
-struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info,
- u64 bytenr);
-int btrfs_should_ignore_reloc_root(struct btrfs_root *root);
-
-/* scrub.c */
-int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
- u64 end, struct btrfs_scrub_progress *progress,
- int readonly, int is_dev_replace);
-void btrfs_scrub_pause(struct btrfs_fs_info *fs_info);
-void btrfs_scrub_continue(struct btrfs_fs_info *fs_info);
-int btrfs_scrub_cancel(struct btrfs_fs_info *info);
-int btrfs_scrub_cancel_dev(struct btrfs_device *dev);
-int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
- struct btrfs_scrub_progress *progress);
-static inline void btrfs_init_full_stripe_locks_tree(
- struct btrfs_full_stripe_locks_tree *locks_root)
-{
- locks_root->root = RB_ROOT;
- mutex_init(&locks_root->lock);
-}
-
-/* dev-replace.c */
-void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
-void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
-void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount);
-
-static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
-{
- btrfs_bio_counter_sub(fs_info, 1);
-}
static inline int is_fstree(u64 rootid)
{
@@ -4010,72 +695,16 @@ static inline int is_fstree(u64 rootid)
return 0;
}
-static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
-{
- return signal_pending(current);
-}
-
-/* verity.c */
-#ifdef CONFIG_FS_VERITY
-
-extern const struct fsverity_operations btrfs_verityops;
-int btrfs_drop_verity_items(struct btrfs_inode *inode);
-
-BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item,
- encryption, 8);
-BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item,
- size, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption,
- struct btrfs_verity_descriptor_item, encryption, 8);
-BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size,
- struct btrfs_verity_descriptor_item, size, 64);
-
-#else
-
-static inline int btrfs_drop_verity_items(struct btrfs_inode *inode)
-{
- return 0;
-}
-
-#endif
-
-/* Sanity test specific functions */
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-void btrfs_test_destroy_inode(struct inode *inode);
-static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
-{
- return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
-}
-#else
-static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
-{
- return 0;
-}
-#endif
-
-static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
-{
- return fs_info->zone_size > 0;
-}
-
-/*
- * Count how many fs_info->max_extent_size cover the @size
- */
-static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size)
-{
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (!fs_info)
- return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
-#endif
-
- return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size);
-}
-
static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
{
return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID;
}
+int btrfs_super_csum_size(const struct btrfs_super_block *s);
+const char *btrfs_super_csum_name(u16 csum_type);
+const char *btrfs_super_csum_driver(u16 csum_type);
+size_t __attribute_const__ btrfs_get_num_csums(void);
+
/*
* We use page status Private2 to indicate there is an ordered extent with
* unfinished IO.
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
new file mode 100644
index 000000000000..0a3c261b69c9
--- /dev/null
+++ b/fs/btrfs/defrag.c
@@ -0,0 +1,1376 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+#include "transaction.h"
+#include "locking.h"
+#include "accessors.h"
+#include "messages.h"
+#include "delalloc-space.h"
+#include "subpage.h"
+#include "defrag.h"
+#include "file-item.h"
+#include "super.h"
+
+static struct kmem_cache *btrfs_inode_defrag_cachep;
+
+/*
+ * When auto defrag is enabled we queue up these defrag structs to remember
+ * which inodes need defragging passes.
+ */
+struct inode_defrag {
+ struct rb_node rb_node;
+ /* Inode number */
+ u64 ino;
+ /*
+ * Transid where the defrag was added, we search for extents newer than
+ * this.
+ */
+ u64 transid;
+
+ /* Root objectid */
+ u64 root;
+
+ /*
+ * The extent size threshold for autodefrag.
+ *
+ * This value is different for compressed/non-compressed extents, thus
+ * needs to be passed from higher layer.
+ * (aka, inode_should_defrag())
+ */
+ u32 extent_thresh;
+};
+
+static int __compare_inode_defrag(struct inode_defrag *defrag1,
+ struct inode_defrag *defrag2)
+{
+ if (defrag1->root > defrag2->root)
+ return 1;
+ else if (defrag1->root < defrag2->root)
+ return -1;
+ else if (defrag1->ino > defrag2->ino)
+ return 1;
+ else if (defrag1->ino < defrag2->ino)
+ return -1;
+ else
+ return 0;
+}
+
+/*
+ * Pop a record for an inode into the defrag tree. The lock must be held
+ * already.
+ *
+ * If you're inserting a record for an older transid than an existing record,
+ * the transid already in the tree is lowered.
+ *
+ * If an existing record is found the defrag item you pass in is freed.
+ */
+static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
+ struct inode_defrag *defrag)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct inode_defrag *entry;
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ int ret;
+
+ p = &fs_info->defrag_inodes.rb_node;
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct inode_defrag, rb_node);
+
+ ret = __compare_inode_defrag(defrag, entry);
+ if (ret < 0)
+ p = &parent->rb_left;
+ else if (ret > 0)
+ p = &parent->rb_right;
+ else {
+ /*
+ * If we're reinserting an entry for an old defrag run,
+ * make sure to lower the transid of our existing
+ * record.
+ */
+ if (defrag->transid < entry->transid)
+ entry->transid = defrag->transid;
+ entry->extent_thresh = min(defrag->extent_thresh,
+ entry->extent_thresh);
+ return -EEXIST;
+ }
+ }
+ set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
+ rb_link_node(&defrag->rb_node, parent, p);
+ rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
+ return 0;
+}
+
+static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
+{
+ if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
+ return 0;
+
+ if (btrfs_fs_closing(fs_info))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Insert a defrag record for this inode if auto defrag is enabled.
+ */
+int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode, u32 extent_thresh)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct inode_defrag *defrag;
+ u64 transid;
+ int ret;
+
+ if (!__need_auto_defrag(fs_info))
+ return 0;
+
+ if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
+ return 0;
+
+ if (trans)
+ transid = trans->transid;
+ else
+ transid = inode->root->last_trans;
+
+ defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
+ if (!defrag)
+ return -ENOMEM;
+
+ defrag->ino = btrfs_ino(inode);
+ defrag->transid = transid;
+ defrag->root = root->root_key.objectid;
+ defrag->extent_thresh = extent_thresh;
+
+ spin_lock(&fs_info->defrag_inodes_lock);
+ if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
+ /*
+ * If we set IN_DEFRAG flag and evict the inode from memory,
+ * and then re-read this inode, this new inode doesn't have
+ * IN_DEFRAG flag. At the case, we may find the existed defrag.
+ */
+ ret = __btrfs_add_inode_defrag(inode, defrag);
+ if (ret)
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ } else {
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ }
+ spin_unlock(&fs_info->defrag_inodes_lock);
+ return 0;
+}
+
+/*
+ * Pick the defragable inode that we want, if it doesn't exist, we will get the
+ * next one.
+ */
+static struct inode_defrag *btrfs_pick_defrag_inode(
+ struct btrfs_fs_info *fs_info, u64 root, u64 ino)
+{
+ struct inode_defrag *entry = NULL;
+ struct inode_defrag tmp;
+ struct rb_node *p;
+ struct rb_node *parent = NULL;
+ int ret;
+
+ tmp.ino = ino;
+ tmp.root = root;
+
+ spin_lock(&fs_info->defrag_inodes_lock);
+ p = fs_info->defrag_inodes.rb_node;
+ while (p) {
+ parent = p;
+ entry = rb_entry(parent, struct inode_defrag, rb_node);
+
+ ret = __compare_inode_defrag(&tmp, entry);
+ if (ret < 0)
+ p = parent->rb_left;
+ else if (ret > 0)
+ p = parent->rb_right;
+ else
+ goto out;
+ }
+
+ if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
+ parent = rb_next(parent);
+ if (parent)
+ entry = rb_entry(parent, struct inode_defrag, rb_node);
+ else
+ entry = NULL;
+ }
+out:
+ if (entry)
+ rb_erase(parent, &fs_info->defrag_inodes);
+ spin_unlock(&fs_info->defrag_inodes_lock);
+ return entry;
+}
+
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+ struct inode_defrag *defrag;
+ struct rb_node *node;
+
+ spin_lock(&fs_info->defrag_inodes_lock);
+ node = rb_first(&fs_info->defrag_inodes);
+ while (node) {
+ rb_erase(node, &fs_info->defrag_inodes);
+ defrag = rb_entry(node, struct inode_defrag, rb_node);
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+
+ cond_resched_lock(&fs_info->defrag_inodes_lock);
+
+ node = rb_first(&fs_info->defrag_inodes);
+ }
+ spin_unlock(&fs_info->defrag_inodes_lock);
+}
+
+#define BTRFS_DEFRAG_BATCH 1024
+
+static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
+ struct inode_defrag *defrag)
+{
+ struct btrfs_root *inode_root;
+ struct inode *inode;
+ struct btrfs_ioctl_defrag_range_args range;
+ int ret = 0;
+ u64 cur = 0;
+
+again:
+ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
+ goto cleanup;
+ if (!__need_auto_defrag(fs_info))
+ goto cleanup;
+
+ /* Get the inode */
+ inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
+ if (IS_ERR(inode_root)) {
+ ret = PTR_ERR(inode_root);
+ goto cleanup;
+ }
+
+ inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
+ btrfs_put_root(inode_root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ goto cleanup;
+ }
+
+ if (cur >= i_size_read(inode)) {
+ iput(inode);
+ goto cleanup;
+ }
+
+ /* Do a chunk of defrag */
+ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
+ memset(&range, 0, sizeof(range));
+ range.len = (u64)-1;
+ range.start = cur;
+ range.extent_thresh = defrag->extent_thresh;
+
+ sb_start_write(fs_info->sb);
+ ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+ BTRFS_DEFRAG_BATCH);
+ sb_end_write(fs_info->sb);
+ iput(inode);
+
+ if (ret < 0)
+ goto cleanup;
+
+ cur = max(cur + fs_info->sectorsize, range.start);
+ goto again;
+
+cleanup:
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ return ret;
+}
+
+/*
+ * Run through the list of inodes in the FS that need defragging.
+ */
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+ struct inode_defrag *defrag;
+ u64 first_ino = 0;
+ u64 root_objectid = 0;
+
+ atomic_inc(&fs_info->defrag_running);
+ while (1) {
+ /* Pause the auto defragger. */
+ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
+ break;
+
+ if (!__need_auto_defrag(fs_info))
+ break;
+
+ /* find an inode to defrag */
+ defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
+ if (!defrag) {
+ if (root_objectid || first_ino) {
+ root_objectid = 0;
+ first_ino = 0;
+ continue;
+ } else {
+ break;
+ }
+ }
+
+ first_ino = defrag->ino + 1;
+ root_objectid = defrag->root;
+
+ __btrfs_run_defrag_inode(fs_info, defrag);
+ }
+ atomic_dec(&fs_info->defrag_running);
+
+ /*
+ * During unmount, we use the transaction_wait queue to wait for the
+ * defragger to stop.
+ */
+ wake_up(&fs_info->transaction_wait);
+ return 0;
+}
+
+/*
+ * Defrag all the leaves in a given btree.
+ * Read all the leaves and try to get key order to
+ * better reflect disk order
+ */
+
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_path *path = NULL;
+ struct btrfs_key key;
+ int ret = 0;
+ int wret;
+ int level;
+ int next_key_ret = 0;
+ u64 last_ret = 0;
+
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+ goto out;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ level = btrfs_header_level(root->node);
+
+ if (level == 0)
+ goto out;
+
+ if (root->defrag_progress.objectid == 0) {
+ struct extent_buffer *root_node;
+ u32 nritems;
+
+ root_node = btrfs_lock_root_node(root);
+ nritems = btrfs_header_nritems(root_node);
+ root->defrag_max.objectid = 0;
+ /* from above we know this is not a leaf */
+ btrfs_node_key_to_cpu(root_node, &root->defrag_max,
+ nritems - 1);
+ btrfs_tree_unlock(root_node);
+ free_extent_buffer(root_node);
+ memset(&key, 0, sizeof(key));
+ } else {
+ memcpy(&key, &root->defrag_progress, sizeof(key));
+ }
+
+ path->keep_locks = 1;
+
+ ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = 0;
+ goto out;
+ }
+ btrfs_release_path(path);
+ /*
+ * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
+ * leafs from path->nodes[1], so set lowest_level to 1 to avoid later
+ * a deadlock (attempting to write lock an already write locked leaf).
+ */
+ path->lowest_level = 1;
+ wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+
+ if (wret < 0) {
+ ret = wret;
+ goto out;
+ }
+ if (!path->nodes[1]) {
+ ret = 0;
+ goto out;
+ }
+ /*
+ * The node at level 1 must always be locked when our path has
+ * keep_locks set and lowest_level is 1, regardless of the value of
+ * path->slots[1].
+ */
+ BUG_ON(path->locks[1] == 0);
+ ret = btrfs_realloc_node(trans, root,
+ path->nodes[1], 0,
+ &last_ret,
+ &root->defrag_progress);
+ if (ret) {
+ WARN_ON(ret == -EAGAIN);
+ goto out;
+ }
+ /*
+ * Now that we reallocated the node we can find the next key. Note that
+ * btrfs_find_next_key() can release our path and do another search
+ * without COWing, this is because even with path->keep_locks = 1,
+ * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a
+ * node when path->slots[node_level - 1] does not point to the last
+ * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
+ * we search for the next key after reallocating our node.
+ */
+ path->slots[1] = btrfs_header_nritems(path->nodes[1]);
+ next_key_ret = btrfs_find_next_key(root, path, &key, 1,
+ BTRFS_OLDEST_GENERATION);
+ if (next_key_ret == 0) {
+ memcpy(&root->defrag_progress, &key, sizeof(key));
+ ret = -EAGAIN;
+ }
+out:
+ btrfs_free_path(path);
+ if (ret == -EAGAIN) {
+ if (root->defrag_max.objectid > root->defrag_progress.objectid)
+ goto done;
+ if (root->defrag_max.type > root->defrag_progress.type)
+ goto done;
+ if (root->defrag_max.offset > root->defrag_progress.offset)
+ goto done;
+ ret = 0;
+ }
+done:
+ if (ret != -EAGAIN)
+ memset(&root->defrag_progress, 0,
+ sizeof(root->defrag_progress));
+
+ return ret;
+}
+
+/*
+ * Defrag specific helper to get an extent map.
+ *
+ * Differences between this and btrfs_get_extent() are:
+ *
+ * - No extent_map will be added to inode->extent_tree
+ * To reduce memory usage in the long run.
+ *
+ * - Extra optimization to skip file extents older than @newer_than
+ * By using btrfs_search_forward() we can skip entire file ranges that
+ * have extents created in past transactions, because btrfs_search_forward()
+ * will not visit leaves and nodes with a generation smaller than given
+ * minimal generation threshold (@newer_than).
+ *
+ * Return valid em if we find a file extent matching the requirement.
+ * Return NULL if we can not find a file extent matching the requirement.
+ *
+ * Return ERR_PTR() for error.
+ */
+static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
+ u64 start, u64 newer_than)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_path path = { 0 };
+ struct extent_map *em;
+ struct btrfs_key key;
+ u64 ino = btrfs_ino(inode);
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+
+ if (newer_than) {
+ ret = btrfs_search_forward(root, &key, &path, newer_than);
+ if (ret < 0)
+ goto err;
+ /* Can't find anything newer */
+ if (ret > 0)
+ goto not_found;
+ } else {
+ ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
+ if (ret < 0)
+ goto err;
+ }
+ if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
+ /*
+ * If btrfs_search_slot() makes path to point beyond nritems,
+ * we should not have an empty leaf, as this inode must at
+ * least have its INODE_ITEM.
+ */
+ ASSERT(btrfs_header_nritems(path.nodes[0]));
+ path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
+ }
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+ /* Perfect match, no need to go one slot back */
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
+ key.offset == start)
+ goto iterate;
+
+ /* We didn't find a perfect match, needs to go one slot back */
+ if (path.slots[0] > 0) {
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+ path.slots[0]--;
+ }
+
+iterate:
+ /* Iterate through the path to find a file extent covering @start */
+ while (true) {
+ u64 extent_end;
+
+ if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
+ goto next;
+
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+
+ /*
+ * We may go one slot back to INODE_REF/XATTR item, then
+ * need to go forward until we reach an EXTENT_DATA.
+ * But we should still has the correct ino as key.objectid.
+ */
+ if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
+ goto next;
+
+ /* It's beyond our target range, definitely not extent found */
+ if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
+ goto not_found;
+
+ /*
+ * | |<- File extent ->|
+ * \- start
+ *
+ * This means there is a hole between start and key.offset.
+ */
+ if (key.offset > start) {
+ em->start = start;
+ em->orig_start = start;
+ em->block_start = EXTENT_MAP_HOLE;
+ em->len = key.offset - start;
+ break;
+ }
+
+ fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
+ struct btrfs_file_extent_item);
+ extent_end = btrfs_file_extent_end(&path);
+
+ /*
+ * |<- file extent ->| |
+ * \- start
+ *
+ * We haven't reached start, search next slot.
+ */
+ if (extent_end <= start)
+ goto next;
+
+ /* Now this extent covers @start, convert it to em */
+ btrfs_extent_item_to_extent_map(inode, &path, fi, em);
+ break;
+next:
+ ret = btrfs_next_item(root, &path);
+ if (ret < 0)
+ goto err;
+ if (ret > 0)
+ goto not_found;
+ }
+ btrfs_release_path(&path);
+ return em;
+
+not_found:
+ btrfs_release_path(&path);
+ free_extent_map(em);
+ return NULL;
+
+err:
+ btrfs_release_path(&path);
+ free_extent_map(em);
+ return ERR_PTR(ret);
+}
+
+static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
+ u64 newer_than, bool locked)
+{
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_map *em;
+ const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
+
+ /*
+ * Hopefully we have this extent in the tree already, try without the
+ * full extent lock.
+ */
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, sectorsize);
+ read_unlock(&em_tree->lock);
+
+ /*
+ * We can get a merged extent, in that case, we need to re-search
+ * tree to get the original em for defrag.
+ *
+ * If @newer_than is 0 or em::generation < newer_than, we can trust
+ * this em, as either we don't care about the generation, or the
+ * merged extent map will be rejected anyway.
+ */
+ if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
+ newer_than && em->generation >= newer_than) {
+ free_extent_map(em);
+ em = NULL;
+ }
+
+ if (!em) {
+ struct extent_state *cached = NULL;
+ u64 end = start + sectorsize - 1;
+
+ /* Get the big lock and read metadata off disk. */
+ if (!locked)
+ lock_extent(io_tree, start, end, &cached);
+ em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
+ if (!locked)
+ unlock_extent(io_tree, start, end, &cached);
+
+ if (IS_ERR(em))
+ return NULL;
+ }
+
+ return em;
+}
+
+static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
+ const struct extent_map *em)
+{
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ return BTRFS_MAX_COMPRESSED;
+ return fs_info->max_extent_size;
+}
+
+static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
+ u32 extent_thresh, u64 newer_than, bool locked)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct extent_map *next;
+ bool ret = false;
+
+ /* This is the last extent */
+ if (em->start + em->len >= i_size_read(inode))
+ return false;
+
+ /*
+ * Here we need to pass @newer_then when checking the next extent, or
+ * we will hit a case we mark current extent for defrag, but the next
+ * one will not be a target.
+ * This will just cause extra IO without really reducing the fragments.
+ */
+ next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
+ /* No more em or hole */
+ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+ goto out;
+ if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
+ goto out;
+ /*
+ * If the next extent is at its max capacity, defragging current extent
+ * makes no sense, as the total number of extents won't change.
+ */
+ if (next->len >= get_extent_max_capacity(fs_info, em))
+ goto out;
+ /* Skip older extent */
+ if (next->generation < newer_than)
+ goto out;
+ /* Also check extent size */
+ if (next->len >= extent_thresh)
+ goto out;
+
+ ret = true;
+out:
+ free_extent_map(next);
+ return ret;
+}
+
+/*
+ * Prepare one page to be defragged.
+ *
+ * This will ensure:
+ *
+ * - Returned page is locked and has been set up properly.
+ * - No ordered extent exists in the page.
+ * - The page is uptodate.
+ *
+ * NOTE: Caller should also wait for page writeback after the cluster is
+ * prepared, here we don't do writeback wait for each page.
+ */
+static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index)
+{
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ gfp_t mask = btrfs_alloc_write_mask(mapping);
+ u64 page_start = (u64)index << PAGE_SHIFT;
+ u64 page_end = page_start + PAGE_SIZE - 1;
+ struct extent_state *cached_state = NULL;
+ struct page *page;
+ int ret;
+
+again:
+ page = find_or_create_page(mapping, index, mask);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Since we can defragment files opened read-only, we can encounter
+ * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
+ * can't do I/O using huge pages yet, so return an error for now.
+ * Filesystem transparent huge pages are typically only used for
+ * executables that explicitly enable them, so this isn't very
+ * restrictive.
+ */
+ if (PageCompound(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-ETXTBSY);
+ }
+
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(ret);
+ }
+
+ /* Wait for any existing ordered extent in the range */
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+
+ lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
+ unlock_extent(&inode->io_tree, page_start, page_end,
+ &cached_state);
+ if (!ordered)
+ break;
+
+ unlock_page(page);
+ btrfs_start_ordered_extent(ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ lock_page(page);
+ /*
+ * We unlocked the page above, so we need check if it was
+ * released or not.
+ */
+ if (page->mapping != mapping || !PagePrivate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
+ }
+
+ /*
+ * Now the page range has no ordered extent any more. Read the page to
+ * make it uptodate.
+ */
+ if (!PageUptodate(page)) {
+ btrfs_read_folio(NULL, page_folio(page));
+ lock_page(page);
+ if (page->mapping != mapping || !PagePrivate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
+ }
+ return page;
+}
+
+struct defrag_target_range {
+ struct list_head list;
+ u64 start;
+ u64 len;
+};
+
+/*
+ * Collect all valid target extents.
+ *
+ * @start: file offset to lookup
+ * @len: length to lookup
+ * @extent_thresh: file extent size threshold, any extent size >= this value
+ * will be ignored
+ * @newer_than: only defrag extents newer than this value
+ * @do_compress: whether the defrag is doing compression
+ * if true, @extent_thresh will be ignored and all regular
+ * file extents meeting @newer_than will be targets.
+ * @locked: if the range has already held extent lock
+ * @target_list: list of targets file extents
+ */
+static int defrag_collect_targets(struct btrfs_inode *inode,
+ u64 start, u64 len, u32 extent_thresh,
+ u64 newer_than, bool do_compress,
+ bool locked, struct list_head *target_list,
+ u64 *last_scanned_ret)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ bool last_is_target = false;
+ u64 cur = start;
+ int ret = 0;
+
+ while (cur < start + len) {
+ struct extent_map *em;
+ struct defrag_target_range *new;
+ bool next_mergeable = true;
+ u64 range_len;
+
+ last_is_target = false;
+ em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked);
+ if (!em)
+ break;
+
+ /*
+ * If the file extent is an inlined one, we may still want to
+ * defrag it (fallthrough) if it will cause a regular extent.
+ * This is for users who want to convert inline extents to
+ * regular ones through max_inline= mount option.
+ */
+ if (em->block_start == EXTENT_MAP_INLINE &&
+ em->len <= inode->root->fs_info->max_inline)
+ goto next;
+
+ /* Skip hole/delalloc/preallocated extents */
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ em->block_start == EXTENT_MAP_DELALLOC ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ goto next;
+
+ /* Skip older extent */
+ if (em->generation < newer_than)
+ goto next;
+
+ /* This em is under writeback, no need to defrag */
+ if (em->generation == (u64)-1)
+ goto next;
+
+ /*
+ * Our start offset might be in the middle of an existing extent
+ * map, so take that into account.
+ */
+ range_len = em->len - (cur - em->start);
+ /*
+ * If this range of the extent map is already flagged for delalloc,
+ * skip it, because:
+ *
+ * 1) We could deadlock later, when trying to reserve space for
+ * delalloc, because in case we can't immediately reserve space
+ * the flusher can start delalloc and wait for the respective
+ * ordered extents to complete. The deadlock would happen
+ * because we do the space reservation while holding the range
+ * locked, and starting writeback, or finishing an ordered
+ * extent, requires locking the range;
+ *
+ * 2) If there's delalloc there, it means there's dirty pages for
+ * which writeback has not started yet (we clean the delalloc
+ * flag when starting writeback and after creating an ordered
+ * extent). If we mark pages in an adjacent range for defrag,
+ * then we will have a larger contiguous range for delalloc,
+ * very likely resulting in a larger extent after writeback is
+ * triggered (except in a case of free space fragmentation).
+ */
+ if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
+ EXTENT_DELALLOC, 0, NULL))
+ goto next;
+
+ /*
+ * For do_compress case, we want to compress all valid file
+ * extents, thus no @extent_thresh or mergeable check.
+ */
+ if (do_compress)
+ goto add;
+
+ /* Skip too large extent */
+ if (range_len >= extent_thresh)
+ goto next;
+
+ /*
+ * Skip extents already at its max capacity, this is mostly for
+ * compressed extents, which max cap is only 128K.
+ */
+ if (em->len >= get_extent_max_capacity(fs_info, em))
+ goto next;
+
+ /*
+ * Normally there are no more extents after an inline one, thus
+ * @next_mergeable will normally be false and not defragged.
+ * So if an inline extent passed all above checks, just add it
+ * for defrag, and be converted to regular extents.
+ */
+ if (em->block_start == EXTENT_MAP_INLINE)
+ goto add;
+
+ next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
+ extent_thresh, newer_than, locked);
+ if (!next_mergeable) {
+ struct defrag_target_range *last;
+
+ /* Empty target list, no way to merge with last entry */
+ if (list_empty(target_list))
+ goto next;
+ last = list_entry(target_list->prev,
+ struct defrag_target_range, list);
+ /* Not mergeable with last entry */
+ if (last->start + last->len != cur)
+ goto next;
+
+ /* Mergeable, fall through to add it to @target_list. */
+ }
+
+add:
+ last_is_target = true;
+ range_len = min(extent_map_end(em), start + len) - cur;
+ /*
+ * This one is a good target, check if it can be merged into
+ * last range of the target list.
+ */
+ if (!list_empty(target_list)) {
+ struct defrag_target_range *last;
+
+ last = list_entry(target_list->prev,
+ struct defrag_target_range, list);
+ ASSERT(last->start + last->len <= cur);
+ if (last->start + last->len == cur) {
+ /* Mergeable, enlarge the last entry */
+ last->len += range_len;
+ goto next;
+ }
+ /* Fall through to allocate a new entry */
+ }
+
+ /* Allocate new defrag_target_range */
+ new = kmalloc(sizeof(*new), GFP_NOFS);
+ if (!new) {
+ free_extent_map(em);
+ ret = -ENOMEM;
+ break;
+ }
+ new->start = cur;
+ new->len = range_len;
+ list_add_tail(&new->list, target_list);
+
+next:
+ cur = extent_map_end(em);
+ free_extent_map(em);
+ }
+ if (ret < 0) {
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+
+ list_for_each_entry_safe(entry, tmp, target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+ }
+ if (!ret && last_scanned_ret) {
+ /*
+ * If the last extent is not a target, the caller can skip to
+ * the end of that extent.
+ * Otherwise, we can only go the end of the specified range.
+ */
+ if (!last_is_target)
+ *last_scanned_ret = max(cur, *last_scanned_ret);
+ else
+ *last_scanned_ret = max(start + len, *last_scanned_ret);
+ }
+ return ret;
+}
+
+#define CLUSTER_SIZE (SZ_256K)
+static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
+
+/*
+ * Defrag one contiguous target range.
+ *
+ * @inode: target inode
+ * @target: target range to defrag
+ * @pages: locked pages covering the defrag range
+ * @nr_pages: number of locked pages
+ *
+ * Caller should ensure:
+ *
+ * - Pages are prepared
+ * Pages should be locked, no ordered extent in the pages range,
+ * no writeback.
+ *
+ * - Extent bits are locked
+ */
+static int defrag_one_locked_target(struct btrfs_inode *inode,
+ struct defrag_target_range *target,
+ struct page **pages, int nr_pages,
+ struct extent_state **cached_state)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_changeset *data_reserved = NULL;
+ const u64 start = target->start;
+ const u64 len = target->len;
+ unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
+ unsigned long start_index = start >> PAGE_SHIFT;
+ unsigned long first_index = page_index(pages[0]);
+ int ret = 0;
+ int i;
+
+ ASSERT(last_index - first_index + 1 <= nr_pages);
+
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
+ if (ret < 0)
+ return ret;
+ clear_extent_bit(&inode->io_tree, start, start + len - 1,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, cached_state);
+ set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);
+
+ /* Update the page status */
+ for (i = start_index - first_index; i <= last_index - first_index; i++) {
+ ClearPageChecked(pages[i]);
+ btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
+ }
+ btrfs_delalloc_release_extents(inode, len);
+ extent_changeset_free(data_reserved);
+
+ return ret;
+}
+
+static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
+ u32 extent_thresh, u64 newer_than, bool do_compress,
+ u64 *last_scanned_ret)
+{
+ struct extent_state *cached_state = NULL;
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+ LIST_HEAD(target_list);
+ struct page **pages;
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ u64 last_index = (start + len - 1) >> PAGE_SHIFT;
+ u64 start_index = start >> PAGE_SHIFT;
+ unsigned int nr_pages = last_index - start_index + 1;
+ int ret = 0;
+ int i;
+
+ ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
+ ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
+
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
+
+ /* Prepare all pages */
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = defrag_prepare_one_page(inode, start_index + i);
+ if (IS_ERR(pages[i])) {
+ ret = PTR_ERR(pages[i]);
+ pages[i] = NULL;
+ goto free_pages;
+ }
+ }
+ for (i = 0; i < nr_pages; i++)
+ wait_on_page_writeback(pages[i]);
+
+ /* Lock the pages range */
+ lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
+ /*
+ * Now we have a consistent view about the extent map, re-check
+ * which range really needs to be defragged.
+ *
+ * And this time we have extent locked already, pass @locked = true
+ * so that we won't relock the extent range and cause deadlock.
+ */
+ ret = defrag_collect_targets(inode, start, len, extent_thresh,
+ newer_than, do_compress, true,
+ &target_list, last_scanned_ret);
+ if (ret < 0)
+ goto unlock_extent;
+
+ list_for_each_entry(entry, &target_list, list) {
+ ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
+ &cached_state);
+ if (ret < 0)
+ break;
+ }
+
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+unlock_extent:
+ unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
+free_pages:
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i]) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ }
+ }
+ kfree(pages);
+ return ret;
+}
+
+static int defrag_one_cluster(struct btrfs_inode *inode,
+ struct file_ra_state *ra,
+ u64 start, u32 len, u32 extent_thresh,
+ u64 newer_than, bool do_compress,
+ unsigned long *sectors_defragged,
+ unsigned long max_sectors,
+ u64 *last_scanned_ret)
+{
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+ LIST_HEAD(target_list);
+ int ret;
+
+ ret = defrag_collect_targets(inode, start, len, extent_thresh,
+ newer_than, do_compress, false,
+ &target_list, NULL);
+ if (ret < 0)
+ goto out;
+
+ list_for_each_entry(entry, &target_list, list) {
+ u32 range_len = entry->len;
+
+ /* Reached or beyond the limit */
+ if (max_sectors && *sectors_defragged >= max_sectors) {
+ ret = 1;
+ break;
+ }
+
+ if (max_sectors)
+ range_len = min_t(u32, range_len,
+ (max_sectors - *sectors_defragged) * sectorsize);
+
+ /*
+ * If defrag_one_range() has updated last_scanned_ret,
+ * our range may already be invalid (e.g. hole punched).
+ * Skip if our range is before last_scanned_ret, as there is
+ * no need to defrag the range anymore.
+ */
+ if (entry->start + range_len <= *last_scanned_ret)
+ continue;
+
+ if (ra)
+ page_cache_sync_readahead(inode->vfs_inode.i_mapping,
+ ra, NULL, entry->start >> PAGE_SHIFT,
+ ((entry->start + range_len - 1) >> PAGE_SHIFT) -
+ (entry->start >> PAGE_SHIFT) + 1);
+ /*
+ * Here we may not defrag any range if holes are punched before
+ * we locked the pages.
+ * But that's fine, it only affects the @sectors_defragged
+ * accounting.
+ */
+ ret = defrag_one_range(inode, entry->start, range_len,
+ extent_thresh, newer_than, do_compress,
+ last_scanned_ret);
+ if (ret < 0)
+ break;
+ *sectors_defragged += range_len >>
+ inode->root->fs_info->sectorsize_bits;
+ }
+out:
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+ if (ret >= 0)
+ *last_scanned_ret = max(*last_scanned_ret, start + len);
+ return ret;
+}
+
+/*
+ * Entry point to file defragmentation.
+ *
+ * @inode: inode to be defragged
+ * @ra: readahead state (can be NUL)
+ * @range: defrag options including range and flags
+ * @newer_than: minimum transid to defrag
+ * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
+ * will be defragged.
+ *
+ * Return <0 for error.
+ * Return >=0 for the number of sectors defragged, and range->start will be updated
+ * to indicate the file offset where next defrag should be started at.
+ * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without
+ * defragging all the range).
+ */
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
+ struct btrfs_ioctl_defrag_range_args *range,
+ u64 newer_than, unsigned long max_to_defrag)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ unsigned long sectors_defragged = 0;
+ u64 isize = i_size_read(inode);
+ u64 cur;
+ u64 last_byte;
+ bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
+ bool ra_allocated = false;
+ int compress_type = BTRFS_COMPRESS_ZLIB;
+ int ret = 0;
+ u32 extent_thresh = range->extent_thresh;
+ pgoff_t start_index;
+
+ if (isize == 0)
+ return 0;
+
+ if (range->start >= isize)
+ return -EINVAL;
+
+ if (do_compress) {
+ if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
+ return -EINVAL;
+ if (range->compress_type)
+ compress_type = range->compress_type;
+ }
+
+ if (extent_thresh == 0)
+ extent_thresh = SZ_256K;
+
+ if (range->start + range->len > range->start) {
+ /* Got a specific range */
+ last_byte = min(isize, range->start + range->len);
+ } else {
+ /* Defrag until file end */
+ last_byte = isize;
+ }
+
+ /* Align the range */
+ cur = round_down(range->start, fs_info->sectorsize);
+ last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
+
+ /*
+ * If we were not given a ra, allocate a readahead context. As
+ * readahead is just an optimization, defrag will work without it so
+ * we don't error out.
+ */
+ if (!ra) {
+ ra_allocated = true;
+ ra = kzalloc(sizeof(*ra), GFP_KERNEL);
+ if (ra)
+ file_ra_state_init(ra, inode->i_mapping);
+ }
+
+ /*
+ * Make writeback start from the beginning of the range, so that the
+ * defrag range can be written sequentially.
+ */
+ start_index = cur >> PAGE_SHIFT;
+ if (start_index < inode->i_mapping->writeback_index)
+ inode->i_mapping->writeback_index = start_index;
+
+ while (cur < last_byte) {
+ const unsigned long prev_sectors_defragged = sectors_defragged;
+ u64 last_scanned = cur;
+ u64 cluster_end;
+
+ if (btrfs_defrag_cancelled(fs_info)) {
+ ret = -EAGAIN;
+ break;
+ }
+
+ /* We want the cluster end at page boundary when possible */
+ cluster_end = (((cur >> PAGE_SHIFT) +
+ (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
+ cluster_end = min(cluster_end, last_byte);
+
+ btrfs_inode_lock(BTRFS_I(inode), 0);
+ if (IS_SWAPFILE(inode)) {
+ ret = -ETXTBSY;
+ btrfs_inode_unlock(BTRFS_I(inode), 0);
+ break;
+ }
+ if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
+ btrfs_inode_unlock(BTRFS_I(inode), 0);
+ break;
+ }
+ if (do_compress)
+ BTRFS_I(inode)->defrag_compress = compress_type;
+ ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
+ cluster_end + 1 - cur, extent_thresh,
+ newer_than, do_compress, &sectors_defragged,
+ max_to_defrag, &last_scanned);
+
+ if (sectors_defragged > prev_sectors_defragged)
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+
+ btrfs_inode_unlock(BTRFS_I(inode), 0);
+ if (ret < 0)
+ break;
+ cur = max(cluster_end + 1, last_scanned);
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ cond_resched();
+ }
+
+ if (ra_allocated)
+ kfree(ra);
+ /*
+ * Update range.start for autodefrag, this will indicate where to start
+ * in next run.
+ */
+ range->start = cur;
+ if (sectors_defragged) {
+ /*
+ * We have defragged some sectors, for compression case they
+ * need to be written back immediately.
+ */
+ if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
+ filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+ }
+ if (range->compress_type == BTRFS_COMPRESS_LZO)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
+ else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
+ ret = sectors_defragged;
+ }
+ if (do_compress) {
+ btrfs_inode_lock(BTRFS_I(inode), 0);
+ BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
+ btrfs_inode_unlock(BTRFS_I(inode), 0);
+ }
+ return ret;
+}
+
+void __cold btrfs_auto_defrag_exit(void)
+{
+ kmem_cache_destroy(btrfs_inode_defrag_cachep);
+}
+
+int __init btrfs_auto_defrag_init(void)
+{
+ btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
+ sizeof(struct inode_defrag), 0,
+ SLAB_MEM_SPREAD,
+ NULL);
+ if (!btrfs_inode_defrag_cachep)
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h
new file mode 100644
index 000000000000..5305f2283b5e
--- /dev/null
+++ b/fs/btrfs/defrag.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_DEFRAG_H
+#define BTRFS_DEFRAG_H
+
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
+ struct btrfs_ioctl_defrag_range_args *range,
+ u64 newer_than, unsigned long max_to_defrag);
+int __init btrfs_auto_defrag_init(void);
+void __cold btrfs_auto_defrag_exit(void);
+int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode, u32 extent_thresh);
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+
+static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
+{
+ return signal_pending(current);
+}
+
+#endif
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 1e8f17ff829e..7ddb1d104e8e 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
+#include "messages.h"
#include "ctree.h"
#include "delalloc-space.h"
#include "block-rsv.h"
@@ -8,6 +9,7 @@
#include "transaction.h"
#include "qgroup.h"
#include "block-group.h"
+#include "fs.h"
/*
* HOW DOES THIS WORK
@@ -127,9 +129,11 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
}
int btrfs_check_data_free_space(struct btrfs_inode *inode,
- struct extent_changeset **reserved, u64 start, u64 len)
+ struct extent_changeset **reserved, u64 start,
+ u64 len, bool noflush)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;
int ret;
/* align the range */
@@ -137,7 +141,12 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode,
round_down(start, fs_info->sectorsize);
start = round_down(start, fs_info->sectorsize);
- ret = btrfs_alloc_data_chunk_ondemand(inode, len);
+ if (noflush)
+ flush = BTRFS_RESERVE_NO_FLUSH;
+ else if (btrfs_is_free_space_inode(inode))
+ flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
+
+ ret = btrfs_reserve_data_bytes(fs_info, len, flush);
if (ret < 0)
return ret;
@@ -193,8 +202,8 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
btrfs_qgroup_free_data(inode, reserved, start, len);
}
-/**
- * Release any excessive reservation
+/*
+ * Release any excessive reservations for an inode.
*
* @inode: the inode we need to release from
* @qgroup_free: free or convert qgroup meta. Unlike normal operation, qgroup
@@ -368,12 +377,12 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
return 0;
}
-/**
- * Release a metadata reservation for an inode
+/*
+ * Release a metadata reservation for an inode.
*
- * @inode: the inode to release the reservation for.
- * @num_bytes: the number of bytes we are releasing.
- * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
+ * @inode: the inode to release the reservation for.
+ * @num_bytes: the number of bytes we are releasing.
+ * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
*
* This will release the metadata reservation for an inode. This can be called
* once we complete IO for a given set of bytes to release their metadata
@@ -396,10 +405,11 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
btrfs_inode_rsv_release(inode, qgroup_free);
}
-/**
- * btrfs_delalloc_release_extents - release our outstanding_extents
- * @inode: the inode to balance the reservation for.
- * @num_bytes: the number of bytes we originally reserved with
+/*
+ * Release our outstanding_extents for an inode.
+ *
+ * @inode: the inode to balance the reservation for.
+ * @num_bytes: the number of bytes we originally reserved with
*
* When we reserve space we increase outstanding_extents for the extents we may
* add. Once we've set the range as delalloc or created our ordered extents we
@@ -424,37 +434,37 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
btrfs_inode_rsv_release(inode, true);
}
-/**
- * btrfs_delalloc_reserve_space - reserve data and metadata space for
- * delalloc
- * @inode: inode we're writing to
- * @start: start range we are writing to
- * @len: how long the range we are writing to
- * @reserved: mandatory parameter, record actually reserved qgroup ranges of
- * current reservation.
+/*
+ * Reserve data and metadata space for delalloc
+ *
+ * @inode: inode we're writing to
+ * @start: start range we are writing to
+ * @len: how long the range we are writing to
+ * @reserved: mandatory parameter, record actually reserved qgroup ranges of
+ * current reservation.
*
* This will do the following things
*
- * - reserve space in data space info for num bytes
- * and reserve precious corresponding qgroup space
+ * - reserve space in data space info for num bytes and reserve precious
+ * corresponding qgroup space
* (Done in check_data_free_space)
*
* - reserve space for metadata space, based on the number of outstanding
- * extents and how much csums will be needed
- * also reserve metadata space in a per root over-reserve method.
+ * extents and how much csums will be needed also reserve metadata space in a
+ * per root over-reserve method.
* - add to the inodes->delalloc_bytes
* - add it to the fs_info's delalloc inodes list.
* (Above 3 all done in delalloc_reserve_metadata)
*
* Return 0 for success
- * Return <0 for error(-ENOSPC or -EQUOT)
+ * Return <0 for error(-ENOSPC or -EDQUOT)
*/
int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len)
{
int ret;
- ret = btrfs_check_data_free_space(inode, reserved, start, len);
+ ret = btrfs_check_data_free_space(inode, reserved, start, len, false);
if (ret < 0)
return ret;
ret = btrfs_delalloc_reserve_metadata(inode, len, len, false);
@@ -466,7 +476,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
return ret;
}
-/**
+/*
* Release data and metadata space for delalloc
*
* @inode: inode we're releasing space for
@@ -475,10 +485,10 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
* @len: length of the space already reserved
* @qgroup_free: should qgroup reserved-space also be freed
*
- * This function will release the metadata space that was not used and will
- * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
- * list if there are no delalloc bytes left.
- * Also it will handle the qgroup reserved space.
+ * Release the metadata space that was not used and will decrement
+ * ->delalloc_bytes and remove it from the fs_info->delalloc_inodes list if
+ * there are no delalloc bytes left. Also it will handle the qgroup reserved
+ * space.
*/
void btrfs_delalloc_release_space(struct btrfs_inode *inode,
struct extent_changeset *reserved,
diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h
index 28bf5c3ef430..c5d573f2366e 100644
--- a/fs/btrfs/delalloc-space.h
+++ b/fs/btrfs/delalloc-space.h
@@ -7,7 +7,8 @@ struct extent_changeset;
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
int btrfs_check_data_free_space(struct btrfs_inode *inode,
- struct extent_changeset **reserved, u64 start, u64 len);
+ struct extent_changeset **reserved, u64 start, u64 len,
+ bool noflush);
void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len);
void btrfs_delalloc_release_space(struct btrfs_inode *inode,
@@ -19,5 +20,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
bool qgroup_free);
int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
+int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
+ u64 disk_num_bytes, bool noflush);
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
#endif /* BTRFS_DELALLOC_SPACE_H */
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index e7f34871a132..0095c6e4c3d1 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -6,14 +6,19 @@
#include <linux/slab.h>
#include <linux/iversion.h>
+#include "ctree.h"
+#include "fs.h"
+#include "messages.h"
#include "misc.h"
#include "delayed-inode.h"
#include "disk-io.h"
#include "transaction.h"
-#include "ctree.h"
#include "qgroup.h"
#include "locking.h"
#include "inode-item.h"
+#include "space-info.h"
+#include "accessors.h"
+#include "file-item.h"
#define BTRFS_DELAYED_WRITEBACK 512
#define BTRFS_DELAYED_BACKGROUND 128
@@ -302,15 +307,21 @@ static inline void btrfs_release_prepared_delayed_node(
__btrfs_release_delayed_node(node, 1);
}
-static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
+static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
+ struct btrfs_delayed_node *node,
+ enum btrfs_delayed_item_type type)
{
struct btrfs_delayed_item *item;
+
item = kmalloc(sizeof(*item) + data_len, GFP_NOFS);
if (item) {
item->data_len = data_len;
- item->ins_or_del = 0;
+ item->type = type;
item->bytes_reserved = 0;
- item->delayed_node = NULL;
+ item->delayed_node = node;
+ RB_CLEAR_NODE(&item->rb_node);
+ INIT_LIST_HEAD(&item->log_list);
+ item->logged = false;
refcount_set(&item->refs, 1);
}
return item;
@@ -319,72 +330,32 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
/*
* __btrfs_lookup_delayed_item - look up the delayed item by key
* @delayed_node: pointer to the delayed node
- * @key: the key to look up
- * @prev: used to store the prev item if the right item isn't found
- * @next: used to store the next item if the right item isn't found
+ * @index: the dir index value to lookup (offset of a dir index key)
*
* Note: if we don't find the right item, we will return the prev item and
* the next item.
*/
static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
struct rb_root *root,
- struct btrfs_key *key,
- struct btrfs_delayed_item **prev,
- struct btrfs_delayed_item **next)
+ u64 index)
{
- struct rb_node *node, *prev_node = NULL;
+ struct rb_node *node = root->rb_node;
struct btrfs_delayed_item *delayed_item = NULL;
- int ret = 0;
-
- node = root->rb_node;
while (node) {
delayed_item = rb_entry(node, struct btrfs_delayed_item,
rb_node);
- prev_node = node;
- ret = btrfs_comp_cpu_keys(&delayed_item->key, key);
- if (ret < 0)
+ if (delayed_item->index < index)
node = node->rb_right;
- else if (ret > 0)
+ else if (delayed_item->index > index)
node = node->rb_left;
else
return delayed_item;
}
- if (prev) {
- if (!prev_node)
- *prev = NULL;
- else if (ret < 0)
- *prev = delayed_item;
- else if ((node = rb_prev(prev_node)) != NULL) {
- *prev = rb_entry(node, struct btrfs_delayed_item,
- rb_node);
- } else
- *prev = NULL;
- }
-
- if (next) {
- if (!prev_node)
- *next = NULL;
- else if (ret > 0)
- *next = delayed_item;
- else if ((node = rb_next(prev_node)) != NULL) {
- *next = rb_entry(node, struct btrfs_delayed_item,
- rb_node);
- } else
- *next = NULL;
- }
return NULL;
}
-static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
- struct btrfs_delayed_node *delayed_node,
- struct btrfs_key *key)
-{
- return __btrfs_lookup_delayed_item(&delayed_node->ins_root.rb_root, key,
- NULL, NULL);
-}
-
static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
struct btrfs_delayed_item *ins)
{
@@ -392,15 +363,13 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
struct rb_node *parent_node = NULL;
struct rb_root_cached *root;
struct btrfs_delayed_item *item;
- int cmp;
bool leftmost = true;
- if (ins->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM)
+ if (ins->type == BTRFS_DELAYED_INSERTION_ITEM)
root = &delayed_node->ins_root;
- else if (ins->ins_or_del == BTRFS_DELAYED_DELETION_ITEM)
- root = &delayed_node->del_root;
else
- BUG();
+ root = &delayed_node->del_root;
+
p = &root->rb_root.rb_node;
node = &ins->rb_node;
@@ -409,11 +378,10 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
item = rb_entry(parent_node, struct btrfs_delayed_item,
rb_node);
- cmp = btrfs_comp_cpu_keys(&item->key, &ins->key);
- if (cmp < 0) {
+ if (item->index < ins->index) {
p = &(*p)->rb_right;
leftmost = false;
- } else if (cmp > 0) {
+ } else if (item->index > ins->index) {
p = &(*p)->rb_left;
} else {
return -EEXIST;
@@ -422,14 +390,10 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
rb_link_node(node, parent_node, p);
rb_insert_color_cached(node, root, leftmost);
- ins->delayed_node = delayed_node;
-
- /* Delayed items are always for dir index items. */
- ASSERT(ins->key.type == BTRFS_DIR_INDEX_KEY);
- if (ins->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM &&
- ins->key.offset >= delayed_node->index_cnt)
- delayed_node->index_cnt = ins->key.offset + 1;
+ if (ins->type == BTRFS_DELAYED_INSERTION_ITEM &&
+ ins->index >= delayed_node->index_cnt)
+ delayed_node->index_cnt = ins->index + 1;
delayed_node->count++;
atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
@@ -451,21 +415,21 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
struct rb_root_cached *root;
struct btrfs_delayed_root *delayed_root;
- /* Not associated with any delayed_node */
- if (!delayed_item->delayed_node)
+ /* Not inserted, ignore it. */
+ if (RB_EMPTY_NODE(&delayed_item->rb_node))
return;
+
delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
BUG_ON(!delayed_root);
- BUG_ON(delayed_item->ins_or_del != BTRFS_DELAYED_DELETION_ITEM &&
- delayed_item->ins_or_del != BTRFS_DELAYED_INSERTION_ITEM);
- if (delayed_item->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM)
+ if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM)
root = &delayed_item->delayed_node->ins_root;
else
root = &delayed_item->delayed_node->del_root;
rb_erase_cached(&delayed_item->rb_node, root);
+ RB_CLEAR_NODE(&delayed_item->rb_node);
delayed_item->delayed_node->count--;
finish_one_item(delayed_root);
@@ -520,12 +484,11 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item(
}
static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_delayed_item *item)
{
struct btrfs_block_rsv *src_rsv;
struct btrfs_block_rsv *dst_rsv;
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
u64 num_bytes;
int ret;
@@ -545,14 +508,14 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
if (!ret) {
trace_btrfs_space_reservation(fs_info, "delayed_item",
- item->key.objectid,
+ item->delayed_node->inode_id,
num_bytes, 1);
/*
* For insertions we track reserved metadata space by accounting
* for the number of leaves that will be used, based on the delayed
* node's index_items_size field.
*/
- if (item->ins_or_del == BTRFS_DELAYED_DELETION_ITEM)
+ if (item->type == BTRFS_DELAYED_DELETION_ITEM)
item->bytes_reserved = num_bytes;
}
@@ -574,8 +537,8 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
* to release/reserve qgroup space.
*/
trace_btrfs_space_reservation(fs_info, "delayed_item",
- item->key.objectid, item->bytes_reserved,
- 0);
+ item->delayed_node->inode_id,
+ item->bytes_reserved, 0);
btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL);
}
@@ -688,6 +651,8 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
struct btrfs_delayed_item *next;
const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info);
struct btrfs_item_batch batch;
+ struct btrfs_key first_key;
+ const u32 first_data_size = first_item->data_len;
int total_size;
char *ins_data = NULL;
int ret;
@@ -716,9 +681,9 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
ASSERT(first_item->bytes_reserved == 0);
list_add_tail(&first_item->tree_list, &item_list);
- batch.total_data_size = first_item->data_len;
+ batch.total_data_size = first_data_size;
batch.nr = 1;
- total_size = first_item->data_len + sizeof(struct btrfs_item);
+ total_size = first_data_size + sizeof(struct btrfs_item);
curr = first_item;
while (true) {
@@ -732,8 +697,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
* We cannot allow gaps in the key space if we're doing log
* replay.
*/
- if (continuous_keys_only &&
- (next->key.offset != curr->key.offset + 1))
+ if (continuous_keys_only && (next->index != curr->index + 1))
break;
ASSERT(next->bytes_reserved == 0);
@@ -750,8 +714,11 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
}
if (batch.nr == 1) {
- batch.keys = &first_item->key;
- batch.data_sizes = &first_item->data_len;
+ first_key.objectid = node->inode_id;
+ first_key.type = BTRFS_DIR_INDEX_KEY;
+ first_key.offset = first_item->index;
+ batch.keys = &first_key;
+ batch.data_sizes = &first_data_size;
} else {
struct btrfs_key *ins_keys;
u32 *ins_sizes;
@@ -768,7 +735,9 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
batch.keys = ins_keys;
batch.data_sizes = ins_sizes;
list_for_each_entry(curr, &item_list, tree_list) {
- ins_keys[i] = curr->key;
+ ins_keys[i].objectid = node->inode_id;
+ ins_keys[i].type = BTRFS_DIR_INDEX_KEY;
+ ins_keys[i].offset = curr->index;
ins_sizes[i] = curr->data_len;
i++;
}
@@ -864,6 +833,7 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_delayed_item *item)
{
+ const u64 ino = item->delayed_node->inode_id;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_delayed_item *curr, *next;
struct extent_buffer *leaf = path->nodes[0];
@@ -902,7 +872,9 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
slot++;
btrfs_item_key_to_cpu(leaf, &key, slot);
- if (btrfs_comp_cpu_keys(&next->key, &key) != 0)
+ if (key.objectid != ino ||
+ key.type != BTRFS_DIR_INDEX_KEY ||
+ key.offset != next->index)
break;
nitems++;
curr = next;
@@ -920,9 +892,8 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
* Check btrfs_delayed_item_reserve_metadata() to see why we
* don't need to release/reserve qgroup space.
*/
- trace_btrfs_space_reservation(fs_info, "delayed_item",
- item->key.objectid, total_reserved_size,
- 0);
+ trace_btrfs_space_reservation(fs_info, "delayed_item", ino,
+ total_reserved_size, 0);
btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv,
total_reserved_size, NULL);
}
@@ -940,8 +911,12 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_delayed_node *node)
{
+ struct btrfs_key key;
int ret = 0;
+ key.objectid = node->inode_id;
+ key.type = BTRFS_DIR_INDEX_KEY;
+
while (ret == 0) {
struct btrfs_delayed_item *item;
@@ -952,7 +927,8 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
break;
}
- ret = btrfs_search_slot(trans, root, &item->key, path, -1, 1);
+ key.offset = item->index;
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
/*
* There's no matching item in the leaf. This means we
@@ -1441,7 +1417,7 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info)
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
const char *name, int name_len,
struct btrfs_inode *dir,
- struct btrfs_disk_key *disk_key, u8 type,
+ struct btrfs_disk_key *disk_key, u8 flags,
u64 index)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -1457,23 +1433,22 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
if (IS_ERR(delayed_node))
return PTR_ERR(delayed_node);
- delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len);
+ delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len,
+ delayed_node,
+ BTRFS_DELAYED_INSERTION_ITEM);
if (!delayed_item) {
ret = -ENOMEM;
goto release_node;
}
- delayed_item->key.objectid = btrfs_ino(dir);
- delayed_item->key.type = BTRFS_DIR_INDEX_KEY;
- delayed_item->key.offset = index;
- delayed_item->ins_or_del = BTRFS_DELAYED_INSERTION_ITEM;
+ delayed_item->index = index;
dir_item = (struct btrfs_dir_item *)delayed_item->data;
dir_item->location = *disk_key;
btrfs_set_stack_dir_transid(dir_item, trans->transid);
btrfs_set_stack_dir_data_len(dir_item, 0);
btrfs_set_stack_dir_name_len(dir_item, name_len);
- btrfs_set_stack_dir_type(dir_item, type);
+ btrfs_set_stack_dir_flags(dir_item, flags);
memcpy((char *)(dir_item + 1), name, name_len);
data_len = delayed_item->data_len + sizeof(struct btrfs_item);
@@ -1490,8 +1465,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
}
if (reserve_leaf_space) {
- ret = btrfs_delayed_item_reserve_metadata(trans, dir->root,
- delayed_item);
+ ret = btrfs_delayed_item_reserve_metadata(trans, delayed_item);
/*
* Space was reserved for a dir index item insertion when we
* started the transaction, so getting a failure here should be
@@ -1538,12 +1512,12 @@ release_node:
static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_node *node,
- struct btrfs_key *key)
+ u64 index)
{
struct btrfs_delayed_item *item;
mutex_lock(&node->mutex);
- item = __btrfs_lookup_delayed_insertion_item(node, key);
+ item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index);
if (!item) {
mutex_unlock(&node->mutex);
return 1;
@@ -1589,32 +1563,25 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
{
struct btrfs_delayed_node *node;
struct btrfs_delayed_item *item;
- struct btrfs_key item_key;
int ret;
node = btrfs_get_or_create_delayed_node(dir);
if (IS_ERR(node))
return PTR_ERR(node);
- item_key.objectid = btrfs_ino(dir);
- item_key.type = BTRFS_DIR_INDEX_KEY;
- item_key.offset = index;
-
- ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node,
- &item_key);
+ ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index);
if (!ret)
goto end;
- item = btrfs_alloc_delayed_item(0);
+ item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM);
if (!item) {
ret = -ENOMEM;
goto end;
}
- item->key = item_key;
- item->ins_or_del = BTRFS_DELAYED_DELETION_ITEM;
+ item->index = index;
- ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, item);
+ ret = btrfs_delayed_item_reserve_metadata(trans, item);
/*
* we have reserved enough space when we start a new transaction,
* so reserving metadata failure is impossible.
@@ -1679,8 +1646,8 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
* We can only do one readdir with delayed items at a time because of
* item->readdir_list.
*/
- btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
- btrfs_inode_lock(inode, 0);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
+ btrfs_inode_lock(BTRFS_I(inode), 0);
mutex_lock(&delayed_node->mutex);
item = __btrfs_first_delayed_insertion_item(delayed_node);
@@ -1743,9 +1710,9 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
int ret = 0;
list_for_each_entry(curr, del_list, readdir_list) {
- if (curr->key.offset > index)
+ if (curr->index > index)
break;
- if (curr->key.offset == index) {
+ if (curr->index == index) {
ret = 1;
break;
}
@@ -1779,19 +1746,19 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
list_del(&curr->readdir_list);
- if (curr->key.offset < ctx->pos) {
+ if (curr->index < ctx->pos) {
if (refcount_dec_and_test(&curr->refs))
kfree(curr);
continue;
}
- ctx->pos = curr->key.offset;
+ ctx->pos = curr->index;
di = (struct btrfs_dir_item *)curr->data;
name = (char *)(di + 1);
name_len = btrfs_stack_dir_name_len(di);
- d_type = fs_ftype_to_dtype(di->type);
+ d_type = fs_ftype_to_dtype(btrfs_dir_flags_to_ftype(di->type));
btrfs_disk_key_to_cpu(&location, &di->location);
over = !dir_emit(ctx, name, name_len,
@@ -2085,3 +2052,113 @@ void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info)
}
}
+void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list)
+{
+ struct btrfs_delayed_node *node;
+ struct btrfs_delayed_item *item;
+
+ node = btrfs_get_delayed_node(inode);
+ if (!node)
+ return;
+
+ mutex_lock(&node->mutex);
+ item = __btrfs_first_delayed_insertion_item(node);
+ while (item) {
+ /*
+ * It's possible that the item is already in a log list. This
+ * can happen in case two tasks are trying to log the same
+ * directory. For example if we have tasks A and task B:
+ *
+ * Task A collected the delayed items into a log list while
+ * under the inode's log_mutex (at btrfs_log_inode()), but it
+ * only releases the items after logging the inodes they point
+ * to (if they are new inodes), which happens after unlocking
+ * the log mutex;
+ *
+ * Task B enters btrfs_log_inode() and acquires the log_mutex
+ * of the same directory inode, before task B releases the
+ * delayed items. This can happen for example when logging some
+ * inode we need to trigger logging of its parent directory, so
+ * logging two files that have the same parent directory can
+ * lead to this.
+ *
+ * If this happens, just ignore delayed items already in a log
+ * list. All the tasks logging the directory are under a log
+ * transaction and whichever finishes first can not sync the log
+ * before the other completes and leaves the log transaction.
+ */
+ if (!item->logged && list_empty(&item->log_list)) {
+ refcount_inc(&item->refs);
+ list_add_tail(&item->log_list, ins_list);
+ }
+ item = __btrfs_next_delayed_item(item);
+ }
+
+ item = __btrfs_first_delayed_deletion_item(node);
+ while (item) {
+ /* It may be non-empty, for the same reason mentioned above. */
+ if (!item->logged && list_empty(&item->log_list)) {
+ refcount_inc(&item->refs);
+ list_add_tail(&item->log_list, del_list);
+ }
+ item = __btrfs_next_delayed_item(item);
+ }
+ mutex_unlock(&node->mutex);
+
+ /*
+ * We are called during inode logging, which means the inode is in use
+ * and can not be evicted before we finish logging the inode. So we never
+ * have the last reference on the delayed inode.
+ * Also, we don't use btrfs_release_delayed_node() because that would
+ * requeue the delayed inode (change its order in the list of prepared
+ * nodes) and we don't want to do such change because we don't create or
+ * delete delayed items.
+ */
+ ASSERT(refcount_read(&node->refs) > 1);
+ refcount_dec(&node->refs);
+}
+
+void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list)
+{
+ struct btrfs_delayed_node *node;
+ struct btrfs_delayed_item *item;
+ struct btrfs_delayed_item *next;
+
+ node = btrfs_get_delayed_node(inode);
+ if (!node)
+ return;
+
+ mutex_lock(&node->mutex);
+
+ list_for_each_entry_safe(item, next, ins_list, log_list) {
+ item->logged = true;
+ list_del_init(&item->log_list);
+ if (refcount_dec_and_test(&item->refs))
+ kfree(item);
+ }
+
+ list_for_each_entry_safe(item, next, del_list, log_list) {
+ item->logged = true;
+ list_del_init(&item->log_list);
+ if (refcount_dec_and_test(&item->refs))
+ kfree(item);
+ }
+
+ mutex_unlock(&node->mutex);
+
+ /*
+ * We are called during inode logging, which means the inode is in use
+ * and can not be evicted before we finish logging the inode. So we never
+ * have the last reference on the delayed inode.
+ * Also, we don't use btrfs_release_delayed_node() because that would
+ * requeue the delayed inode (change its order in the list of prepared
+ * nodes) and we don't want to do such change because we don't create or
+ * delete delayed items.
+ */
+ ASSERT(refcount_read(&node->refs) > 1);
+ refcount_dec(&node->refs);
+}
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 9795dc295a18..4f21daa3dbc7 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -16,9 +16,10 @@
#include <linux/refcount.h>
#include "ctree.h"
-/* types of the delayed item */
-#define BTRFS_DELAYED_INSERTION_ITEM 1
-#define BTRFS_DELAYED_DELETION_ITEM 2
+enum btrfs_delayed_item_type {
+ BTRFS_DELAYED_INSERTION_ITEM,
+ BTRFS_DELAYED_DELETION_ITEM
+};
struct btrfs_delayed_root {
spinlock_t lock;
@@ -73,14 +74,27 @@ struct btrfs_delayed_node {
struct btrfs_delayed_item {
struct rb_node rb_node;
- struct btrfs_key key;
+ /* Offset value of the corresponding dir index key. */
+ u64 index;
struct list_head tree_list; /* used for batch insert/delete items */
struct list_head readdir_list; /* used for readdir items */
+ /*
+ * Used when logging a directory.
+ * Insertions and deletions to this list are protected by the parent
+ * delayed node's mutex.
+ */
+ struct list_head log_list;
u64 bytes_reserved;
struct btrfs_delayed_node *delayed_node;
refcount_t refs;
- int ins_or_del;
- u32 data_len;
+ enum btrfs_delayed_item_type type:8;
+ /*
+ * Track if this delayed item was already logged.
+ * Protected by the mutex of the parent delayed inode.
+ */
+ bool logged;
+ /* The maximum leaf size is 64K, so u16 is more than enough. */
+ u16 data_len;
char data[];
};
@@ -99,7 +113,7 @@ static inline void btrfs_init_delayed_root(
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
const char *name, int name_len,
struct btrfs_inode *dir,
- struct btrfs_disk_key *disk_key, u8 type,
+ struct btrfs_disk_key *disk_key, u8 flags,
u64 index);
int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
@@ -144,6 +158,14 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
struct list_head *ins_list);
+/* Used during directory logging. */
+void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list);
+void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list);
+
/* for init */
int __init btrfs_delayed_inode_init(void);
void __cold btrfs_delayed_inode_exit(void);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 36a3debe9493..573ebab886e2 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -6,12 +6,14 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/sort.h>
+#include "messages.h"
#include "ctree.h"
#include "delayed-ref.h"
#include "transaction.h"
#include "qgroup.h"
#include "space-info.h"
#include "tree-mod-log.h"
+#include "fs.h"
struct kmem_cache *btrfs_delayed_ref_head_cachep;
struct kmem_cache *btrfs_delayed_tree_ref_cachep;
@@ -69,14 +71,14 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
return btrfs_check_space_for_delayed_refs(trans->fs_info);
}
-/**
- * Release a ref head's reservation
+/*
+ * Release a ref head's reservation.
*
* @fs_info: the filesystem
* @nr: number of items to drop
*
- * This drops the delayed ref head's count from the delayed refs rsv and frees
- * any excess reservation we had.
+ * Drops the delayed ref head's count from the delayed refs rsv and free any
+ * excess reservation we had.
*/
void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
{
@@ -102,8 +104,7 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
}
/*
- * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
- * @trans - the trans that may have generated delayed refs
+ * Adjust the size of the delayed refs rsv.
*
* This is to be called anytime we may have adjusted trans->delayed_ref_updates,
* it'll calculate the additional size and add it to the delayed_refs_rsv.
@@ -137,8 +138,8 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
trans->delayed_ref_updates = 0;
}
-/**
- * Transfer bytes to our delayed refs rsv
+/*
+ * Transfer bytes to our delayed refs rsv.
*
* @fs_info: the filesystem
* @src: source block rsv to transfer from
@@ -186,8 +187,8 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
delayed_refs_rsv->space_info, to_free);
}
-/**
- * Refill based on our delayed refs usage
+/*
+ * Refill based on our delayed refs usage.
*
* @fs_info: the filesystem
* @flush: control how we can flush for this reservation.
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 41cddd3ff059..78696d331639 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -18,11 +18,13 @@
#include "volumes.h"
#include "async-thread.h"
#include "check-integrity.h"
-#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "zoned.h"
#include "block-group.h"
+#include "fs.h"
+#include "accessors.h"
+#include "scrub.h"
/*
* Device replace overview
@@ -246,7 +248,6 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
struct block_device *bdev;
- struct rcu_string *name;
u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
@@ -290,19 +291,12 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
}
- device = btrfs_alloc_device(NULL, &devid, NULL);
+ device = btrfs_alloc_device(NULL, &devid, NULL, device_path);
if (IS_ERR(device)) {
ret = PTR_ERR(device);
goto error;
}
- name = rcu_string_strdup(device_path, GFP_KERNEL);
- if (!name) {
- btrfs_free_device(device);
- ret = -ENOMEM;
- goto error;
- }
- rcu_assign_pointer(device->name, name);
ret = lookup_bdev(device_path, &device->devt);
if (ret)
goto error;
@@ -456,14 +450,6 @@ out:
return ret;
}
-static char* btrfs_dev_name(struct btrfs_device *device)
-{
- if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
- return "<missing disk>";
- else
- return rcu_str_deref(device->name);
-}
-
static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
struct btrfs_device *src_dev)
{
@@ -545,10 +531,7 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
if (!cache)
continue;
- spin_lock(&cache->lock);
- cache->to_copy = 1;
- spin_unlock(&cache->lock);
-
+ set_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
btrfs_put_block_group(cache);
}
if (iter_ret < 0)
@@ -577,7 +560,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
return true;
spin_lock(&cache->lock);
- if (cache->removed) {
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
spin_unlock(&cache->lock);
return true;
}
@@ -610,9 +593,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
}
/* Last stripe on this device */
- spin_lock(&cache->lock);
- cache->to_copy = 0;
- spin_unlock(&cache->lock);
+ clear_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
return true;
}
@@ -684,7 +665,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
"dev_replace from %s (devid %llu) to %s started",
btrfs_dev_name(src_device),
src_device->devid,
- rcu_str_deref(tgt_device->name));
+ btrfs_dev_name(tgt_device));
/*
* from now on, the writes to the srcdev are all duplicated to
@@ -943,7 +924,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
"btrfs_scrub_dev(%s, %llu, %s) failed %d",
btrfs_dev_name(src_device),
src_device->devid,
- rcu_str_deref(tgt_device->name), scrub_ret);
+ btrfs_dev_name(tgt_device), scrub_ret);
error:
up_write(&dev_replace->rwsem);
mutex_unlock(&fs_info->chunk_mutex);
@@ -961,7 +942,7 @@ error:
"dev_replace from %s (devid %llu) to %s finished",
btrfs_dev_name(src_device),
src_device->devid,
- rcu_str_deref(tgt_device->name));
+ btrfs_dev_name(tgt_device));
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state);
tgt_device->devid = src_device->devid;
src_device->devid = BTRFS_DEV_REPLACE_DEVID;
@@ -1288,11 +1269,6 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
return 1;
}
-void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
-{
- percpu_counter_inc(&fs_info->dev_replace.bio_counter);
-}
-
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
{
percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount);
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 3911049a5f23..675082ccec89 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -7,6 +7,10 @@
#define BTRFS_DEV_REPLACE_H
struct btrfs_ioctl_dev_replace_args;
+struct btrfs_fs_info;
+struct btrfs_trans_handle;
+struct btrfs_dev_replace;
+struct btrfs_block_group;
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans);
@@ -21,5 +25,13 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
struct btrfs_block_group *cache,
u64 physical);
+void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount);
+
+static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
+{
+ btrfs_bio_counter_sub(fs_info, 1);
+}
+
#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 72fb2c518a2b..082eb0e19598 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -3,9 +3,12 @@
* Copyright (C) 2007 Oracle. All rights reserved.
*/
+#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
+#include "accessors.h"
+#include "dir-item.h"
/*
* insert a name into a directory, doing overflow properly if there is a hash
@@ -81,7 +84,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
btrfs_cpu_key_to_disk(&disk_key, &location);
btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
- btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR);
+ btrfs_set_dir_flags(leaf, dir_item, BTRFS_FT_XATTR);
btrfs_set_dir_name_len(leaf, dir_item, name_len);
btrfs_set_dir_transid(leaf, dir_item, trans->transid);
btrfs_set_dir_data_len(leaf, dir_item, data_len);
@@ -103,8 +106,8 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
* to use for the second index (if one is created).
* Will return 0 or -ENOMEM
*/
-int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
- int name_len, struct btrfs_inode *dir,
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
+ const struct fscrypt_str *name, struct btrfs_inode *dir,
struct btrfs_key *location, u8 type, u64 index)
{
int ret = 0;
@@ -120,7 +123,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
key.objectid = btrfs_ino(dir);
key.type = BTRFS_DIR_ITEM_KEY;
- key.offset = btrfs_name_hash(name, name_len);
+ key.offset = btrfs_name_hash(name->name, name->len);
path = btrfs_alloc_path();
if (!path)
@@ -128,9 +131,9 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
btrfs_cpu_key_to_disk(&disk_key, location);
- data_size = sizeof(*dir_item) + name_len;
+ data_size = sizeof(*dir_item) + name->len;
dir_item = insert_with_overflow(trans, root, path, &key, data_size,
- name, name_len);
+ name->name, name->len);
if (IS_ERR(dir_item)) {
ret = PTR_ERR(dir_item);
if (ret == -EEXIST)
@@ -138,15 +141,18 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
goto out_free;
}
+ if (IS_ENCRYPTED(&dir->vfs_inode))
+ type |= BTRFS_FT_ENCRYPTED;
+
leaf = path->nodes[0];
btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
- btrfs_set_dir_type(leaf, dir_item, type);
+ btrfs_set_dir_flags(leaf, dir_item, type);
btrfs_set_dir_data_len(leaf, dir_item, 0);
- btrfs_set_dir_name_len(leaf, dir_item, name_len);
+ btrfs_set_dir_name_len(leaf, dir_item, name->len);
btrfs_set_dir_transid(leaf, dir_item, trans->transid);
name_ptr = (unsigned long)(dir_item + 1);
- write_extent_buffer(leaf, name, name_ptr, name_len);
+ write_extent_buffer(leaf, name->name, name_ptr, name->len);
btrfs_mark_buffer_dirty(leaf);
second_insert:
@@ -157,7 +163,7 @@ second_insert:
}
btrfs_release_path(path);
- ret2 = btrfs_insert_delayed_dir_index(trans, name, name_len, dir,
+ ret2 = btrfs_insert_delayed_dir_index(trans, name->name, name->len, dir,
&disk_key, type, index);
out_free:
btrfs_free_path(path);
@@ -206,7 +212,7 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir(
struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
- const char *name, int name_len,
+ const struct fscrypt_str *name,
int mod)
{
struct btrfs_key key;
@@ -214,9 +220,10 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
key.objectid = dir;
key.type = BTRFS_DIR_ITEM_KEY;
- key.offset = btrfs_name_hash(name, name_len);
+ key.offset = btrfs_name_hash(name->name, name->len);
- di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+ di = btrfs_lookup_match_dir(trans, root, path, &key, name->name,
+ name->len, mod);
if (IS_ERR(di) && PTR_ERR(di) == -ENOENT)
return NULL;
@@ -224,7 +231,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
}
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
- const char *name, int name_len)
+ const struct fscrypt_str *name)
{
int ret;
struct btrfs_key key;
@@ -240,9 +247,10 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
key.objectid = dir;
key.type = BTRFS_DIR_ITEM_KEY;
- key.offset = btrfs_name_hash(name, name_len);
+ key.offset = btrfs_name_hash(name->name, name->len);
- di = btrfs_lookup_match_dir(NULL, root, path, &key, name, name_len, 0);
+ di = btrfs_lookup_match_dir(NULL, root, path, &key, name->name,
+ name->len, 0);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
/* Nothing found, we're safe */
@@ -262,11 +270,8 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
goto out;
}
- /*
- * see if there is room in the item to insert this
- * name
- */
- data_size = sizeof(*di) + name_len;
+ /* See if there is room in the item to insert this name. */
+ data_size = sizeof(*di) + name->len;
leaf = path->nodes[0];
slot = path->slots[0];
if (data_size + btrfs_item_size(leaf, slot) +
@@ -303,8 +308,7 @@ struct btrfs_dir_item *
btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
- u64 index, const char *name, int name_len,
- int mod)
+ u64 index, const struct fscrypt_str *name, int mod)
{
struct btrfs_dir_item *di;
struct btrfs_key key;
@@ -313,7 +317,8 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = index;
- di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+ di = btrfs_lookup_match_dir(trans, root, path, &key, name->name,
+ name->len, mod);
if (di == ERR_PTR(-ENOENT))
return NULL;
@@ -321,9 +326,8 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
}
struct btrfs_dir_item *
-btrfs_search_dir_index_item(struct btrfs_root *root,
- struct btrfs_path *path, u64 dirid,
- const char *name, int name_len)
+btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path,
+ u64 dirid, const struct fscrypt_str *name)
{
struct btrfs_dir_item *di;
struct btrfs_key key;
@@ -338,7 +342,7 @@ btrfs_search_dir_index_item(struct btrfs_root *root,
break;
di = btrfs_match_dir_item_name(root->fs_info, path,
- name, name_len);
+ name->name, name->len);
if (di)
return di;
}
diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h
new file mode 100644
index 000000000000..aab4b7cc7fa0
--- /dev/null
+++ b/fs/btrfs/dir-item.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_DIR_ITEM_H
+#define BTRFS_DIR_ITEM_H
+
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+ const struct fscrypt_str *name);
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
+ const struct fscrypt_str *name, struct btrfs_inode *dir,
+ struct btrfs_key *location, u8 type, u64 index);
+struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ const struct fscrypt_str *name, int mod);
+struct btrfs_dir_item *btrfs_lookup_dir_index_item(
+ struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ u64 index, const struct fscrypt_str *name, int mod);
+struct btrfs_dir_item *btrfs_search_dir_index_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 dirid,
+ const struct fscrypt_str *name);
+int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_dir_item *di);
+int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid,
+ const char *name, u16 name_len,
+ const void *data, u16 data_len);
+struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ const char *name, u16 name_len,
+ int mod);
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ const char *name,
+ int name_len);
+
+#endif
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index e1b7bd927d69..ff2e524d9937 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -11,6 +11,7 @@
#include "block-group.h"
#include "discard.h"
#include "free-space-cache.h"
+#include "fs.h"
/*
* This contains the logic to handle async discard.
@@ -61,7 +62,7 @@
#define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL)
#define BTRFS_DISCARD_MAX_IOPS (10U)
-/* Montonically decreasing minimum length filters after index 0 */
+/* Monotonically decreasing minimum length filters after index 0 */
static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
0,
BTRFS_ASYNC_DISCARD_MAX_FILTER,
@@ -146,10 +147,11 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
return running;
}
-/**
- * find_next_block_group - find block_group that's up next for discarding
- * @discard_ctl: discard control
- * @now: current time
+/*
+ * Find block_group that's up next for discarding.
+ *
+ * @discard_ctl: discard control
+ * @now: current time
*
* Iterate over the discard lists to find the next block_group up for
* discarding checking the discard_eligible_time of block_group.
@@ -184,17 +186,17 @@ static struct btrfs_block_group *find_next_block_group(
return ret_block_group;
}
-/**
- * Wrap find_next_block_group()
+/*
+ * Look up next block group and set it for use.
*
* @discard_ctl: discard control
* @discard_state: the discard_state of the block_group after state management
* @discard_index: the discard_index of the block_group after state management
* @now: time when discard was invoked, in ns
*
- * This wraps find_next_block_group() and sets the block_group to be in use.
- * discard_state's control flow is managed here. Variables related to
- * discard_state are reset here as needed (eg discard_cursor). @discard_state
+ * Wrap find_next_block_group() and set the block_group to be in use.
+ * @discard_state's control flow is managed here. Variables related to
+ * @discard_state are reset here as needed (eg. @discard_cursor). @discard_state
* and @discard_index are remembered as it may change while we're discarding,
* but we want the discard to execute in the context determined here.
*/
@@ -233,10 +235,11 @@ again:
return block_group;
}
-/**
- * btrfs_discard_check_filter - updates a block groups filters
- * @block_group: block group of interest
- * @bytes: recently freed region size after coalescing
+/*
+ * Update a block group's filters.
+ *
+ * @block_group: block group of interest
+ * @bytes: recently freed region size after coalescing
*
* Async discard maintains multiple lists with progressively smaller filters
* to prioritize discarding based on size. Should a free space that matches
@@ -271,8 +274,9 @@ void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
}
}
-/**
- * btrfs_update_discard_index - moves a block group along the discard lists
+/*
+ * Move a block group along the discard lists.
+ *
* @discard_ctl: discard control
* @block_group: block_group of interest
*
@@ -291,13 +295,14 @@ static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
add_to_discard_list(discard_ctl, block_group);
}
-/**
- * btrfs_discard_cancel_work - remove a block_group from the discard lists
+/*
+ * Remove a block_group from the discard lists.
+ *
* @discard_ctl: discard control
* @block_group: block_group of interest
*
- * This removes @block_group from the discard lists. If necessary, it waits on
- * the current work and then reschedules the delayed work.
+ * Remove @block_group from the discard lists. If necessary, wait on the
+ * current work and then reschedule the delayed work.
*/
void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
@@ -308,12 +313,13 @@ void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
}
}
-/**
- * btrfs_discard_queue_work - handles queuing the block_groups
+/*
+ * Handles queuing the block_groups.
+ *
* @discard_ctl: discard control
* @block_group: block_group of interest
*
- * This maintains the LRU order of the discard lists.
+ * Maintain the LRU order of the discard lists.
*/
void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
@@ -383,7 +389,8 @@ static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
}
/*
- * btrfs_discard_schedule_work - responsible for scheduling the discard work
+ * Responsible for scheduling the discard work.
+ *
* @discard_ctl: discard control
* @override: override the current timer
*
@@ -401,15 +408,16 @@ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
spin_unlock(&discard_ctl->lock);
}
-/**
- * btrfs_finish_discard_pass - determine next step of a block_group
+/*
+ * Determine next step of a block_group.
+ *
* @discard_ctl: discard control
* @block_group: block_group of interest
*
- * This determines the next step for a block group after it's finished going
- * through a pass on a discard list. If it is unused and fully trimmed, we can
- * mark it unused and send it to the unused_bgs path. Otherwise, pass it onto
- * the appropriate filter list or let it fall off.
+ * Determine the next step for a block group after it's finished going through
+ * a pass on a discard list. If it is unused and fully trimmed, we can mark it
+ * unused and send it to the unused_bgs path. Otherwise, pass it onto the
+ * appropriate filter list or let it fall off.
*/
static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
@@ -426,12 +434,13 @@ static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
}
}
-/**
- * btrfs_discard_workfn - discard work function
+/*
+ * Discard work queue callback
+ *
* @work: work
*
- * This finds the next block_group to start discarding and then discards a
- * single region. It does this in a two-pass fashion: first extents and second
+ * Find the next block_group to start discarding and then discard a single
+ * region. It does this in a two-pass fashion: first extents and second
* bitmaps. Completely discarded block groups are sent to the unused_bgs path.
*/
static void btrfs_discard_workfn(struct work_struct *work)
@@ -507,11 +516,12 @@ static void btrfs_discard_workfn(struct work_struct *work)
spin_unlock(&discard_ctl->lock);
}
-/**
- * btrfs_run_discard_work - determines if async discard should be running
+/*
+ * Determine if async discard should be running.
+ *
* @discard_ctl: discard control
*
- * Checks if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
+ * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
*/
bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl)
{
@@ -523,8 +533,9 @@ bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl)
test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
}
-/**
- * btrfs_discard_calc_delay - recalculate the base delay
+/*
+ * Recalculate the base delay.
+ *
* @discard_ctl: discard control
*
* Recalculate the base delay which is based off the total number of
@@ -545,7 +556,7 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
spin_lock(&discard_ctl->lock);
/*
- * The following is to fix a potential -1 discrepenancy that we're not
+ * The following is to fix a potential -1 discrepancy that we're not
* sure how to reproduce. But given that this is the only place that
* utilizes these numbers and this is only called by from
* btrfs_finish_extent_commit() which is synchronized, we can correct
@@ -578,13 +589,14 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
spin_unlock(&discard_ctl->lock);
}
-/**
- * btrfs_discard_update_discardable - propagate discard counters
+/*
+ * Propagate discard counters.
+ *
* @block_group: block_group of interest
*
- * This propagates deltas of counters up to the discard_ctl. It maintains a
- * current counter and a previous counter passing the delta up to the global
- * stat. Then the current counter value becomes the previous counter value.
+ * Propagate deltas of counters up to the discard_ctl. It maintains a current
+ * counter and a previous counter passing the delta up to the global stat.
+ * Then the current counter value becomes the previous counter value.
*/
void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
{
@@ -619,8 +631,9 @@ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
}
}
-/**
- * btrfs_discard_punt_unused_bgs_list - punt unused_bgs list to discard lists
+/*
+ * Punt unused_bgs list to discard lists.
+ *
* @fs_info: fs_info of interest
*
* The unused_bgs list needs to be punted to the discard lists because the
@@ -644,8 +657,9 @@ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->unused_bgs_lock);
}
-/**
- * btrfs_discard_purge_list - purge discard lists
+/*
+ * Purge discard lists.
+ *
* @discard_ctl: discard control
*
* If we are disabling async discard, we may have intercepted block groups that
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2633137c3e9f..0888d484df80 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -23,7 +23,7 @@
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "volumes.h"
+#include "bio.h"
#include "print-tree.h"
#include "locking.h"
#include "tree-log.h"
@@ -43,6 +43,15 @@
#include "space-info.h"
#include "zoned.h"
#include "subpage.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "root-tree.h"
+#include "defrag.h"
+#include "uuid-tree.h"
+#include "relocation.h"
+#include "scrub.h"
+#include "super.h"
#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
BTRFS_HEADER_FLAG_RELOC |\
@@ -75,12 +84,12 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
* just before they are sent down the IO stack.
*/
struct async_submit_bio {
- struct inode *inode;
+ struct btrfs_inode *inode;
struct bio *bio;
- extent_submit_bio_start_t *submit_bio_start;
+ enum btrfs_wq_submit_cmd submit_cmd;
int mirror_num;
- /* Optional parameter for submit_bio_start used by direct io */
+ /* Optional parameter for used by direct io */
u64 dio_file_offset;
struct btrfs_work work;
blk_status_t status;
@@ -131,8 +140,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
if (atomic)
return -EAGAIN;
- lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
- &cached_state);
+ lock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state);
if (extent_buffer_uptodate(eb) &&
btrfs_header_generation(eb) == parent_transid) {
ret = 0;
@@ -145,8 +153,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
ret = 1;
clear_extent_buffer_uptodate(eb);
out:
- unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
- &cached_state);
+ unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
+ &cached_state);
return ret;
}
@@ -167,11 +175,9 @@ static bool btrfs_supported_super_csum(u16 csum_type)
* Return 0 if the superblock checksum type matches the checksum value of that
* algorithm. Pass the raw disk superblock data.
*/
-static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
- char *raw_disk_sb)
+int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
+ const struct btrfs_super_block *disk_sb)
{
- struct btrfs_super_block *disk_sb =
- (struct btrfs_super_block *)raw_disk_sb;
char result[BTRFS_CSUM_SIZE];
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
@@ -182,7 +188,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
* BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
* filled with zeros and is included in the checksum.
*/
- crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
+ crypto_shash_digest(shash, (const u8 *)disk_sb + BTRFS_CSUM_SIZE,
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);
if (memcmp(disk_sb->csum, result, fs_info->csum_size))
@@ -249,40 +255,54 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level,
return ret;
}
+static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
+ int mirror_num)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ u64 start = eb->start;
+ int i, num_pages = num_extent_pages(eb);
+ int ret = 0;
+
+ if (sb_rdonly(fs_info->sb))
+ return -EROFS;
+
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = eb->pages[i];
+
+ ret = btrfs_repair_io_failure(fs_info, 0, start, PAGE_SIZE,
+ start, p, start - page_offset(p), mirror_num);
+ if (ret)
+ break;
+ start += PAGE_SIZE;
+ }
+
+ return ret;
+}
+
/*
* helper to read a given tree block, doing retries as required when
* the checksums don't match and we have alternate mirrors to try.
*
- * @parent_transid: expected transid, skip check if 0
- * @level: expected level, mandatory check
- * @first_key: expected key of first slot, skip check if NULL
+ * @check: expected tree parentness check, see the comments of the
+ * structure for details.
*/
int btrfs_read_extent_buffer(struct extent_buffer *eb,
- u64 parent_transid, int level,
- struct btrfs_key *first_key)
+ struct btrfs_tree_parent_check *check)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct extent_io_tree *io_tree;
int failed = 0;
int ret;
int num_copies = 0;
int mirror_num = 0;
int failed_mirror = 0;
- io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
+ ASSERT(check);
+
while (1) {
clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
- ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
- if (!ret) {
- if (verify_parent_transid(io_tree, eb,
- parent_transid, 0))
- ret = -EIO;
- else if (btrfs_verify_level_key(eb, level,
- first_key, parent_transid))
- ret = -EUCLEAN;
- else
- break;
- }
+ ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num, check);
+ if (!ret)
+ break;
num_copies = btrfs_num_copies(fs_info,
eb->start, eb->len);
@@ -458,7 +478,8 @@ static int check_tree_block_fsid(struct extent_buffer *eb)
}
/* Do basic extent buffer checks at read time */
-static int validate_extent_buffer(struct extent_buffer *eb)
+static int validate_extent_buffer(struct extent_buffer *eb,
+ struct btrfs_tree_parent_check *check)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
u64 found_start;
@@ -468,6 +489,8 @@ static int validate_extent_buffer(struct extent_buffer *eb)
const u8 *header_csum;
int ret = 0;
+ ASSERT(check);
+
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
btrfs_err_rl(fs_info,
@@ -506,6 +529,45 @@ static int validate_extent_buffer(struct extent_buffer *eb)
goto out;
}
+ if (found_level != check->level) {
+ ret = -EIO;
+ goto out;
+ }
+ if (unlikely(check->transid &&
+ btrfs_header_generation(eb) != check->transid)) {
+ btrfs_err_rl(eb->fs_info,
+"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
+ eb->start, eb->read_mirror, check->transid,
+ btrfs_header_generation(eb));
+ ret = -EIO;
+ goto out;
+ }
+ if (check->has_first_key) {
+ struct btrfs_key *expect_key = &check->first_key;
+ struct btrfs_key found_key;
+
+ if (found_level)
+ btrfs_node_key_to_cpu(eb, &found_key, 0);
+ else
+ btrfs_item_key_to_cpu(eb, &found_key, 0);
+ if (unlikely(btrfs_comp_cpu_keys(expect_key, &found_key))) {
+ btrfs_err(fs_info,
+"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
+ eb->start, check->transid,
+ expect_key->objectid,
+ expect_key->type, expect_key->offset,
+ found_key.objectid, found_key.type,
+ found_key.offset);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+ if (check->owner_root) {
+ ret = btrfs_check_eb_owner(eb, check->owner_root);
+ if (ret < 0)
+ goto out;
+ }
+
/*
* If this is a leaf block and it is corrupt, set the corrupt bit so
* that we don't try and read the other copies of this block, just
@@ -530,13 +592,15 @@ out:
}
static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
- int mirror)
+ int mirror, struct btrfs_tree_parent_check *check)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
struct extent_buffer *eb;
bool reads_done;
int ret = 0;
+ ASSERT(check);
+
/*
* We don't allow bio merge for subpage metadata read, so we should
* only get one eb for each endio hook.
@@ -560,7 +624,7 @@ static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
ret = -EIO;
goto err;
}
- ret = validate_extent_buffer(eb);
+ ret = validate_extent_buffer(eb, check);
if (ret < 0)
goto err;
@@ -590,7 +654,8 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
ASSERT(page->private);
if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
- return validate_subpage_buffer(page, start, end, mirror);
+ return validate_subpage_buffer(page, start, end, mirror,
+ &bbio->parent_check);
eb = (struct extent_buffer *)page->private;
@@ -609,7 +674,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
ret = -EIO;
goto err;
}
- ret = validate_extent_buffer(eb);
+ ret = validate_extent_buffer(eb, &bbio->parent_check);
err:
if (ret) {
/*
@@ -631,8 +696,18 @@ static void run_one_async_start(struct btrfs_work *work)
blk_status_t ret;
async = container_of(work, struct async_submit_bio, work);
- ret = async->submit_bio_start(async->inode, async->bio,
- async->dio_file_offset);
+ switch (async->submit_cmd) {
+ case WQ_SUBMIT_METADATA:
+ ret = btree_submit_bio_start(async->bio);
+ break;
+ case WQ_SUBMIT_DATA:
+ ret = btrfs_submit_bio_start(async->inode, async->bio);
+ break;
+ case WQ_SUBMIT_DATA_DIO:
+ ret = btrfs_submit_bio_start_direct_io(async->inode,
+ async->bio, async->dio_file_offset);
+ break;
+ }
if (ret)
async->status = ret;
}
@@ -647,16 +722,14 @@ static void run_one_async_start(struct btrfs_work *work)
*/
static void run_one_async_done(struct btrfs_work *work)
{
- struct async_submit_bio *async;
- struct inode *inode;
-
- async = container_of(work, struct async_submit_bio, work);
- inode = async->inode;
+ struct async_submit_bio *async =
+ container_of(work, struct async_submit_bio, work);
+ struct btrfs_inode *inode = async->inode;
+ struct btrfs_bio *bbio = btrfs_bio(async->bio);
/* If an error occurred we just want to clean up the bio and move on */
if (async->status) {
- async->bio->bi_status = async->status;
- bio_endio(async->bio);
+ btrfs_bio_end_io(bbio, async->status);
return;
}
@@ -666,7 +739,7 @@ static void run_one_async_done(struct btrfs_work *work)
* This changes nothing when cgroups aren't in use.
*/
async->bio->bi_opf |= REQ_CGROUP_PUNT;
- btrfs_submit_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
+ btrfs_submit_bio(inode->root->fs_info, async->bio, async->mirror_num);
}
static void run_one_async_free(struct btrfs_work *work)
@@ -684,11 +757,10 @@ static void run_one_async_free(struct btrfs_work *work)
* - true if the work has been succesfuly submitted
* - false in case of error
*/
-bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
- u64 dio_file_offset,
- extent_submit_bio_start_t *submit_bio_start)
+bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num,
+ u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd)
{
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct async_submit_bio *async;
async = kmalloc(sizeof(*async), GFP_NOFS);
@@ -698,7 +770,7 @@ bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
async->inode = inode;
async->bio = bio;
async->mirror_num = mirror_num;
- async->submit_bio_start = submit_bio_start;
+ async->submit_cmd = cmd;
btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
run_one_async_free);
@@ -732,8 +804,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
return errno_to_blk_status(ret);
}
-static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
- u64 dio_file_offset)
+blk_status_t btree_submit_bio_start(struct bio *bio)
{
/*
* when we're called for a write, we're already in the async
@@ -754,12 +825,14 @@ static bool should_async_write(struct btrfs_fs_info *fs_info,
return true;
}
-void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num)
+void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
blk_status_t ret;
bio->bi_opf |= REQ_META;
+ bbio->is_metadata = 1;
if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
btrfs_submit_bio(fs_info, bio, mirror_num);
@@ -770,14 +843,13 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_
* Kthread helpers are used to submit writes so that checksumming can
* happen in parallel across all CPUs.
*/
- if (should_async_write(fs_info, BTRFS_I(inode)) &&
- btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btree_submit_bio_start))
+ if (should_async_write(fs_info, inode) &&
+ btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA))
return;
ret = btree_csum_one_bio(bio);
if (ret) {
- bio->bi_status = ret;
- bio_endio(bio);
+ btrfs_bio_end_io(bbio, ret);
return;
}
@@ -924,28 +996,28 @@ struct extent_buffer *btrfs_find_create_tree_block(
* Read tree block at logical address @bytenr and do variant basic but critical
* verification.
*
- * @owner_root: the objectid of the root owner for this block.
- * @parent_transid: expected transid of this tree block, skip check if 0
- * @level: expected level, mandatory check
- * @first_key: expected key in slot 0, skip check if NULL
+ * @check: expected tree parentness check, see comments of the
+ * structure for details.
*/
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 owner_root, u64 parent_transid,
- int level, struct btrfs_key *first_key)
+ struct btrfs_tree_parent_check *check)
{
struct extent_buffer *buf = NULL;
int ret;
- buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
+ ASSERT(check);
+
+ buf = btrfs_find_create_tree_block(fs_info, bytenr, check->owner_root,
+ check->level);
if (IS_ERR(buf))
return buf;
- ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key);
+ ret = btrfs_read_extent_buffer(buf, check);
if (ret) {
free_extent_buffer_stale(buf);
return ERR_PTR(ret);
}
- if (btrfs_check_eb_owner(buf, owner_root)) {
+ if (btrfs_check_eb_owner(buf, check->owner_root)) {
free_extent_buffer_stale(buf);
return ERR_PTR(-EUCLEAN);
}
@@ -1032,9 +1104,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->anon_dev = 0;
if (!dummy) {
extent_io_tree_init(fs_info, &root->dirty_log_pages,
- IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
+ IO_TREE_ROOT_DIRTY_LOG_PAGES);
extent_io_tree_init(fs_info, &root->log_csum_range,
- IO_TREE_LOG_CSUM_RANGE, NULL);
+ IO_TREE_LOG_CSUM_RANGE);
}
spin_lock_init(&root->root_item_lock);
@@ -1172,6 +1244,13 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
return btrfs_global_root(fs_info, &key);
}
+struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
+ return fs_info->block_group_root;
+ return btrfs_extent_root(fs_info, 0);
+}
+
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid)
{
@@ -1202,7 +1281,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
leaf = NULL;
- goto fail_unlock;
+ goto fail;
}
root->node = leaf;
@@ -1237,9 +1316,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
return root;
-fail_unlock:
- if (leaf)
- btrfs_tree_unlock(leaf);
fail:
btrfs_put_root(root);
@@ -1357,6 +1433,7 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
struct btrfs_key *key)
{
struct btrfs_root *root;
+ struct btrfs_tree_parent_check check = { 0 };
struct btrfs_fs_info *fs_info = tree_root->fs_info;
u64 generation;
int ret;
@@ -1376,9 +1453,11 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
generation = btrfs_root_generation(&root->root_item);
level = btrfs_root_level(&root->root_item);
- root->node = read_tree_block(fs_info,
- btrfs_root_bytenr(&root->root_item),
- key->objectid, generation, level, NULL);
+ check.level = level;
+ check.transid = generation;
+ check.owner_root = key->objectid;
+ root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item),
+ &check);
if (IS_ERR(root->node)) {
ret = PTR_ERR(root->node);
root->node = NULL;
@@ -1524,6 +1603,9 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
if (objectid == BTRFS_UUID_TREE_OBJECTID)
return btrfs_grab_root(fs_info->uuid_root) ?
fs_info->uuid_root : ERR_PTR(-ENOENT);
+ if (objectid == BTRFS_BLOCK_GROUP_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->block_group_root) ?
+ fs_info->block_group_root : ERR_PTR(-ENOENT);
if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
struct btrfs_root *root = btrfs_global_root(fs_info, &key);
@@ -1980,14 +2062,7 @@ static void backup_super_roots(struct btrfs_fs_info *info)
btrfs_set_backup_chunk_root_level(root_backup,
btrfs_header_level(info->chunk_root->node));
- if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) {
- btrfs_set_backup_block_group_root(root_backup,
- info->block_group_root->node->start);
- btrfs_set_backup_block_group_root_gen(root_backup,
- btrfs_header_generation(info->block_group_root->node));
- btrfs_set_backup_block_group_root_level(root_backup,
- btrfs_header_level(info->block_group_root->node));
- } else {
+ if (!btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE)) {
struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
struct btrfs_root *csum_root = btrfs_csum_root(info, 0);
@@ -2093,8 +2168,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->workers);
if (fs_info->endio_workers)
destroy_workqueue(fs_info->endio_workers);
- if (fs_info->endio_raid56_workers)
- destroy_workqueue(fs_info->endio_raid56_workers);
if (fs_info->rmw_workers)
destroy_workqueue(fs_info->rmw_workers);
if (fs_info->compressed_write_workers)
@@ -2225,6 +2298,8 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
{
struct inode *inode = fs_info->btree_inode;
+ unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID,
+ fs_info->tree_root);
inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
set_nlink(inode, 1);
@@ -2238,8 +2313,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
- IO_TREE_BTREE_INODE_IO, inode);
- BTRFS_I(inode)->io_tree.track_uptodate = false;
+ IO_TREE_BTREE_INODE_IO);
extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
@@ -2247,7 +2321,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
BTRFS_I(inode)->location.type = 0;
BTRFS_I(inode)->location.offset = 0;
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
- btrfs_insert_inode_hash(inode);
+ __insert_inode_hash(inode, hash);
}
static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
@@ -2266,6 +2340,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
fs_info->qgroup_seq = 1;
fs_info->qgroup_ulist = NULL;
fs_info->qgroup_rescan_running = false;
+ fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
mutex_init(&fs_info->qgroup_rescan_lock);
}
@@ -2298,8 +2373,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
alloc_workqueue("btrfs-endio", flags, max_active);
fs_info->endio_meta_workers =
alloc_workqueue("btrfs-endio-meta", flags, max_active);
- fs_info->endio_raid56_workers =
- alloc_workqueue("btrfs-endio-raid56", flags, max_active);
fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
fs_info->endio_write_workers =
btrfs_alloc_workqueue(fs_info, "endio-write", flags,
@@ -2321,7 +2394,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
fs_info->delalloc_workers && fs_info->flush_workers &&
fs_info->endio_workers && fs_info->endio_meta_workers &&
fs_info->compressed_write_workers &&
- fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
+ fs_info->endio_write_workers &&
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
fs_info->caching_workers && fs_info->fixup_workers &&
fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
@@ -2357,6 +2430,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices)
{
int ret;
+ struct btrfs_tree_parent_check check = { 0 };
struct btrfs_root *log_tree_root;
struct btrfs_super_block *disk_super = fs_info->super_copy;
u64 bytenr = btrfs_super_log_root(disk_super);
@@ -2372,10 +2446,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
if (!log_tree_root)
return -ENOMEM;
- log_tree_root->node = read_tree_block(fs_info, bytenr,
- BTRFS_TREE_LOG_OBJECTID,
- fs_info->generation + 1, level,
- NULL);
+ check.level = level;
+ check.transid = fs_info->generation + 1;
+ check.owner_root = BTRFS_TREE_LOG_OBJECTID;
+ log_tree_root->node = read_tree_block(fs_info, bytenr, &check);
if (IS_ERR(log_tree_root->node)) {
btrfs_warn(fs_info, "failed to read log tree");
ret = PTR_ERR(log_tree_root->node);
@@ -2529,10 +2603,24 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
if (ret)
return ret;
- location.objectid = BTRFS_DEV_TREE_OBJECTID;
location.type = BTRFS_ROOT_ITEM_KEY;
location.offset = 0;
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
+ location.objectid = BTRFS_BLOCK_GROUP_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->block_group_root = root;
+ }
+ }
+
+ location.objectid = BTRFS_DEV_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root)) {
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
@@ -2544,7 +2632,9 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
fs_info->dev_root = root;
}
/* Initialize fs_info for all devices in any case */
- btrfs_init_devices_late(fs_info);
+ ret = btrfs_init_devices_late(fs_info);
+ if (ret)
+ goto out;
/*
* This tree can share blocks with some other fs tree during relocation
@@ -2600,8 +2690,8 @@ out:
* 1, 2 2nd and 3rd backup copy
* -1 skip bytenr check
*/
-static int validate_super(struct btrfs_fs_info *fs_info,
- struct btrfs_super_block *sb, int mirror_num)
+int btrfs_validate_super(struct btrfs_fs_info *fs_info,
+ struct btrfs_super_block *sb, int mirror_num)
{
u64 nodesize = btrfs_super_nodesize(sb);
u64 sectorsize = btrfs_super_sectorsize(sb);
@@ -2703,6 +2793,18 @@ static int validate_super(struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
+ /*
+ * Artificial requirement for block-group-tree to force newer features
+ * (free-space-tree, no-holes) so the test matrix is smaller.
+ */
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
+ (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) ||
+ !btrfs_fs_incompat(fs_info, NO_HOLES))) {
+ btrfs_err(fs_info,
+ "block-group-tree feature requires fres-space-tree and no-holes");
+ ret = -EINVAL;
+ }
+
if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
BTRFS_FSID_SIZE) != 0) {
btrfs_err(fs_info,
@@ -2785,7 +2887,7 @@ static int validate_super(struct btrfs_fs_info *fs_info,
*/
static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
{
- return validate_super(fs_info, fs_info->super_copy, 0);
+ return btrfs_validate_super(fs_info, fs_info->super_copy, 0);
}
/*
@@ -2799,7 +2901,7 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
{
int ret;
- ret = validate_super(fs_info, sb, -1);
+ ret = btrfs_validate_super(fs_info, sb, -1);
if (ret < 0)
goto out;
if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
@@ -2825,10 +2927,14 @@ out:
static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level)
{
+ struct btrfs_tree_parent_check check = {
+ .level = level,
+ .transid = gen,
+ .owner_root = root->root_key.objectid
+ };
int ret = 0;
- root->node = read_tree_block(root->fs_info, bytenr,
- root->root_key.objectid, gen, level, NULL);
+ root->node = read_tree_block(root->fs_info, bytenr, &check);
if (IS_ERR(root->node)) {
ret = PTR_ERR(root->node);
root->node = NULL;
@@ -2860,17 +2966,7 @@ static int load_important_roots(struct btrfs_fs_info *fs_info)
btrfs_warn(fs_info, "couldn't read tree root");
return ret;
}
-
- if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
- return 0;
-
- bytenr = btrfs_super_block_group_root(sb);
- gen = btrfs_super_block_group_root_generation(sb);
- level = btrfs_super_block_group_root_level(sb);
- ret = load_super_root(fs_info->block_group_root, bytenr, gen, level);
- if (ret)
- btrfs_warn(fs_info, "couldn't read block group root");
- return ret;
+ return 0;
}
static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
@@ -2882,16 +2978,6 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
int ret = 0;
int i;
- if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
- struct btrfs_root *root;
-
- root = btrfs_alloc_root(fs_info, BTRFS_BLOCK_GROUP_TREE_OBJECTID,
- GFP_KERNEL);
- if (!root)
- return -ENOMEM;
- fs_info->block_group_root = root;
- }
-
for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
if (handle_error) {
if (!IS_ERR(tree_root->node))
@@ -2990,6 +3076,19 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
mutex_init(&fs_info->zoned_data_reloc_io_lock);
seqlock_init(&fs_info->profiles_lock);
+ btrfs_lockdep_init_map(fs_info, btrfs_trans_num_writers);
+ btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered);
+ btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_start,
+ BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked,
+ BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed,
+ BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_completed,
+ BTRFS_LOCKDEP_TRANS_COMPLETED);
+
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
@@ -3043,7 +3142,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->block_group_cache_tree = RB_ROOT_CACHED;
extent_io_tree_init(fs_info, &fs_info->excluded_extents,
- IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
+ IO_TREE_FS_EXCLUDED_EXTENTS);
mutex_init(&fs_info->ordered_operations_mutex);
mutex_init(&fs_info->tree_log_mutex);
@@ -3279,6 +3378,112 @@ out:
return ret;
}
+/*
+ * Do various sanity and dependency checks of different features.
+ *
+ * This is the place for less strict checks (like for subpage or artificial
+ * feature dependencies).
+ *
+ * For strict checks or possible corruption detection, see
+ * btrfs_validate_super().
+ *
+ * This should be called after btrfs_parse_options(), as some mount options
+ * (space cache related) can modify on-disk format like free space tree and
+ * screw up certain feature dependencies.
+ */
+int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb)
+{
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ u64 incompat = btrfs_super_incompat_flags(disk_super);
+ const u64 compat_ro = btrfs_super_compat_ro_flags(disk_super);
+ const u64 compat_ro_unsupp = (compat_ro & ~BTRFS_FEATURE_COMPAT_RO_SUPP);
+
+ if (incompat & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
+ btrfs_err(fs_info,
+ "cannot mount because of unknown incompat features (0x%llx)",
+ incompat);
+ return -EINVAL;
+ }
+
+ /* Runtime limitation for mixed block groups. */
+ if ((incompat & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
+ (fs_info->sectorsize != fs_info->nodesize)) {
+ btrfs_err(fs_info,
+"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
+ fs_info->nodesize, fs_info->sectorsize);
+ return -EINVAL;
+ }
+
+ /* Mixed backref is an always-enabled feature. */
+ incompat |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+
+ /* Set compression related flags just in case. */
+ if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
+ incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+ else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
+ incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
+
+ /*
+ * An ancient flag, which should really be marked deprecated.
+ * Such runtime limitation doesn't really need a incompat flag.
+ */
+ if (btrfs_super_nodesize(disk_super) > PAGE_SIZE)
+ incompat |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
+
+ if (compat_ro_unsupp && !sb_rdonly(sb)) {
+ btrfs_err(fs_info,
+ "cannot mount read-write because of unknown compat_ro features (0x%llx)",
+ compat_ro);
+ return -EINVAL;
+ }
+
+ /*
+ * We have unsupported RO compat features, although RO mounted, we
+ * should not cause any metadata writes, including log replay.
+ * Or we could screw up whatever the new feature requires.
+ */
+ if (compat_ro_unsupp && btrfs_super_log_root(disk_super) &&
+ !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
+ btrfs_err(fs_info,
+"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
+ compat_ro);
+ return -EINVAL;
+ }
+
+ /*
+ * Artificial limitations for block group tree, to force
+ * block-group-tree to rely on no-holes and free-space-tree.
+ */
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
+ (!btrfs_fs_incompat(fs_info, NO_HOLES) ||
+ !btrfs_test_opt(fs_info, FREE_SPACE_TREE))) {
+ btrfs_err(fs_info,
+"block-group-tree feature requires no-holes and free-space-tree features");
+ return -EINVAL;
+ }
+
+ /*
+ * Subpage runtime limitation on v1 cache.
+ *
+ * V1 space cache still has some hard codeed PAGE_SIZE usage, while
+ * we're already defaulting to v2 cache, no need to bother v1 as it's
+ * going to be deprecated anyway.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
+ btrfs_warn(fs_info,
+ "v1 space cache is not supported for page size %lu with sectorsize %u",
+ PAGE_SIZE, fs_info->sectorsize);
+ return -EINVAL;
+ }
+
+ /* This can be called by remount, we need to protect the super block. */
+ spin_lock(&fs_info->super_lock);
+ btrfs_set_super_incompat_flags(disk_super, incompat);
+ spin_unlock(&fs_info->super_lock);
+
+ return 0;
+}
+
int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
char *options)
{
@@ -3359,7 +3564,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
* We want to check superblock checksum, the type is stored inside.
* Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
*/
- if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) {
+ if (btrfs_check_super_csum(fs_info, disk_super)) {
btrfs_err(fs_info, "superblock checksum mismatch");
err = -EINVAL;
btrfs_release_disk_super(disk_super);
@@ -3428,72 +3633,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_alloc;
}
- features = btrfs_super_incompat_flags(disk_super) &
- ~BTRFS_FEATURE_INCOMPAT_SUPP;
- if (features) {
- btrfs_err(fs_info,
- "cannot mount because of unsupported optional features (0x%llx)",
- features);
- err = -EINVAL;
- goto fail_alloc;
- }
-
- features = btrfs_super_incompat_flags(disk_super);
- features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
- if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
- features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
- else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
- features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
-
- /*
- * Flag our filesystem as having big metadata blocks if they are bigger
- * than the page size.
- */
- if (btrfs_super_nodesize(disk_super) > PAGE_SIZE)
- features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
-
- /*
- * mixed block groups end up with duplicate but slightly offset
- * extent buffers for the same range. It leads to corruptions
- */
- if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
- (sectorsize != nodesize)) {
- btrfs_err(fs_info,
-"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
- nodesize, sectorsize);
- goto fail_alloc;
- }
-
- /*
- * Needn't use the lock because there is no other task which will
- * update the flag.
- */
- btrfs_set_super_incompat_flags(disk_super, features);
-
- features = btrfs_super_compat_ro_flags(disk_super) &
- ~BTRFS_FEATURE_COMPAT_RO_SUPP;
- if (!sb_rdonly(sb) && features) {
- btrfs_err(fs_info,
- "cannot mount read-write because of unsupported optional features (0x%llx)",
- features);
- err = -EINVAL;
- goto fail_alloc;
- }
- /*
- * We have unsupported RO compat features, although RO mounted, we
- * should not cause any metadata write, including log replay.
- * Or we could screw up whatever the new feature requires.
- */
- if (unlikely(features && btrfs_super_log_root(disk_super) &&
- !btrfs_test_opt(fs_info, NOLOGREPLAY))) {
- btrfs_err(fs_info,
-"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
- features);
- err = -EINVAL;
+ ret = btrfs_check_features(fs_info, sb);
+ if (ret < 0) {
+ err = ret;
goto fail_alloc;
}
-
if (sectorsize < PAGE_SIZE) {
struct btrfs_subpage_info *subpage_info;
@@ -3683,10 +3828,18 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
/*
- * Mount does not set all options immediately, we can do it now and do
- * not have to wait for transaction commit
+ * For devices supporting discard turn on discard=async automatically,
+ * unless it's already set or disabled. This could be turned off by
+ * nodiscard for the same mount.
*/
- btrfs_apply_pending_changes(fs_info);
+ if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) ||
+ btrfs_test_opt(fs_info, DISCARD_ASYNC) ||
+ btrfs_test_opt(fs_info, NODISCARD)) &&
+ fs_info->fs_devices->discardable) {
+ btrfs_set_and_info(fs_info, DISCARD_ASYNC,
+ "auto enabling async discard");
+ btrfs_clear_opt(fs_info->mount_opt, NODISCARD);
+ }
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
@@ -3815,7 +3968,7 @@ static void btrfs_end_super_write(struct bio *bio)
if (bio->bi_status) {
btrfs_warn_rl_in_rcu(device->fs_info,
"lost page write due to IO error on %s (%d)",
- rcu_str_deref(device->name),
+ btrfs_dev_name(device),
blk_status_to_errno(bio->bi_status));
ClearPageUptodate(page);
SetPageError(page);
@@ -3833,7 +3986,7 @@ static void btrfs_end_super_write(struct bio *bio)
}
struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
- int copy_num)
+ int copy_num, bool drop_cache)
{
struct btrfs_super_block *super;
struct page *page;
@@ -3851,6 +4004,19 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
return ERR_PTR(-EINVAL);
+ if (drop_cache) {
+ /* This should only be called with the primary sb. */
+ ASSERT(copy_num == 0);
+
+ /*
+ * Drop the page of the primary superblock, so later read will
+ * always read from the device.
+ */
+ invalidate_inode_pages2_range(mapping,
+ bytenr >> PAGE_SHIFT,
+ (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT);
+ }
+
page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
if (IS_ERR(page))
return ERR_CAST(page);
@@ -3882,7 +4048,7 @@ struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
for (i = 0; i < 1; i++) {
- super = btrfs_read_dev_one_super(bdev, i);
+ super = btrfs_read_dev_one_super(bdev, i, false);
if (IS_ERR(super))
continue;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 47ad8e0a2d33..363935cfc084 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -27,14 +27,14 @@ static inline u64 btrfs_sb_offset(int mirror)
struct btrfs_device;
struct btrfs_fs_devices;
+struct btrfs_tree_parent_check;
void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info);
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info);
int btrfs_verify_level_key(struct extent_buffer *eb, int level,
struct btrfs_key *first_key, u64 parent_transid);
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 owner_root, u64 parent_transid,
- int level, struct btrfs_key *first_key);
+ struct btrfs_tree_parent_check *check);
struct extent_buffer *btrfs_find_create_tree_block(
struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root,
@@ -42,14 +42,19 @@ struct extent_buffer *btrfs_find_create_tree_block(
void btrfs_clean_tree_block(struct extent_buffer *buf);
void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info);
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
+int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
+ const struct btrfs_super_block *disk_sb);
int __cold open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options);
void __cold close_ctree(struct btrfs_fs_info *fs_info);
+int btrfs_validate_super(struct btrfs_fs_info *fs_info,
+ struct btrfs_super_block *sb, int mirror_num);
+int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb);
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
- int copy_num);
+ int copy_num, bool drop_cache);
int btrfs_commit_super(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
struct btrfs_key *key);
@@ -70,6 +75,7 @@ struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
struct btrfs_key *key);
struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr);
struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr);
+struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info);
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
@@ -80,7 +86,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
struct page *page, u64 start, u64 end,
int mirror);
-void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num);
+void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
#endif
@@ -101,24 +107,22 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
return NULL;
}
-static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
-{
- if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
- return fs_info->block_group_root;
- return btrfs_extent_root(fs_info, 0);
-}
-
void btrfs_put_root(struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
-int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid,
- int level, struct btrfs_key *first_key);
-bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
- u64 dio_file_offset,
- extent_submit_bio_start_t *submit_bio_start);
-blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
- int mirror_num);
+int btrfs_read_extent_buffer(struct extent_buffer *buf,
+ struct btrfs_tree_parent_check *check);
+
+enum btrfs_wq_submit_cmd {
+ WQ_SUBMIT_METADATA,
+ WQ_SUBMIT_DATA,
+ WQ_SUBMIT_DATA_DIO,
+};
+
+bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num,
+ u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd);
+blk_status_t btree_submit_bio_start(struct bio *bio);
int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
@@ -131,8 +135,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid);
-int btree_lock_page_hook(struct page *page, void *data,
- void (*flush_fn)(void *));
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid);
int btrfs_init_root_free_objectid(struct btrfs_root *root);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 1d4c2397d0d6..744a02b7fd67 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -7,6 +7,8 @@
#include "btrfs_inode.h"
#include "print-tree.h"
#include "export.h"
+#include "accessors.h"
+#include "super.h"
#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \
parent_objectid) / 4)
@@ -57,9 +59,20 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
return type;
}
+/*
+ * Read dentry of inode with @objectid from filesystem root @root_objectid.
+ *
+ * @sb: the filesystem super block
+ * @objectid: inode objectid
+ * @root_objectid: object id of the subvolume root where to look up the inode
+ * @generation: optional, if not zero, verify that the found inode
+ * generation matches
+ *
+ * Return dentry alias for the inode, otherwise an error. In case the
+ * generation does not match return ESTALE.
+ */
struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
- u64 root_objectid, u32 generation,
- int check_generation)
+ u64 root_objectid, u64 generation)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root;
@@ -77,7 +90,7 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
if (IS_ERR(inode))
return ERR_CAST(inode);
- if (check_generation && generation != inode->i_generation) {
+ if (generation != 0 && generation != inode->i_generation) {
iput(inode);
return ERR_PTR(-ESTALE);
}
@@ -106,7 +119,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
objectid = fid->parent_objectid;
generation = fid->parent_gen;
- return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
+ return btrfs_get_dentry(sb, objectid, root_objectid, generation);
}
static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
@@ -128,7 +141,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
root_objectid = fid->root_objectid;
generation = fid->gen;
- return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
+ return btrfs_get_dentry(sb, objectid, root_objectid, generation);
}
struct dentry *btrfs_get_parent(struct dentry *child)
@@ -188,7 +201,7 @@ struct dentry *btrfs_get_parent(struct dentry *child)
if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
return btrfs_get_dentry(fs_info->sb, key.objectid,
- found_key.offset, 0, 0);
+ found_key.offset, 0);
}
return d_obtain_alias(btrfs_iget(fs_info->sb, key.objectid, root));
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
index f32f4113c976..eba6bc4f5a61 100644
--- a/fs/btrfs/export.h
+++ b/fs/btrfs/export.h
@@ -19,8 +19,7 @@ struct btrfs_fid {
} __attribute__ ((packed));
struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
- u64 root_objectid, u32 generation,
- int check_generation);
+ u64 root_objectid, u64 generation);
struct dentry *btrfs_get_parent(struct dentry *child);
#endif
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
new file mode 100644
index 000000000000..9ae9cd1e7035
--- /dev/null
+++ b/fs/btrfs/extent-io-tree.c
@@ -0,0 +1,1766 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/slab.h>
+#include <trace/events/btrfs.h>
+#include "messages.h"
+#include "ctree.h"
+#include "extent-io-tree.h"
+#include "btrfs_inode.h"
+#include "misc.h"
+
+static struct kmem_cache *extent_state_cache;
+
+static inline bool extent_state_in_tree(const struct extent_state *state)
+{
+ return !RB_EMPTY_NODE(&state->rb_node);
+}
+
+#ifdef CONFIG_BTRFS_DEBUG
+static LIST_HEAD(states);
+static DEFINE_SPINLOCK(leak_lock);
+
+static inline void btrfs_leak_debug_add_state(struct extent_state *state)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&leak_lock, flags);
+ list_add(&state->leak_list, &states);
+ spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline void btrfs_leak_debug_del_state(struct extent_state *state)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&leak_lock, flags);
+ list_del(&state->leak_list);
+ spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline void btrfs_extent_state_leak_debug_check(void)
+{
+ struct extent_state *state;
+
+ while (!list_empty(&states)) {
+ state = list_entry(states.next, struct extent_state, leak_list);
+ pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
+ state->start, state->end, state->state,
+ extent_state_in_tree(state),
+ refcount_read(&state->refs));
+ list_del(&state->leak_list);
+ kmem_cache_free(extent_state_cache, state);
+ }
+}
+
+#define btrfs_debug_check_extent_io_range(tree, start, end) \
+ __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
+static inline void __btrfs_debug_check_extent_io_range(const char *caller,
+ struct extent_io_tree *tree,
+ u64 start, u64 end)
+{
+ struct btrfs_inode *inode = tree->inode;
+ u64 isize;
+
+ if (!inode)
+ return;
+
+ isize = i_size_read(&inode->vfs_inode);
+ if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+ btrfs_debug_rl(inode->root->fs_info,
+ "%s: ino %llu isize %llu odd range [%llu,%llu]",
+ caller, btrfs_ino(inode), isize, start, end);
+ }
+}
+#else
+#define btrfs_leak_debug_add_state(state) do {} while (0)
+#define btrfs_leak_debug_del_state(state) do {} while (0)
+#define btrfs_extent_state_leak_debug_check() do {} while (0)
+#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
+#endif
+
+/*
+ * For the file_extent_tree, we want to hold the inode lock when we lookup and
+ * update the disk_i_size, but lockdep will complain because our io_tree we hold
+ * the tree lock and get the inode lock when setting delalloc. These two things
+ * are unrelated, so make a class for the file_extent_tree so we don't get the
+ * two locking patterns mixed up.
+ */
+static struct lock_class_key file_extent_tree_class;
+
+struct tree_entry {
+ u64 start;
+ u64 end;
+ struct rb_node rb_node;
+};
+
+void extent_io_tree_init(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *tree, unsigned int owner)
+{
+ tree->fs_info = fs_info;
+ tree->state = RB_ROOT;
+ spin_lock_init(&tree->lock);
+ tree->inode = NULL;
+ tree->owner = owner;
+ if (owner == IO_TREE_INODE_FILE_EXTENT)
+ lockdep_set_class(&tree->lock, &file_extent_tree_class);
+}
+
+void extent_io_tree_release(struct extent_io_tree *tree)
+{
+ spin_lock(&tree->lock);
+ /*
+ * Do a single barrier for the waitqueue_active check here, the state
+ * of the waitqueue should not change once extent_io_tree_release is
+ * called.
+ */
+ smp_mb();
+ while (!RB_EMPTY_ROOT(&tree->state)) {
+ struct rb_node *node;
+ struct extent_state *state;
+
+ node = rb_first(&tree->state);
+ state = rb_entry(node, struct extent_state, rb_node);
+ rb_erase(&state->rb_node, &tree->state);
+ RB_CLEAR_NODE(&state->rb_node);
+ /*
+ * btree io trees aren't supposed to have tasks waiting for
+ * changes in the flags of extent states ever.
+ */
+ ASSERT(!waitqueue_active(&state->wq));
+ free_extent_state(state);
+
+ cond_resched_lock(&tree->lock);
+ }
+ spin_unlock(&tree->lock);
+}
+
+static struct extent_state *alloc_extent_state(gfp_t mask)
+{
+ struct extent_state *state;
+
+ /*
+ * The given mask might be not appropriate for the slab allocator,
+ * drop the unsupported bits
+ */
+ mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
+ state = kmem_cache_alloc(extent_state_cache, mask);
+ if (!state)
+ return state;
+ state->state = 0;
+ RB_CLEAR_NODE(&state->rb_node);
+ btrfs_leak_debug_add_state(state);
+ refcount_set(&state->refs, 1);
+ init_waitqueue_head(&state->wq);
+ trace_alloc_extent_state(state, mask, _RET_IP_);
+ return state;
+}
+
+static struct extent_state *alloc_extent_state_atomic(struct extent_state *prealloc)
+{
+ if (!prealloc)
+ prealloc = alloc_extent_state(GFP_ATOMIC);
+
+ return prealloc;
+}
+
+void free_extent_state(struct extent_state *state)
+{
+ if (!state)
+ return;
+ if (refcount_dec_and_test(&state->refs)) {
+ WARN_ON(extent_state_in_tree(state));
+ btrfs_leak_debug_del_state(state);
+ trace_free_extent_state(state, _RET_IP_);
+ kmem_cache_free(extent_state_cache, state);
+ }
+}
+
+static int add_extent_changeset(struct extent_state *state, u32 bits,
+ struct extent_changeset *changeset,
+ int set)
+{
+ int ret;
+
+ if (!changeset)
+ return 0;
+ if (set && (state->state & bits) == bits)
+ return 0;
+ if (!set && (state->state & bits) == 0)
+ return 0;
+ changeset->bytes_changed += state->end - state->start + 1;
+ ret = ulist_add(&changeset->range_changed, state->start, state->end,
+ GFP_ATOMIC);
+ return ret;
+}
+
+static inline struct extent_state *next_state(struct extent_state *state)
+{
+ struct rb_node *next = rb_next(&state->rb_node);
+
+ if (next)
+ return rb_entry(next, struct extent_state, rb_node);
+ else
+ return NULL;
+}
+
+static inline struct extent_state *prev_state(struct extent_state *state)
+{
+ struct rb_node *next = rb_prev(&state->rb_node);
+
+ if (next)
+ return rb_entry(next, struct extent_state, rb_node);
+ else
+ return NULL;
+}
+
+/*
+ * Search @tree for an entry that contains @offset. Such entry would have
+ * entry->start <= offset && entry->end >= offset.
+ *
+ * @tree: the tree to search
+ * @offset: offset that should fall within an entry in @tree
+ * @node_ret: pointer where new node should be anchored (used when inserting an
+ * entry in the tree)
+ * @parent_ret: points to entry which would have been the parent of the entry,
+ * containing @offset
+ *
+ * Return a pointer to the entry that contains @offset byte address and don't change
+ * @node_ret and @parent_ret.
+ *
+ * If no such entry exists, return pointer to entry that ends before @offset
+ * and fill parameters @node_ret and @parent_ret, ie. does not return NULL.
+ */
+static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree,
+ u64 offset,
+ struct rb_node ***node_ret,
+ struct rb_node **parent_ret)
+{
+ struct rb_root *root = &tree->state;
+ struct rb_node **node = &root->rb_node;
+ struct rb_node *prev = NULL;
+ struct extent_state *entry = NULL;
+
+ while (*node) {
+ prev = *node;
+ entry = rb_entry(prev, struct extent_state, rb_node);
+
+ if (offset < entry->start)
+ node = &(*node)->rb_left;
+ else if (offset > entry->end)
+ node = &(*node)->rb_right;
+ else
+ return entry;
+ }
+
+ if (node_ret)
+ *node_ret = node;
+ if (parent_ret)
+ *parent_ret = prev;
+
+ /* Search neighbors until we find the first one past the end */
+ while (entry && offset > entry->end)
+ entry = next_state(entry);
+
+ return entry;
+}
+
+/*
+ * Search offset in the tree or fill neighbor rbtree node pointers.
+ *
+ * @tree: the tree to search
+ * @offset: offset that should fall within an entry in @tree
+ * @next_ret: pointer to the first entry whose range ends after @offset
+ * @prev_ret: pointer to the first entry whose range begins before @offset
+ *
+ * Return a pointer to the entry that contains @offset byte address. If no
+ * such entry exists, then return NULL and fill @prev_ret and @next_ret.
+ * Otherwise return the found entry and other pointers are left untouched.
+ */
+static struct extent_state *tree_search_prev_next(struct extent_io_tree *tree,
+ u64 offset,
+ struct extent_state **prev_ret,
+ struct extent_state **next_ret)
+{
+ struct rb_root *root = &tree->state;
+ struct rb_node **node = &root->rb_node;
+ struct extent_state *orig_prev;
+ struct extent_state *entry = NULL;
+
+ ASSERT(prev_ret);
+ ASSERT(next_ret);
+
+ while (*node) {
+ entry = rb_entry(*node, struct extent_state, rb_node);
+
+ if (offset < entry->start)
+ node = &(*node)->rb_left;
+ else if (offset > entry->end)
+ node = &(*node)->rb_right;
+ else
+ return entry;
+ }
+
+ orig_prev = entry;
+ while (entry && offset > entry->end)
+ entry = next_state(entry);
+ *next_ret = entry;
+ entry = orig_prev;
+
+ while (entry && offset < entry->start)
+ entry = prev_state(entry);
+ *prev_ret = entry;
+
+ return NULL;
+}
+
+/*
+ * Inexact rb-tree search, return the next entry if @offset is not found
+ */
+static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 offset)
+{
+ return tree_search_for_insert(tree, offset, NULL, NULL);
+}
+
+static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+{
+ btrfs_panic(tree->fs_info, err,
+ "locking error: extent tree was modified by another thread while locked");
+}
+
+/*
+ * Utility function to look for merge candidates inside a given range. Any
+ * extents with matching state are merged together into a single extent in the
+ * tree. Extents with EXTENT_IO in their state field are not merged because
+ * the end_io handlers need to be able to do operations on them without
+ * sleeping (or doing allocations/splits).
+ *
+ * This should be called with the tree lock held.
+ */
+static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
+{
+ struct extent_state *other;
+
+ if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
+ return;
+
+ other = prev_state(state);
+ if (other && other->end == state->start - 1 &&
+ other->state == state->state) {
+ if (tree->inode)
+ btrfs_merge_delalloc_extent(tree->inode, state, other);
+ state->start = other->start;
+ rb_erase(&other->rb_node, &tree->state);
+ RB_CLEAR_NODE(&other->rb_node);
+ free_extent_state(other);
+ }
+ other = next_state(state);
+ if (other && other->start == state->end + 1 &&
+ other->state == state->state) {
+ if (tree->inode)
+ btrfs_merge_delalloc_extent(tree->inode, state, other);
+ state->end = other->end;
+ rb_erase(&other->rb_node, &tree->state);
+ RB_CLEAR_NODE(&other->rb_node);
+ free_extent_state(other);
+ }
+}
+
+static void set_state_bits(struct extent_io_tree *tree,
+ struct extent_state *state,
+ u32 bits, struct extent_changeset *changeset)
+{
+ u32 bits_to_set = bits & ~EXTENT_CTLBITS;
+ int ret;
+
+ if (tree->inode)
+ btrfs_set_delalloc_extent(tree->inode, state, bits);
+
+ ret = add_extent_changeset(state, bits_to_set, changeset, 1);
+ BUG_ON(ret < 0);
+ state->state |= bits_to_set;
+}
+
+/*
+ * Insert an extent_state struct into the tree. 'bits' are set on the
+ * struct before it is inserted.
+ *
+ * This may return -EEXIST if the extent is already there, in which case the
+ * state struct is freed.
+ *
+ * The tree lock is not taken internally. This is a utility function and
+ * probably isn't what you want to call (see set/clear_extent_bit).
+ */
+static int insert_state(struct extent_io_tree *tree,
+ struct extent_state *state,
+ u32 bits, struct extent_changeset *changeset)
+{
+ struct rb_node **node;
+ struct rb_node *parent = NULL;
+ const u64 end = state->end;
+
+ set_state_bits(tree, state, bits, changeset);
+
+ node = &tree->state.rb_node;
+ while (*node) {
+ struct extent_state *entry;
+
+ parent = *node;
+ entry = rb_entry(parent, struct extent_state, rb_node);
+
+ if (end < entry->start) {
+ node = &(*node)->rb_left;
+ } else if (end > entry->end) {
+ node = &(*node)->rb_right;
+ } else {
+ btrfs_err(tree->fs_info,
+ "found node %llu %llu on insert of %llu %llu",
+ entry->start, entry->end, state->start, end);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&state->rb_node, parent, node);
+ rb_insert_color(&state->rb_node, &tree->state);
+
+ merge_state(tree, state);
+ return 0;
+}
+
+/*
+ * Insert state to @tree to the location given by @node and @parent.
+ */
+static void insert_state_fast(struct extent_io_tree *tree,
+ struct extent_state *state, struct rb_node **node,
+ struct rb_node *parent, unsigned bits,
+ struct extent_changeset *changeset)
+{
+ set_state_bits(tree, state, bits, changeset);
+ rb_link_node(&state->rb_node, parent, node);
+ rb_insert_color(&state->rb_node, &tree->state);
+ merge_state(tree, state);
+}
+
+/*
+ * Split a given extent state struct in two, inserting the preallocated
+ * struct 'prealloc' as the newly created second half. 'split' indicates an
+ * offset inside 'orig' where it should be split.
+ *
+ * Before calling,
+ * the tree has 'orig' at [orig->start, orig->end]. After calling, there
+ * are two extent state structs in the tree:
+ * prealloc: [orig->start, split - 1]
+ * orig: [ split, orig->end ]
+ *
+ * The tree locks are not taken by this function. They need to be held
+ * by the caller.
+ */
+static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
+ struct extent_state *prealloc, u64 split)
+{
+ struct rb_node *parent = NULL;
+ struct rb_node **node;
+
+ if (tree->inode)
+ btrfs_split_delalloc_extent(tree->inode, orig, split);
+
+ prealloc->start = orig->start;
+ prealloc->end = split - 1;
+ prealloc->state = orig->state;
+ orig->start = split;
+
+ parent = &orig->rb_node;
+ node = &parent;
+ while (*node) {
+ struct extent_state *entry;
+
+ parent = *node;
+ entry = rb_entry(parent, struct extent_state, rb_node);
+
+ if (prealloc->end < entry->start) {
+ node = &(*node)->rb_left;
+ } else if (prealloc->end > entry->end) {
+ node = &(*node)->rb_right;
+ } else {
+ free_extent_state(prealloc);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&prealloc->rb_node, parent, node);
+ rb_insert_color(&prealloc->rb_node, &tree->state);
+
+ return 0;
+}
+
+/*
+ * Utility function to clear some bits in an extent state struct. It will
+ * optionally wake up anyone waiting on this state (wake == 1).
+ *
+ * If no bits are set on the state struct after clearing things, the
+ * struct is freed and removed from the tree
+ */
+static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
+ struct extent_state *state,
+ u32 bits, int wake,
+ struct extent_changeset *changeset)
+{
+ struct extent_state *next;
+ u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
+ int ret;
+
+ if (tree->inode)
+ btrfs_clear_delalloc_extent(tree->inode, state, bits);
+
+ ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
+ BUG_ON(ret < 0);
+ state->state &= ~bits_to_clear;
+ if (wake)
+ wake_up(&state->wq);
+ if (state->state == 0) {
+ next = next_state(state);
+ if (extent_state_in_tree(state)) {
+ rb_erase(&state->rb_node, &tree->state);
+ RB_CLEAR_NODE(&state->rb_node);
+ free_extent_state(state);
+ } else {
+ WARN_ON(1);
+ }
+ } else {
+ merge_state(tree, state);
+ next = next_state(state);
+ }
+ return next;
+}
+
+/*
+ * Clear some bits on a range in the tree. This may require splitting or
+ * inserting elements in the tree, so the gfp mask is used to indicate which
+ * allocations or sleeping are allowed.
+ *
+ * Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the given
+ * range from the tree regardless of state (ie for truncate).
+ *
+ * The range [start, end] is inclusive.
+ *
+ * This takes the tree lock, and returns 0 on success and < 0 on error.
+ */
+int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state,
+ gfp_t mask, struct extent_changeset *changeset)
+{
+ struct extent_state *state;
+ struct extent_state *cached;
+ struct extent_state *prealloc = NULL;
+ u64 last_end;
+ int err;
+ int clear = 0;
+ int wake;
+ int delete = (bits & EXTENT_CLEAR_ALL_BITS);
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+ trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
+
+ if (delete)
+ bits |= ~EXTENT_CTLBITS;
+
+ if (bits & EXTENT_DELALLOC)
+ bits |= EXTENT_NORESERVE;
+
+ wake = (bits & EXTENT_LOCKED) ? 1 : 0;
+ if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
+ clear = 1;
+again:
+ if (!prealloc) {
+ /*
+ * Don't care for allocation failure here because we might end
+ * up not needing the pre-allocated extent state at all, which
+ * is the case if we only have in the tree extent states that
+ * cover our input range and don't cover too any other range.
+ * If we end up needing a new extent state we allocate it later.
+ */
+ prealloc = alloc_extent_state(mask);
+ }
+
+ spin_lock(&tree->lock);
+ if (cached_state) {
+ cached = *cached_state;
+
+ if (clear) {
+ *cached_state = NULL;
+ cached_state = NULL;
+ }
+
+ if (cached && extent_state_in_tree(cached) &&
+ cached->start <= start && cached->end > start) {
+ if (clear)
+ refcount_dec(&cached->refs);
+ state = cached;
+ goto hit_next;
+ }
+ if (clear)
+ free_extent_state(cached);
+ }
+
+ /* This search will find the extents that end after our range starts. */
+ state = tree_search(tree, start);
+ if (!state)
+ goto out;
+hit_next:
+ if (state->start > end)
+ goto out;
+ WARN_ON(state->end < start);
+ last_end = state->end;
+
+ /* The state doesn't have the wanted bits, go ahead. */
+ if (!(state->state & bits)) {
+ state = next_state(state);
+ goto next;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state | or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on second
+ * half.
+ *
+ * If the extent we found extends past our range, we just split and
+ * search again. It'll get split again the next time though.
+ *
+ * If the extent we found is inside our range, we clear the desired bit
+ * on it.
+ */
+
+ if (state->start < start) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc)
+ goto search_again;
+ err = split_state(tree, state, prealloc, start);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ state = clear_state_bit(tree, state, bits, wake, changeset);
+ goto next;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * We need to split the extent, and clear the bit on the first half.
+ */
+ if (state->start <= end && state->end > end) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc)
+ goto search_again;
+ err = split_state(tree, state, prealloc, end + 1);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ if (wake)
+ wake_up(&state->wq);
+
+ clear_state_bit(tree, prealloc, bits, wake, changeset);
+
+ prealloc = NULL;
+ goto out;
+ }
+
+ state = clear_state_bit(tree, state, bits, wake, changeset);
+next:
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start <= end && state && !need_resched())
+ goto hit_next;
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (gfpflags_allow_blocking(mask))
+ cond_resched();
+ goto again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return 0;
+
+}
+
+static void wait_on_state(struct extent_io_tree *tree,
+ struct extent_state *state)
+ __releases(tree->lock)
+ __acquires(tree->lock)
+{
+ DEFINE_WAIT(wait);
+ prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&tree->lock);
+ schedule();
+ spin_lock(&tree->lock);
+ finish_wait(&state->wq, &wait);
+}
+
+/*
+ * Wait for one or more bits to clear on a range in the state tree.
+ * The range [start, end] is inclusive.
+ * The tree lock is taken by this function
+ */
+void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
+ struct extent_state **cached_state)
+{
+ struct extent_state *state;
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+
+ spin_lock(&tree->lock);
+again:
+ /*
+ * Maintain cached_state, as we may not remove it from the tree if there
+ * are more bits than the bits we're waiting on set on this state.
+ */
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (extent_state_in_tree(state) &&
+ state->start <= start && start < state->end)
+ goto process_node;
+ }
+ while (1) {
+ /*
+ * This search will find all the extents that end after our
+ * range starts.
+ */
+ state = tree_search(tree, start);
+process_node:
+ if (!state)
+ break;
+ if (state->start > end)
+ goto out;
+
+ if (state->state & bits) {
+ start = state->start;
+ refcount_inc(&state->refs);
+ wait_on_state(tree, state);
+ free_extent_state(state);
+ goto again;
+ }
+ start = state->end + 1;
+
+ if (start > end)
+ break;
+
+ if (!cond_resched_lock(&tree->lock)) {
+ state = next_state(state);
+ goto process_node;
+ }
+ }
+out:
+ /* This state is no longer useful, clear it and free it up. */
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ *cached_state = NULL;
+ free_extent_state(state);
+ }
+ spin_unlock(&tree->lock);
+}
+
+static void cache_state_if_flags(struct extent_state *state,
+ struct extent_state **cached_ptr,
+ unsigned flags)
+{
+ if (cached_ptr && !(*cached_ptr)) {
+ if (!flags || (state->state & flags)) {
+ *cached_ptr = state;
+ refcount_inc(&state->refs);
+ }
+ }
+}
+
+static void cache_state(struct extent_state *state,
+ struct extent_state **cached_ptr)
+{
+ return cache_state_if_flags(state, cached_ptr,
+ EXTENT_LOCKED | EXTENT_BOUNDARY);
+}
+
+/*
+ * Find the first state struct with 'bits' set after 'start', and return it.
+ * tree->lock must be held. NULL will returned if nothing was found after
+ * 'start'.
+ */
+static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
+ u64 start, u32 bits)
+{
+ struct extent_state *state;
+
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search(tree, start);
+ while (state) {
+ if (state->end >= start && (state->state & bits))
+ return state;
+ state = next_state(state);
+ }
+ return NULL;
+}
+
+/*
+ * Find the first offset in the io tree with one or more @bits set.
+ *
+ * Note: If there are multiple bits set in @bits, any of them will match.
+ *
+ * Return 0 if we find something, and update @start_ret and @end_ret.
+ * Return 1 if we found nothing.
+ */
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits,
+ struct extent_state **cached_state)
+{
+ struct extent_state *state;
+ int ret = 1;
+
+ spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->end == start - 1 && extent_state_in_tree(state)) {
+ while ((state = next_state(state)) != NULL) {
+ if (state->state & bits)
+ goto got_it;
+ }
+ free_extent_state(*cached_state);
+ *cached_state = NULL;
+ goto out;
+ }
+ free_extent_state(*cached_state);
+ *cached_state = NULL;
+ }
+
+ state = find_first_extent_bit_state(tree, start, bits);
+got_it:
+ if (state) {
+ cache_state_if_flags(state, cached_state, 0);
+ *start_ret = state->start;
+ *end_ret = state->end;
+ ret = 0;
+ }
+out:
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+/*
+ * Find a contiguous area of bits
+ *
+ * @tree: io tree to check
+ * @start: offset to start the search from
+ * @start_ret: the first offset we found with the bits set
+ * @end_ret: the final contiguous range of the bits that were set
+ * @bits: bits to look for
+ *
+ * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
+ * to set bits appropriately, and then merge them again. During this time it
+ * will drop the tree->lock, so use this helper if you want to find the actual
+ * contiguous area for given bits. We will search to the first bit we find, and
+ * then walk down the tree until we find a non-contiguous area. The area
+ * returned will be the full contiguous area with the bits set.
+ */
+int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits)
+{
+ struct extent_state *state;
+ int ret = 1;
+
+ spin_lock(&tree->lock);
+ state = find_first_extent_bit_state(tree, start, bits);
+ if (state) {
+ *start_ret = state->start;
+ *end_ret = state->end;
+ while ((state = next_state(state)) != NULL) {
+ if (state->start > (*end_ret + 1))
+ break;
+ *end_ret = state->end;
+ }
+ ret = 0;
+ }
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+/*
+ * Find a contiguous range of bytes in the file marked as delalloc, not more
+ * than 'max_bytes'. start and end are used to return the range,
+ *
+ * True is returned if we find something, false if nothing was in the tree.
+ */
+bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
+ u64 *end, u64 max_bytes,
+ struct extent_state **cached_state)
+{
+ struct extent_state *state;
+ u64 cur_start = *start;
+ bool found = false;
+ u64 total_bytes = 0;
+
+ spin_lock(&tree->lock);
+
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search(tree, cur_start);
+ if (!state) {
+ *end = (u64)-1;
+ goto out;
+ }
+
+ while (state) {
+ if (found && (state->start != cur_start ||
+ (state->state & EXTENT_BOUNDARY))) {
+ goto out;
+ }
+ if (!(state->state & EXTENT_DELALLOC)) {
+ if (!found)
+ *end = state->end;
+ goto out;
+ }
+ if (!found) {
+ *start = state->start;
+ *cached_state = state;
+ refcount_inc(&state->refs);
+ }
+ found = true;
+ *end = state->end;
+ cur_start = state->end + 1;
+ total_bytes += state->end - state->start + 1;
+ if (total_bytes >= max_bytes)
+ break;
+ state = next_state(state);
+ }
+out:
+ spin_unlock(&tree->lock);
+ return found;
+}
+
+/*
+ * Set some bits on a range in the tree. This may require allocations or
+ * sleeping, so the gfp mask is used to indicate what is allowed.
+ *
+ * If any of the exclusive bits are set, this will fail with -EEXIST if some
+ * part of the range already has the desired bits set. The extent_state of the
+ * existing range is returned in failed_state in this case, and the start of the
+ * existing range is returned in failed_start. failed_state is used as an
+ * optimization for wait_extent_bit, failed_start must be used as the source of
+ * truth as failed_state may have changed since we returned.
+ *
+ * [start, end] is inclusive This takes the tree lock.
+ */
+static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, u64 *failed_start,
+ struct extent_state **failed_state,
+ struct extent_state **cached_state,
+ struct extent_changeset *changeset, gfp_t mask)
+{
+ struct extent_state *state;
+ struct extent_state *prealloc = NULL;
+ struct rb_node **p;
+ struct rb_node *parent;
+ int err = 0;
+ u64 last_start;
+ u64 last_end;
+ u32 exclusive_bits = (bits & EXTENT_LOCKED);
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+ trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
+
+ if (exclusive_bits)
+ ASSERT(failed_start);
+ else
+ ASSERT(failed_start == NULL && failed_state == NULL);
+again:
+ if (!prealloc) {
+ /*
+ * Don't care for allocation failure here because we might end
+ * up not needing the pre-allocated extent state at all, which
+ * is the case if we only have in the tree extent states that
+ * cover our input range and don't cover too any other range.
+ * If we end up needing a new extent state we allocate it later.
+ */
+ prealloc = alloc_extent_state(mask);
+ }
+
+ spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->start <= start && state->end > start &&
+ extent_state_in_tree(state))
+ goto hit_next;
+ }
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search_for_insert(tree, start, &p, &parent);
+ if (!state) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc)
+ goto search_again;
+ prealloc->start = start;
+ prealloc->end = end;
+ insert_state_fast(tree, prealloc, p, parent, bits, changeset);
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ goto out;
+ }
+hit_next:
+ last_start = state->start;
+ last_end = state->end;
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * Just lock what we found and keep going
+ */
+ if (state->start == start && state->end <= end) {
+ if (state->state & exclusive_bits) {
+ *failed_start = state->start;
+ cache_state(state, failed_state);
+ err = -EEXIST;
+ goto out;
+ }
+
+ set_state_bits(tree, state, bits, changeset);
+ cache_state(state, cached_state);
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ state = next_state(state);
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ goto search_again;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on second
+ * half.
+ *
+ * If the extent we found extends past our range, we just split and
+ * search again. It'll get split again the next time though.
+ *
+ * If the extent we found is inside our range, we set the desired bit
+ * on it.
+ */
+ if (state->start < start) {
+ if (state->state & exclusive_bits) {
+ *failed_start = start;
+ cache_state(state, failed_state);
+ err = -EEXIST;
+ goto out;
+ }
+
+ /*
+ * If this extent already has all the bits we want set, then
+ * skip it, not necessary to split it or do anything with it.
+ */
+ if ((state->state & bits) == bits) {
+ start = state->end + 1;
+ cache_state(state, cached_state);
+ goto search_again;
+ }
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc)
+ goto search_again;
+ err = split_state(tree, state, prealloc, start);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ set_state_bits(tree, state, bits, changeset);
+ cache_state(state, cached_state);
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ state = next_state(state);
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state | or | state |
+ *
+ * There's a hole, we need to insert something in it and ignore the
+ * extent we found.
+ */
+ if (state->start > start) {
+ u64 this_end;
+ if (end < last_start)
+ this_end = end;
+ else
+ this_end = last_start - 1;
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc)
+ goto search_again;
+
+ /*
+ * Avoid to free 'prealloc' if it can be merged with the later
+ * extent.
+ */
+ prealloc->start = start;
+ prealloc->end = this_end;
+ err = insert_state(tree, prealloc, bits, changeset);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ start = this_end + 1;
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * We need to split the extent, and set the bit on the first half
+ */
+ if (state->start <= end && state->end > end) {
+ if (state->state & exclusive_bits) {
+ *failed_start = start;
+ cache_state(state, failed_state);
+ err = -EEXIST;
+ goto out;
+ }
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc)
+ goto search_again;
+ err = split_state(tree, state, prealloc, end + 1);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ set_state_bits(tree, prealloc, bits, changeset);
+ cache_state(prealloc, cached_state);
+ merge_state(tree, prealloc);
+ prealloc = NULL;
+ goto out;
+ }
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (gfpflags_allow_blocking(mask))
+ cond_resched();
+ goto again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return err;
+
+}
+
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state, gfp_t mask)
+{
+ return __set_extent_bit(tree, start, end, bits, NULL, NULL,
+ cached_state, NULL, mask);
+}
+
+/*
+ * Convert all bits in a given range from one bit to another
+ *
+ * @tree: the io tree to search
+ * @start: the start offset in bytes
+ * @end: the end offset in bytes (inclusive)
+ * @bits: the bits to set in this range
+ * @clear_bits: the bits to clear in this range
+ * @cached_state: state that we're going to cache
+ *
+ * This will go through and set bits for the given range. If any states exist
+ * already in this range they are set with the given bit and cleared of the
+ * clear_bits. This is only meant to be used by things that are mergeable, ie.
+ * converting from say DELALLOC to DIRTY. This is not meant to be used with
+ * boundary bits like LOCK.
+ *
+ * All allocations are done with GFP_NOFS.
+ */
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, u32 clear_bits,
+ struct extent_state **cached_state)
+{
+ struct extent_state *state;
+ struct extent_state *prealloc = NULL;
+ struct rb_node **p;
+ struct rb_node *parent;
+ int err = 0;
+ u64 last_start;
+ u64 last_end;
+ bool first_iteration = true;
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+ trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
+ clear_bits);
+
+again:
+ if (!prealloc) {
+ /*
+ * Best effort, don't worry if extent state allocation fails
+ * here for the first iteration. We might have a cached state
+ * that matches exactly the target range, in which case no
+ * extent state allocations are needed. We'll only know this
+ * after locking the tree.
+ */
+ prealloc = alloc_extent_state(GFP_NOFS);
+ if (!prealloc && !first_iteration)
+ return -ENOMEM;
+ }
+
+ spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->start <= start && state->end > start &&
+ extent_state_in_tree(state))
+ goto hit_next;
+ }
+
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search_for_insert(tree, start, &p, &parent);
+ if (!state) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+ prealloc->start = start;
+ prealloc->end = end;
+ insert_state_fast(tree, prealloc, p, parent, bits, NULL);
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ goto out;
+ }
+hit_next:
+ last_start = state->start;
+ last_end = state->end;
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * Just lock what we found and keep going.
+ */
+ if (state->start == start && state->end <= end) {
+ set_state_bits(tree, state, bits, NULL);
+ cache_state(state, cached_state);
+ state = clear_state_bit(tree, state, clear_bits, 0, NULL);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ goto search_again;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on second
+ * half.
+ *
+ * If the extent we found extends past our range, we just split and
+ * search again. It'll get split again the next time though.
+ *
+ * If the extent we found is inside our range, we set the desired bit
+ * on it.
+ */
+ if (state->start < start) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = split_state(tree, state, prealloc, start);
+ if (err)
+ extent_io_tree_panic(tree, err);
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ set_state_bits(tree, state, bits, NULL);
+ cache_state(state, cached_state);
+ state = clear_state_bit(tree, state, clear_bits, 0, NULL);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state | or | state |
+ *
+ * There's a hole, we need to insert something in it and ignore the
+ * extent we found.
+ */
+ if (state->start > start) {
+ u64 this_end;
+ if (end < last_start)
+ this_end = end;
+ else
+ this_end = last_start - 1;
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Avoid to free 'prealloc' if it can be merged with the later
+ * extent.
+ */
+ prealloc->start = start;
+ prealloc->end = this_end;
+ err = insert_state(tree, prealloc, bits, NULL);
+ if (err)
+ extent_io_tree_panic(tree, err);
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ start = this_end + 1;
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * We need to split the extent, and set the bit on the first half.
+ */
+ if (state->start <= end && state->end > end) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = split_state(tree, state, prealloc, end + 1);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ set_state_bits(tree, prealloc, bits, NULL);
+ cache_state(prealloc, cached_state);
+ clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
+ prealloc = NULL;
+ goto out;
+ }
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ cond_resched();
+ first_iteration = false;
+ goto again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return err;
+}
+
+/*
+ * Find the first range that has @bits not set. This range could start before
+ * @start.
+ *
+ * @tree: the tree to search
+ * @start: offset at/after which the found extent should start
+ * @start_ret: records the beginning of the range
+ * @end_ret: records the end of the range (inclusive)
+ * @bits: the set of bits which must be unset
+ *
+ * Since unallocated range is also considered one which doesn't have the bits
+ * set it's possible that @end_ret contains -1, this happens in case the range
+ * spans (last_range_end, end of device]. In this case it's up to the caller to
+ * trim @end_ret to the appropriate size.
+ */
+void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits)
+{
+ struct extent_state *state;
+ struct extent_state *prev = NULL, *next = NULL;
+
+ spin_lock(&tree->lock);
+
+ /* Find first extent with bits cleared */
+ while (1) {
+ state = tree_search_prev_next(tree, start, &prev, &next);
+ if (!state && !next && !prev) {
+ /*
+ * Tree is completely empty, send full range and let
+ * caller deal with it
+ */
+ *start_ret = 0;
+ *end_ret = -1;
+ goto out;
+ } else if (!state && !next) {
+ /*
+ * We are past the last allocated chunk, set start at
+ * the end of the last extent.
+ */
+ *start_ret = prev->end + 1;
+ *end_ret = -1;
+ goto out;
+ } else if (!state) {
+ state = next;
+ }
+
+ /*
+ * At this point 'state' either contains 'start' or start is
+ * before 'state'
+ */
+ if (in_range(start, state->start, state->end - state->start + 1)) {
+ if (state->state & bits) {
+ /*
+ * |--range with bits sets--|
+ * |
+ * start
+ */
+ start = state->end + 1;
+ } else {
+ /*
+ * 'start' falls within a range that doesn't
+ * have the bits set, so take its start as the
+ * beginning of the desired range
+ *
+ * |--range with bits cleared----|
+ * |
+ * start
+ */
+ *start_ret = state->start;
+ break;
+ }
+ } else {
+ /*
+ * |---prev range---|---hole/unset---|---node range---|
+ * |
+ * start
+ *
+ * or
+ *
+ * |---hole/unset--||--first node--|
+ * 0 |
+ * start
+ */
+ if (prev)
+ *start_ret = prev->end + 1;
+ else
+ *start_ret = 0;
+ break;
+ }
+ }
+
+ /*
+ * Find the longest stretch from start until an entry which has the
+ * bits set
+ */
+ while (state) {
+ if (state->end >= start && !(state->state & bits)) {
+ *end_ret = state->end;
+ } else {
+ *end_ret = state->start - 1;
+ break;
+ }
+ state = next_state(state);
+ }
+out:
+ spin_unlock(&tree->lock);
+}
+
+/*
+ * Count the number of bytes in the tree that have a given bit(s) set for a
+ * given range.
+ *
+ * @tree: The io tree to search.
+ * @start: The start offset of the range. This value is updated to the
+ * offset of the first byte found with the given bit(s), so it
+ * can end up being bigger than the initial value.
+ * @search_end: The end offset (inclusive value) of the search range.
+ * @max_bytes: The maximum byte count we are interested. The search stops
+ * once it reaches this count.
+ * @bits: The bits the range must have in order to be accounted for.
+ * If multiple bits are set, then only subranges that have all
+ * the bits set are accounted for.
+ * @contig: Indicate if we should ignore holes in the range or not. If
+ * this is true, then stop once we find a hole.
+ * @cached_state: A cached state to be used across multiple calls to this
+ * function in order to speedup searches. Use NULL if this is
+ * called only once or if each call does not start where the
+ * previous one ended.
+ *
+ * Returns the total number of bytes found within the given range that have
+ * all given bits set. If the returned number of bytes is greater than zero
+ * then @start is updated with the offset of the first byte with the bits set.
+ */
+u64 count_range_bits(struct extent_io_tree *tree,
+ u64 *start, u64 search_end, u64 max_bytes,
+ u32 bits, int contig,
+ struct extent_state **cached_state)
+{
+ struct extent_state *state = NULL;
+ struct extent_state *cached;
+ u64 cur_start = *start;
+ u64 total_bytes = 0;
+ u64 last = 0;
+ int found = 0;
+
+ if (WARN_ON(search_end <= cur_start))
+ return 0;
+
+ spin_lock(&tree->lock);
+
+ if (!cached_state || !*cached_state)
+ goto search;
+
+ cached = *cached_state;
+
+ if (!extent_state_in_tree(cached))
+ goto search;
+
+ if (cached->start <= cur_start && cur_start <= cached->end) {
+ state = cached;
+ } else if (cached->start > cur_start) {
+ struct extent_state *prev;
+
+ /*
+ * The cached state starts after our search range's start. Check
+ * if the previous state record starts at or before the range we
+ * are looking for, and if so, use it - this is a common case
+ * when there are holes between records in the tree. If there is
+ * no previous state record, we can start from our cached state.
+ */
+ prev = prev_state(cached);
+ if (!prev)
+ state = cached;
+ else if (prev->start <= cur_start && cur_start <= prev->end)
+ state = prev;
+ }
+
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+search:
+ if (!state)
+ state = tree_search(tree, cur_start);
+
+ while (state) {
+ if (state->start > search_end)
+ break;
+ if (contig && found && state->start > last + 1)
+ break;
+ if (state->end >= cur_start && (state->state & bits) == bits) {
+ total_bytes += min(search_end, state->end) + 1 -
+ max(cur_start, state->start);
+ if (total_bytes >= max_bytes)
+ break;
+ if (!found) {
+ *start = max(cur_start, state->start);
+ found = 1;
+ }
+ last = state->end;
+ } else if (contig && found) {
+ break;
+ }
+ state = next_state(state);
+ }
+
+ if (cached_state) {
+ free_extent_state(*cached_state);
+ *cached_state = state;
+ if (state)
+ refcount_inc(&state->refs);
+ }
+
+ spin_unlock(&tree->lock);
+
+ return total_bytes;
+}
+
+/*
+ * Searche a range in the state tree for a given mask. If 'filled' == 1, this
+ * returns 1 only if every extent in the tree has the bits set. Otherwise, 1
+ * is returned if any bit in the range is found set.
+ */
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, int filled, struct extent_state *cached)
+{
+ struct extent_state *state = NULL;
+ int bitset = 0;
+
+ spin_lock(&tree->lock);
+ if (cached && extent_state_in_tree(cached) && cached->start <= start &&
+ cached->end > start)
+ state = cached;
+ else
+ state = tree_search(tree, start);
+ while (state && start <= end) {
+ if (filled && state->start > start) {
+ bitset = 0;
+ break;
+ }
+
+ if (state->start > end)
+ break;
+
+ if (state->state & bits) {
+ bitset = 1;
+ if (!filled)
+ break;
+ } else if (filled) {
+ bitset = 0;
+ break;
+ }
+
+ if (state->end == (u64)-1)
+ break;
+
+ start = state->end + 1;
+ if (start > end)
+ break;
+ state = next_state(state);
+ }
+
+ /* We ran out of states and were still inside of our range. */
+ if (filled && !state)
+ bitset = 0;
+ spin_unlock(&tree->lock);
+ return bitset;
+}
+
+/* Wrappers around set/clear extent bit */
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset)
+{
+ /*
+ * We don't support EXTENT_LOCKED yet, as current changeset will
+ * record any bits changed, so for EXTENT_LOCKED case, it will
+ * either fail with -EEXIST or changeset will record the whole
+ * range.
+ */
+ ASSERT(!(bits & EXTENT_LOCKED));
+
+ return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL,
+ changeset, GFP_NOFS);
+}
+
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset)
+{
+ /*
+ * Don't support EXTENT_LOCKED case, same reason as
+ * set_record_extent_bits().
+ */
+ ASSERT(!(bits & EXTENT_LOCKED));
+
+ return __clear_extent_bit(tree, start, end, bits, NULL, GFP_NOFS,
+ changeset);
+}
+
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached)
+{
+ int err;
+ u64 failed_start;
+
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
+ NULL, cached, NULL, GFP_NOFS);
+ if (err == -EEXIST) {
+ if (failed_start > start)
+ clear_extent_bit(tree, start, failed_start - 1,
+ EXTENT_LOCKED, cached);
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * Either insert or lock state struct between start and end use mask to tell
+ * us if waiting is desired.
+ */
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state)
+{
+ struct extent_state *failed_state = NULL;
+ int err;
+ u64 failed_start;
+
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
+ &failed_state, cached_state, NULL, GFP_NOFS);
+ while (err == -EEXIST) {
+ if (failed_start != start)
+ clear_extent_bit(tree, start, failed_start - 1,
+ EXTENT_LOCKED, cached_state);
+
+ wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED,
+ &failed_state);
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
+ &failed_start, &failed_state,
+ cached_state, NULL, GFP_NOFS);
+ }
+ return err;
+}
+
+void __cold extent_state_free_cachep(void)
+{
+ btrfs_extent_state_leak_debug_check();
+ kmem_cache_destroy(extent_state_cache);
+}
+
+int __init extent_state_init_cachep(void)
+{
+ extent_state_cache = kmem_cache_create("btrfs_extent_state",
+ sizeof(struct extent_state), 0,
+ SLAB_MEM_SPREAD, NULL);
+ if (!extent_state_cache)
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index c3eb52dbe61c..e3eeec380844 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -3,42 +3,54 @@
#ifndef BTRFS_EXTENT_IO_TREE_H
#define BTRFS_EXTENT_IO_TREE_H
+#include "misc.h"
+
struct extent_changeset;
struct io_failure_record;
/* Bits for the extent state */
-#define EXTENT_DIRTY (1U << 0)
-#define EXTENT_UPTODATE (1U << 1)
-#define EXTENT_LOCKED (1U << 2)
-#define EXTENT_NEW (1U << 3)
-#define EXTENT_DELALLOC (1U << 4)
-#define EXTENT_DEFRAG (1U << 5)
-#define EXTENT_BOUNDARY (1U << 6)
-#define EXTENT_NODATASUM (1U << 7)
-#define EXTENT_CLEAR_META_RESV (1U << 8)
-#define EXTENT_NEED_WAIT (1U << 9)
-#define EXTENT_DAMAGED (1U << 10)
-#define EXTENT_NORESERVE (1U << 11)
-#define EXTENT_QGROUP_RESERVED (1U << 12)
-#define EXTENT_CLEAR_DATA_RESV (1U << 13)
-/*
- * Must be cleared only during ordered extent completion or on error paths if we
- * did not manage to submit bios and create the ordered extents for the range.
- * Should not be cleared during page release and page invalidation (if there is
- * an ordered extent in flight), that is left for the ordered extent completion.
- */
-#define EXTENT_DELALLOC_NEW (1U << 14)
-/*
- * When an ordered extent successfully completes for a region marked as a new
- * delalloc range, use this flag when clearing a new delalloc range to indicate
- * that the VFS' inode number of bytes should be incremented and the inode's new
- * delalloc bytes decremented, in an atomic way to prevent races with stat(2).
- */
-#define EXTENT_ADD_INODE_BYTES (1U << 15)
+enum {
+ ENUM_BIT(EXTENT_DIRTY),
+ ENUM_BIT(EXTENT_UPTODATE),
+ ENUM_BIT(EXTENT_LOCKED),
+ ENUM_BIT(EXTENT_NEW),
+ ENUM_BIT(EXTENT_DELALLOC),
+ ENUM_BIT(EXTENT_DEFRAG),
+ ENUM_BIT(EXTENT_BOUNDARY),
+ ENUM_BIT(EXTENT_NODATASUM),
+ ENUM_BIT(EXTENT_CLEAR_META_RESV),
+ ENUM_BIT(EXTENT_NEED_WAIT),
+ ENUM_BIT(EXTENT_NORESERVE),
+ ENUM_BIT(EXTENT_QGROUP_RESERVED),
+ ENUM_BIT(EXTENT_CLEAR_DATA_RESV),
+ /*
+ * Must be cleared only during ordered extent completion or on error
+ * paths if we did not manage to submit bios and create the ordered
+ * extents for the range. Should not be cleared during page release
+ * and page invalidation (if there is an ordered extent in flight),
+ * that is left for the ordered extent completion.
+ */
+ ENUM_BIT(EXTENT_DELALLOC_NEW),
+ /*
+ * When an ordered extent successfully completes for a region marked as
+ * a new delalloc range, use this flag when clearing a new delalloc
+ * range to indicate that the VFS' inode number of bytes should be
+ * incremented and the inode's new delalloc bytes decremented, in an
+ * atomic way to prevent races with stat(2).
+ */
+ ENUM_BIT(EXTENT_ADD_INODE_BYTES),
+ /*
+ * Set during truncate when we're clearing an entire range and we just
+ * want the extent states to go away.
+ */
+ ENUM_BIT(EXTENT_CLEAR_ALL_BITS),
+};
+
#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
EXTENT_CLEAR_DATA_RESV)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | \
- EXTENT_ADD_INODE_BYTES)
+ EXTENT_ADD_INODE_BYTES | \
+ EXTENT_CLEAR_ALL_BITS)
/*
* Redefined bits above which are used only in the device allocation tree,
@@ -56,7 +68,6 @@ enum {
IO_TREE_FS_EXCLUDED_EXTENTS,
IO_TREE_BTREE_INODE_IO,
IO_TREE_INODE_IO,
- IO_TREE_INODE_IO_FAILURE,
IO_TREE_RELOC_BLOCKS,
IO_TREE_TRANS_DIRTY_PAGES,
IO_TREE_ROOT_DIRTY_LOG_PAGES,
@@ -69,9 +80,8 @@ enum {
struct extent_io_tree {
struct rb_root state;
struct btrfs_fs_info *fs_info;
- void *private_data;
- u64 dirty_bytes;
- bool track_uptodate;
+ /* Inode associated with this tree, or NULL. */
+ struct btrfs_inode *inode;
/* Who owns this io tree, should be one of IO_TREE_* */
u8 owner;
@@ -89,109 +99,87 @@ struct extent_state {
refcount_t refs;
u32 state;
- struct io_failure_record *failrec;
-
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
#endif
};
-int __init extent_state_cache_init(void);
-void __cold extent_state_cache_exit(void);
-
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *tree, unsigned int owner,
- void *private_data);
+ struct extent_io_tree *tree, unsigned int owner);
void extent_io_tree_release(struct extent_io_tree *tree);
-int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached);
-
-static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
- return lock_extent_bits(tree, start, end, NULL);
-}
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached);
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached);
-int __init extent_io_init(void);
-void __cold extent_io_exit(void);
+int __init extent_state_init_cachep(void);
+void __cold extent_state_free_cachep(void);
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end,
- u64 max_bytes, u32 bits, int contig);
+ u64 max_bytes, u32 bits, int contig,
+ struct extent_state **cached_state);
void free_extent_state(struct extent_state *state);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, int filled, struct extent_state *cached_state);
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset);
-int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int wake, int delete,
- struct extent_state **cached);
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int wake, int delete,
- struct extent_state **cached, gfp_t mask,
- struct extent_changeset *changeset);
+ u32 bits, struct extent_state **cached, gfp_t mask,
+ struct extent_changeset *changeset);
-static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+static inline int clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 end, u32 bits,
+ struct extent_state **cached)
{
- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL);
+ return __clear_extent_bit(tree, start, end, bits, cached,
+ GFP_NOFS, NULL);
}
-static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
+static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached)
{
- return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
- GFP_NOFS, NULL);
-}
-
-static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree,
- u64 start, u64 end, struct extent_state **cached)
-{
- return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
- GFP_ATOMIC, NULL);
+ return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached,
+ GFP_NOFS, NULL);
}
static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
u64 end, u32 bits)
{
- int wake = 0;
-
- if (bits & EXTENT_LOCKED)
- wake = 1;
-
- return clear_extent_bit(tree, start, end, bits, wake, 0, NULL);
+ return clear_extent_bit(tree, start, end, bits, NULL);
}
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, unsigned exclusive_bits, u64 *failed_start,
- struct extent_state **cached_state, gfp_t mask,
- struct extent_changeset *changeset);
-int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits);
+ u32 bits, struct extent_state **cached_state, gfp_t mask);
+
+static inline int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start,
+ u64 end, u32 bits)
+{
+ return set_extent_bit(tree, start, end, bits, NULL, GFP_NOWAIT);
+}
static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
u64 end, u32 bits)
{
- return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
- NULL);
+ return set_extent_bit(tree, start, end, bits, NULL, GFP_NOFS);
}
static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached_state)
{
- return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
- cached_state, GFP_NOFS, NULL);
+ return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
+ cached_state, GFP_NOFS, NULL);
}
static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
u64 end, gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, NULL,
- mask, NULL);
+ return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, mask);
}
static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
@@ -199,7 +187,7 @@ static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
{
return clear_extent_bit(tree, start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, cached);
+ EXTENT_DO_ACCOUNTING, cached);
}
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -211,30 +199,22 @@ static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
struct extent_state **cached_state)
{
return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits,
- 0, NULL, cached_state, GFP_NOFS, NULL);
+ EXTENT_DELALLOC | extra_bits,
+ cached_state, GFP_NOFS);
}
static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached_state)
{
return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
- 0, NULL, cached_state, GFP_NOFS, NULL);
+ EXTENT_DELALLOC | EXTENT_DEFRAG,
+ cached_state, GFP_NOFS);
}
static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
u64 end)
{
- return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, NULL,
- GFP_NOFS, NULL);
-}
-
-static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state, gfp_t mask)
-{
- return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
- cached_state, mask, NULL);
+ return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS);
}
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -244,24 +224,10 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits);
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits);
-int extent_invalidate_folio(struct extent_io_tree *tree,
- struct folio *folio, size_t offset);
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
u64 *end, u64 max_bytes,
struct extent_state **cached_state);
-
-/* This should be reworked in the future and put elsewhere. */
-struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start);
-int set_state_failrec(struct extent_io_tree *tree, u64 start,
- struct io_failure_record *failrec);
-void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start,
- u64 end);
-int free_io_failure(struct extent_io_tree *failure_tree,
- struct extent_io_tree *io_tree,
- struct io_failure_record *rec);
-int clean_io_failure(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *failure_tree,
- struct extent_io_tree *io_tree, u64 start,
- struct page *page, u64 ino, unsigned int pg_offset);
+void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
+ struct extent_state **cached_state);
#endif /* BTRFS_EXTENT_IO_TREE_H */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6914cd8024ba..892d78c1853c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -36,6 +36,13 @@
#include "rcu-string.h"
#include "zoned.h"
#include "dev-replace.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "root-tree.h"
+#include "file-item.h"
+#include "orphan.h"
+#include "tree-checker.h"
#undef SCRAMBLE_DELAYED_REFS
@@ -2220,6 +2227,12 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
}
if (!mutex_trylock(&head->mutex)) {
+ if (path->nowait) {
+ spin_unlock(&delayed_refs->lock);
+ btrfs_put_transaction(cur_trans);
+ return -EAGAIN;
+ }
+
refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
@@ -2686,13 +2699,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
len = cache->start + cache->length - start;
len = min(len, end + 1 - start);
- down_read(&fs_info->commit_root_sem);
- if (start < cache->last_byte_to_unpin && return_free_space) {
- u64 add_len = min(len, cache->last_byte_to_unpin - start);
-
- btrfs_add_free_space(cache, start, add_len);
- }
- up_read(&fs_info->commit_root_sem);
+ if (return_free_space)
+ btrfs_add_free_space(cache, start, len);
start += len;
total_unpinned += len;
@@ -3294,21 +3302,22 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
}
/*
- * If this is a leaf and there are tree mod log users, we may
- * have recorded mod log operations that point to this leaf.
- * So we must make sure no one reuses this leaf's extent before
- * mod log operations are applied to a node, otherwise after
- * rewinding a node using the mod log operations we get an
- * inconsistent btree, as the leaf's extent may now be used as
- * a node or leaf for another different btree.
+ * If there are tree mod log users we may have recorded mod log
+ * operations for this node. If we re-allocate this node we
+ * could replay operations on this node that happened when it
+ * existed in a completely different root. For example if it
+ * was part of root A, then was reallocated to root B, and we
+ * are doing a btrfs_old_search_slot(root b), we could replay
+ * operations that happened when the block was part of root A,
+ * giving us an inconsistent view of the btree.
+ *
* We are safe from races here because at this point no other
* node or root points to this extent buffer, so if after this
- * check a new tree mod log user joins, it will not be able to
- * find a node pointing to this leaf and record operations that
- * point to this leaf.
+ * check a new tree mod log user joins we will not have an
+ * existing log of operations on this node that we have to
+ * contend with.
*/
- if (btrfs_header_level(buf) == 0 &&
- test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
+ if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
must_pin = true;
if (must_pin || btrfs_is_zoned(fs_info)) {
@@ -3804,7 +3813,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
block_group->start == fs_info->data_reloc_bg ||
fs_info->data_reloc_bg == 0);
- if (block_group->ro || block_group->zoned_data_reloc_ongoing) {
+ if (block_group->ro ||
+ test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
ret = 1;
goto out;
}
@@ -3881,7 +3891,7 @@ out:
* regular extents) at the same time to the same zone, which
* easily break the write pointer.
*/
- block_group->zoned_data_reloc_ongoing = 1;
+ set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
fs_info->data_reloc_bg = 0;
}
spin_unlock(&fs_info->relocation_bg_lock);
@@ -4888,6 +4898,9 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
!test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
lockdep_owner = BTRFS_FS_TREE_OBJECTID;
+ /* btrfs_clean_tree_block() accesses generation field. */
+ btrfs_set_header_generation(buf, trans->transid);
+
/*
* This needs to stay, because we could allocate a freed block from an
* old tree into a new tree, so we need to make sure this new block is
@@ -5249,8 +5262,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
u64 bytenr;
u64 generation;
u64 parent;
+ struct btrfs_tree_parent_check check = { 0 };
struct btrfs_key key;
- struct btrfs_key first_key;
struct btrfs_ref ref = { 0 };
struct extent_buffer *next;
int level = wc->level;
@@ -5272,7 +5285,12 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
}
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
- btrfs_node_key_to_cpu(path->nodes[level], &first_key,
+
+ check.level = level - 1;
+ check.transid = generation;
+ check.owner_root = root->root_key.objectid;
+ check.has_first_key = true;
+ btrfs_node_key_to_cpu(path->nodes[level], &check.first_key,
path->slots[level]);
next = find_extent_buffer(fs_info, bytenr);
@@ -5334,8 +5352,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (!next) {
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
- next = read_tree_block(fs_info, bytenr, root->root_key.objectid,
- generation, level - 1, &first_key);
+ next = read_tree_block(fs_info, bytenr, &check);
if (IS_ERR(next)) {
return PTR_ERR(next);
} else if (!extent_buffer_uptodate(next)) {
@@ -5639,6 +5656,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
*/
int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
{
+ const bool is_reloc_root = (root->root_key.objectid ==
+ BTRFS_TREE_RELOC_OBJECTID);
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
@@ -5798,6 +5817,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
goto out_end_trans;
}
+ if (!is_reloc_root)
+ btrfs_set_last_root_drop_gen(fs_info, trans->transid);
+
btrfs_end_transaction_throttle(trans);
if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
btrfs_debug(fs_info,
@@ -5832,7 +5854,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
goto out_end_trans;
}
- if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+ if (!is_reloc_root) {
ret = btrfs_find_root(tree_root, &root->root_key, path,
NULL, NULL);
if (ret < 0) {
@@ -5864,6 +5886,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
btrfs_put_root(root);
root_dropped = true;
out_end_trans:
+ if (!is_reloc_root)
+ btrfs_set_last_root_drop_gen(fs_info, trans->transid);
+
btrfs_end_transaction_throttle(trans);
out_free:
kfree(wc);
@@ -5959,40 +5984,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
return ret;
}
-/*
- * helper to account the unused space of all the readonly block group in the
- * space_info. takes mirrors into account.
- */
-u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
-{
- struct btrfs_block_group *block_group;
- u64 free_bytes = 0;
- int factor;
-
- /* It's df, we don't care if it's racy */
- if (list_empty(&sinfo->ro_bgs))
- return 0;
-
- spin_lock(&sinfo->lock);
- list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
- spin_lock(&block_group->lock);
-
- if (!block_group->ro) {
- spin_unlock(&block_group->lock);
- continue;
- }
-
- factor = btrfs_bg_type_to_factor(block_group->flags);
- free_bytes += (block_group->length -
- block_group->used) * factor;
-
- spin_unlock(&block_group->lock);
- }
- spin_unlock(&sinfo->lock);
-
- return free_bytes;
-}
-
int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 start, u64 end)
{
@@ -6058,7 +6049,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
btrfs_warn_in_rcu(fs_info,
"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
start, end - start + 1,
- rcu_str_deref(device->name),
+ btrfs_dev_name(device),
device->total_bytes);
mutex_unlock(&fs_info->chunk_mutex);
ret = 0;
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
new file mode 100644
index 000000000000..ae5425253603
--- /dev/null
+++ b/fs/btrfs/extent-tree.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_EXTENT_TREE_H
+#define BTRFS_EXTENT_TREE_H
+
+enum btrfs_inline_ref_type {
+ BTRFS_REF_TYPE_INVALID,
+ BTRFS_REF_TYPE_BLOCK,
+ BTRFS_REF_TYPE_DATA,
+ BTRFS_REF_TYPE_ANY,
+};
+
+int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
+ struct btrfs_extent_inline_ref *iref,
+ enum btrfs_inline_ref_type is_data);
+u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
+
+int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
+ u64 start, u64 num_bytes);
+void btrfs_free_excluded_extents(struct btrfs_block_group *cache);
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count);
+void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head);
+int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 offset, int metadata, u64 *refs, u64 *flags);
+int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
+ int reserved);
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes);
+int btrfs_exclude_logged_extents(struct extent_buffer *eb);
+int btrfs_cross_ref_exist(struct btrfs_root *root,
+ u64 objectid, u64 offset, u64 bytenr, bool strict,
+ struct btrfs_path *path);
+struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 parent, u64 root_objectid,
+ const struct btrfs_disk_key *key,
+ int level, u64 hint,
+ u64 empty_size,
+ enum btrfs_lock_nesting nest);
+void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+ u64 root_id,
+ struct extent_buffer *buf,
+ u64 parent, int last_ref);
+int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 owner,
+ u64 offset, u64 ram_bytes,
+ struct btrfs_key *ins);
+int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
+ u64 root_objectid, u64 owner, u64 offset,
+ struct btrfs_key *ins);
+int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes,
+ u64 min_alloc_size, u64 empty_size, u64 hint_byte,
+ struct btrfs_key *ins, int is_data, int delalloc);
+int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf, int full_backref);
+int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf, int full_backref);
+int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
+ struct extent_buffer *eb, u64 flags, int level);
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
+
+int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
+ u64 start, u64 len, int delalloc);
+int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len);
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
+int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref);
+int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
+ int for_reloc);
+int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *node,
+ struct extent_buffer *parent);
+
+#endif
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index cf4f19e80e2f..83dd3aa59663 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -20,7 +20,7 @@
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
-#include "volumes.h"
+#include "bio.h"
#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"
@@ -30,39 +30,34 @@
#include "zoned.h"
#include "block-group.h"
#include "compression.h"
+#include "fs.h"
+#include "accessors.h"
+#include "file-item.h"
+#include "file.h"
+#include "dev-replace.h"
+#include "super.h"
-static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
-static struct bio_set btrfs_bioset;
-
-static inline bool extent_state_in_tree(const struct extent_state *state)
-{
- return !RB_EMPTY_NODE(&state->rb_node);
-}
#ifdef CONFIG_BTRFS_DEBUG
-static LIST_HEAD(states);
-static DEFINE_SPINLOCK(leak_lock);
-
-static inline void btrfs_leak_debug_add(spinlock_t *lock,
- struct list_head *new,
- struct list_head *head)
+static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
unsigned long flags;
- spin_lock_irqsave(lock, flags);
- list_add(new, head);
- spin_unlock_irqrestore(lock, flags);
+ spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
+ list_add(&eb->leak_list, &fs_info->allocated_ebs);
+ spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}
-static inline void btrfs_leak_debug_del(spinlock_t *lock,
- struct list_head *entry)
+static inline void btrfs_leak_debug_del_eb(struct extent_buffer *eb)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
unsigned long flags;
- spin_lock_irqsave(lock, flags);
- list_del(entry);
- spin_unlock_irqrestore(lock, flags);
+ spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
+ list_del(&eb->leak_list);
+ spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}
void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
@@ -91,53 +86,11 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
}
spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}
-
-static inline void btrfs_extent_state_leak_debug_check(void)
-{
- struct extent_state *state;
-
- while (!list_empty(&states)) {
- state = list_entry(states.next, struct extent_state, leak_list);
- pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
- state->start, state->end, state->state,
- extent_state_in_tree(state),
- refcount_read(&state->refs));
- list_del(&state->leak_list);
- kmem_cache_free(extent_state_cache, state);
- }
-}
-
-#define btrfs_debug_check_extent_io_range(tree, start, end) \
- __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
-static inline void __btrfs_debug_check_extent_io_range(const char *caller,
- struct extent_io_tree *tree, u64 start, u64 end)
-{
- struct inode *inode = tree->private_data;
- u64 isize;
-
- if (!inode || !is_data_inode(inode))
- return;
-
- isize = i_size_read(inode);
- if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
- btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
- "%s: ino %llu isize %llu odd range [%llu,%llu]",
- caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
- }
-}
#else
-#define btrfs_leak_debug_add(lock, new, head) do {} while (0)
-#define btrfs_leak_debug_del(lock, entry) do {} while (0)
-#define btrfs_extent_state_leak_debug_check() do {} while (0)
-#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
+#define btrfs_leak_debug_add_eb(eb) do {} while (0)
+#define btrfs_leak_debug_del_eb(eb) do {} while (0)
#endif
-struct tree_entry {
- u64 start;
- u64 end;
- struct rb_node rb_node;
-};
-
/*
* Structure to record info about the bio being assembled, and other info like
* how many bytes are there before stripe/ordered extent boundary.
@@ -148,42 +101,23 @@ struct btrfs_bio_ctrl {
enum btrfs_compression_type compress_type;
u32 len_to_stripe_boundary;
u32 len_to_oe_boundary;
-};
+ btrfs_bio_end_io_t end_io_func;
-struct extent_page_data {
- struct btrfs_bio_ctrl bio_ctrl;
- /* tells writepage not to lock the state bits for this range
- * it still does the unlocking
+ /*
+ * Tell writepage not to lock the state bits for this range, it still
+ * does the unlocking.
*/
- unsigned int extent_locked:1;
+ bool extent_locked;
- /* tells the submit_bio code to use REQ_SYNC */
- unsigned int sync_io:1;
+ /* Tell the submit_bio code to use REQ_SYNC */
+ bool sync_io;
};
-static int add_extent_changeset(struct extent_state *state, u32 bits,
- struct extent_changeset *changeset,
- int set)
-{
- int ret;
-
- if (!changeset)
- return 0;
- if (set && (state->state & bits) == bits)
- return 0;
- if (!set && (state->state & bits) == 0)
- return 0;
- changeset->bytes_changed += state->end - state->start + 1;
- ret = ulist_add(&changeset->range_changed, state->start, state->end,
- GFP_ATOMIC);
- return ret;
-}
-
static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
{
struct bio *bio;
struct bio_vec *bv;
- struct inode *inode;
+ struct btrfs_inode *inode;
int mirror_num;
if (!bio_ctrl->bio)
@@ -191,7 +125,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
bio = bio_ctrl->bio;
bv = bio_first_bvec_all(bio);
- inode = bv->bv_page->mapping->host;
+ inode = BTRFS_I(bv->bv_page->mapping->host);
mirror_num = bio_ctrl->mirror_num;
/* Caller should ensure the bio has at least some range added */
@@ -199,7 +133,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset;
- if (!is_data_inode(inode))
+ if (!is_data_inode(&inode->vfs_inode))
btrfs_submit_metadata_bio(inode, bio, mirror_num);
else if (btrfs_op(bio) == BTRFS_MAP_WRITE)
btrfs_submit_data_write_bio(inode, bio, mirror_num);
@@ -207,42 +141,31 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
btrfs_submit_data_read_bio(inode, bio, mirror_num,
bio_ctrl->compress_type);
- /* The bio is owned by the bi_end_io handler now */
+ /* The bio is owned by the end_io handler now */
bio_ctrl->bio = NULL;
}
/*
- * Submit or fail the current bio in an extent_page_data structure.
+ * Submit or fail the current bio in the bio_ctrl structure.
*/
-static void submit_write_bio(struct extent_page_data *epd, int ret)
+static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret)
{
- struct bio *bio = epd->bio_ctrl.bio;
+ struct bio *bio = bio_ctrl->bio;
if (!bio)
return;
if (ret) {
ASSERT(ret < 0);
- bio->bi_status = errno_to_blk_status(ret);
- bio_endio(bio);
- /* The bio is owned by the bi_end_io handler now */
- epd->bio_ctrl.bio = NULL;
+ btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
+ /* The bio is owned by the end_io handler now */
+ bio_ctrl->bio = NULL;
} else {
- submit_one_bio(&epd->bio_ctrl);
+ submit_one_bio(bio_ctrl);
}
}
-int __init extent_state_cache_init(void)
-{
- extent_state_cache = kmem_cache_create("btrfs_extent_state",
- sizeof(struct extent_state), 0,
- SLAB_MEM_SPREAD, NULL);
- if (!extent_state_cache)
- return -ENOMEM;
- return 0;
-}
-
-int __init extent_io_init(void)
+int __init extent_buffer_init_cachep(void)
{
extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
sizeof(struct extent_buffer), 0,
@@ -250,32 +173,10 @@ int __init extent_io_init(void)
if (!extent_buffer_cache)
return -ENOMEM;
- if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
- offsetof(struct btrfs_bio, bio),
- BIOSET_NEED_BVECS))
- goto free_buffer_cache;
-
- if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
- goto free_bioset;
-
return 0;
-
-free_bioset:
- bioset_exit(&btrfs_bioset);
-
-free_buffer_cache:
- kmem_cache_destroy(extent_buffer_cache);
- extent_buffer_cache = NULL;
- return -ENOMEM;
-}
-
-void __cold extent_state_cache_exit(void)
-{
- btrfs_extent_state_leak_debug_check();
- kmem_cache_destroy(extent_state_cache);
}
-void __cold extent_io_exit(void)
+void __cold extent_buffer_free_cachep(void)
{
/*
* Make sure all delayed rcu free are flushed before we
@@ -283,1244 +184,6 @@ void __cold extent_io_exit(void)
*/
rcu_barrier();
kmem_cache_destroy(extent_buffer_cache);
- bioset_exit(&btrfs_bioset);
-}
-
-/*
- * For the file_extent_tree, we want to hold the inode lock when we lookup and
- * update the disk_i_size, but lockdep will complain because our io_tree we hold
- * the tree lock and get the inode lock when setting delalloc. These two things
- * are unrelated, so make a class for the file_extent_tree so we don't get the
- * two locking patterns mixed up.
- */
-static struct lock_class_key file_extent_tree_class;
-
-void extent_io_tree_init(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *tree, unsigned int owner,
- void *private_data)
-{
- tree->fs_info = fs_info;
- tree->state = RB_ROOT;
- tree->dirty_bytes = 0;
- spin_lock_init(&tree->lock);
- tree->private_data = private_data;
- tree->owner = owner;
- if (owner == IO_TREE_INODE_FILE_EXTENT)
- lockdep_set_class(&tree->lock, &file_extent_tree_class);
-}
-
-void extent_io_tree_release(struct extent_io_tree *tree)
-{
- spin_lock(&tree->lock);
- /*
- * Do a single barrier for the waitqueue_active check here, the state
- * of the waitqueue should not change once extent_io_tree_release is
- * called.
- */
- smp_mb();
- while (!RB_EMPTY_ROOT(&tree->state)) {
- struct rb_node *node;
- struct extent_state *state;
-
- node = rb_first(&tree->state);
- state = rb_entry(node, struct extent_state, rb_node);
- rb_erase(&state->rb_node, &tree->state);
- RB_CLEAR_NODE(&state->rb_node);
- /*
- * btree io trees aren't supposed to have tasks waiting for
- * changes in the flags of extent states ever.
- */
- ASSERT(!waitqueue_active(&state->wq));
- free_extent_state(state);
-
- cond_resched_lock(&tree->lock);
- }
- spin_unlock(&tree->lock);
-}
-
-static struct extent_state *alloc_extent_state(gfp_t mask)
-{
- struct extent_state *state;
-
- /*
- * The given mask might be not appropriate for the slab allocator,
- * drop the unsupported bits
- */
- mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
- state = kmem_cache_alloc(extent_state_cache, mask);
- if (!state)
- return state;
- state->state = 0;
- state->failrec = NULL;
- RB_CLEAR_NODE(&state->rb_node);
- btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
- refcount_set(&state->refs, 1);
- init_waitqueue_head(&state->wq);
- trace_alloc_extent_state(state, mask, _RET_IP_);
- return state;
-}
-
-void free_extent_state(struct extent_state *state)
-{
- if (!state)
- return;
- if (refcount_dec_and_test(&state->refs)) {
- WARN_ON(extent_state_in_tree(state));
- btrfs_leak_debug_del(&leak_lock, &state->leak_list);
- trace_free_extent_state(state, _RET_IP_);
- kmem_cache_free(extent_state_cache, state);
- }
-}
-
-/**
- * Search @tree for an entry that contains @offset. Such entry would have
- * entry->start <= offset && entry->end >= offset.
- *
- * @tree: the tree to search
- * @offset: offset that should fall within an entry in @tree
- * @node_ret: pointer where new node should be anchored (used when inserting an
- * entry in the tree)
- * @parent_ret: points to entry which would have been the parent of the entry,
- * containing @offset
- *
- * Return a pointer to the entry that contains @offset byte address and don't change
- * @node_ret and @parent_ret.
- *
- * If no such entry exists, return pointer to entry that ends before @offset
- * and fill parameters @node_ret and @parent_ret, ie. does not return NULL.
- */
-static inline struct rb_node *tree_search_for_insert(struct extent_io_tree *tree,
- u64 offset,
- struct rb_node ***node_ret,
- struct rb_node **parent_ret)
-{
- struct rb_root *root = &tree->state;
- struct rb_node **node = &root->rb_node;
- struct rb_node *prev = NULL;
- struct tree_entry *entry;
-
- while (*node) {
- prev = *node;
- entry = rb_entry(prev, struct tree_entry, rb_node);
-
- if (offset < entry->start)
- node = &(*node)->rb_left;
- else if (offset > entry->end)
- node = &(*node)->rb_right;
- else
- return *node;
- }
-
- if (node_ret)
- *node_ret = node;
- if (parent_ret)
- *parent_ret = prev;
-
- /* Search neighbors until we find the first one past the end */
- while (prev && offset > entry->end) {
- prev = rb_next(prev);
- entry = rb_entry(prev, struct tree_entry, rb_node);
- }
-
- return prev;
-}
-
-/*
- * Inexact rb-tree search, return the next entry if @offset is not found
- */
-static inline struct rb_node *tree_search(struct extent_io_tree *tree, u64 offset)
-{
- return tree_search_for_insert(tree, offset, NULL, NULL);
-}
-
-/**
- * Search offset in the tree or fill neighbor rbtree node pointers.
- *
- * @tree: the tree to search
- * @offset: offset that should fall within an entry in @tree
- * @next_ret: pointer to the first entry whose range ends after @offset
- * @prev_ret: pointer to the first entry whose range begins before @offset
- *
- * Return a pointer to the entry that contains @offset byte address. If no
- * such entry exists, then return NULL and fill @prev_ret and @next_ret.
- * Otherwise return the found entry and other pointers are left untouched.
- */
-static struct rb_node *tree_search_prev_next(struct extent_io_tree *tree,
- u64 offset,
- struct rb_node **prev_ret,
- struct rb_node **next_ret)
-{
- struct rb_root *root = &tree->state;
- struct rb_node **node = &root->rb_node;
- struct rb_node *prev = NULL;
- struct rb_node *orig_prev = NULL;
- struct tree_entry *entry;
-
- ASSERT(prev_ret);
- ASSERT(next_ret);
-
- while (*node) {
- prev = *node;
- entry = rb_entry(prev, struct tree_entry, rb_node);
-
- if (offset < entry->start)
- node = &(*node)->rb_left;
- else if (offset > entry->end)
- node = &(*node)->rb_right;
- else
- return *node;
- }
-
- orig_prev = prev;
- while (prev && offset > entry->end) {
- prev = rb_next(prev);
- entry = rb_entry(prev, struct tree_entry, rb_node);
- }
- *next_ret = prev;
- prev = orig_prev;
-
- entry = rb_entry(prev, struct tree_entry, rb_node);
- while (prev && offset < entry->start) {
- prev = rb_prev(prev);
- entry = rb_entry(prev, struct tree_entry, rb_node);
- }
- *prev_ret = prev;
-
- return NULL;
-}
-
-/*
- * utility function to look for merge candidates inside a given range.
- * Any extents with matching state are merged together into a single
- * extent in the tree. Extents with EXTENT_IO in their state field
- * are not merged because the end_io handlers need to be able to do
- * operations on them without sleeping (or doing allocations/splits).
- *
- * This should be called with the tree lock held.
- */
-static void merge_state(struct extent_io_tree *tree,
- struct extent_state *state)
-{
- struct extent_state *other;
- struct rb_node *other_node;
-
- if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
- return;
-
- other_node = rb_prev(&state->rb_node);
- if (other_node) {
- other = rb_entry(other_node, struct extent_state, rb_node);
- if (other->end == state->start - 1 &&
- other->state == state->state) {
- if (tree->private_data &&
- is_data_inode(tree->private_data))
- btrfs_merge_delalloc_extent(tree->private_data,
- state, other);
- state->start = other->start;
- rb_erase(&other->rb_node, &tree->state);
- RB_CLEAR_NODE(&other->rb_node);
- free_extent_state(other);
- }
- }
- other_node = rb_next(&state->rb_node);
- if (other_node) {
- other = rb_entry(other_node, struct extent_state, rb_node);
- if (other->start == state->end + 1 &&
- other->state == state->state) {
- if (tree->private_data &&
- is_data_inode(tree->private_data))
- btrfs_merge_delalloc_extent(tree->private_data,
- state, other);
- state->end = other->end;
- rb_erase(&other->rb_node, &tree->state);
- RB_CLEAR_NODE(&other->rb_node);
- free_extent_state(other);
- }
- }
-}
-
-static void set_state_bits(struct extent_io_tree *tree,
- struct extent_state *state, u32 bits,
- struct extent_changeset *changeset);
-
-/*
- * insert an extent_state struct into the tree. 'bits' are set on the
- * struct before it is inserted.
- *
- * This may return -EEXIST if the extent is already there, in which case the
- * state struct is freed.
- *
- * The tree lock is not taken internally. This is a utility function and
- * probably isn't what you want to call (see set/clear_extent_bit).
- */
-static int insert_state(struct extent_io_tree *tree,
- struct extent_state *state,
- u32 bits, struct extent_changeset *changeset)
-{
- struct rb_node **node;
- struct rb_node *parent;
- const u64 end = state->end;
-
- set_state_bits(tree, state, bits, changeset);
-
- node = &tree->state.rb_node;
- while (*node) {
- struct tree_entry *entry;
-
- parent = *node;
- entry = rb_entry(parent, struct tree_entry, rb_node);
-
- if (end < entry->start) {
- node = &(*node)->rb_left;
- } else if (end > entry->end) {
- node = &(*node)->rb_right;
- } else {
- btrfs_err(tree->fs_info,
- "found node %llu %llu on insert of %llu %llu",
- entry->start, entry->end, state->start, end);
- return -EEXIST;
- }
- }
-
- rb_link_node(&state->rb_node, parent, node);
- rb_insert_color(&state->rb_node, &tree->state);
-
- merge_state(tree, state);
- return 0;
-}
-
-/*
- * Insert state to @tree to the location given by @node and @parent.
- */
-static void insert_state_fast(struct extent_io_tree *tree,
- struct extent_state *state, struct rb_node **node,
- struct rb_node *parent, unsigned bits,
- struct extent_changeset *changeset)
-{
- set_state_bits(tree, state, bits, changeset);
- rb_link_node(&state->rb_node, parent, node);
- rb_insert_color(&state->rb_node, &tree->state);
- merge_state(tree, state);
-}
-
-/*
- * split a given extent state struct in two, inserting the preallocated
- * struct 'prealloc' as the newly created second half. 'split' indicates an
- * offset inside 'orig' where it should be split.
- *
- * Before calling,
- * the tree has 'orig' at [orig->start, orig->end]. After calling, there
- * are two extent state structs in the tree:
- * prealloc: [orig->start, split - 1]
- * orig: [ split, orig->end ]
- *
- * The tree locks are not taken by this function. They need to be held
- * by the caller.
- */
-static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
- struct extent_state *prealloc, u64 split)
-{
- struct rb_node *parent = NULL;
- struct rb_node **node;
-
- if (tree->private_data && is_data_inode(tree->private_data))
- btrfs_split_delalloc_extent(tree->private_data, orig, split);
-
- prealloc->start = orig->start;
- prealloc->end = split - 1;
- prealloc->state = orig->state;
- orig->start = split;
-
- parent = &orig->rb_node;
- node = &parent;
- while (*node) {
- struct tree_entry *entry;
-
- parent = *node;
- entry = rb_entry(parent, struct tree_entry, rb_node);
-
- if (prealloc->end < entry->start) {
- node = &(*node)->rb_left;
- } else if (prealloc->end > entry->end) {
- node = &(*node)->rb_right;
- } else {
- free_extent_state(prealloc);
- return -EEXIST;
- }
- }
-
- rb_link_node(&prealloc->rb_node, parent, node);
- rb_insert_color(&prealloc->rb_node, &tree->state);
-
- return 0;
-}
-
-static struct extent_state *next_state(struct extent_state *state)
-{
- struct rb_node *next = rb_next(&state->rb_node);
- if (next)
- return rb_entry(next, struct extent_state, rb_node);
- else
- return NULL;
-}
-
-/*
- * utility function to clear some bits in an extent state struct.
- * it will optionally wake up anyone waiting on this state (wake == 1).
- *
- * If no bits are set on the state struct after clearing things, the
- * struct is freed and removed from the tree
- */
-static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
- struct extent_state *state,
- u32 bits, int wake,
- struct extent_changeset *changeset)
-{
- struct extent_state *next;
- u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
- int ret;
-
- if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
- u64 range = state->end - state->start + 1;
- WARN_ON(range > tree->dirty_bytes);
- tree->dirty_bytes -= range;
- }
-
- if (tree->private_data && is_data_inode(tree->private_data))
- btrfs_clear_delalloc_extent(tree->private_data, state, bits);
-
- ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
- BUG_ON(ret < 0);
- state->state &= ~bits_to_clear;
- if (wake)
- wake_up(&state->wq);
- if (state->state == 0) {
- next = next_state(state);
- if (extent_state_in_tree(state)) {
- rb_erase(&state->rb_node, &tree->state);
- RB_CLEAR_NODE(&state->rb_node);
- free_extent_state(state);
- } else {
- WARN_ON(1);
- }
- } else {
- merge_state(tree, state);
- next = next_state(state);
- }
- return next;
-}
-
-static struct extent_state *
-alloc_extent_state_atomic(struct extent_state *prealloc)
-{
- if (!prealloc)
- prealloc = alloc_extent_state(GFP_ATOMIC);
-
- return prealloc;
-}
-
-static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
-{
- btrfs_panic(tree->fs_info, err,
- "locking error: extent tree was modified by another thread while locked");
-}
-
-/*
- * clear some bits on a range in the tree. This may require splitting
- * or inserting elements in the tree, so the gfp mask is used to
- * indicate which allocations or sleeping are allowed.
- *
- * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
- * the given range from the tree regardless of state (ie for truncate).
- *
- * the range [start, end] is inclusive.
- *
- * This takes the tree lock, and returns 0 on success and < 0 on error.
- */
-int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int wake, int delete,
- struct extent_state **cached_state,
- gfp_t mask, struct extent_changeset *changeset)
-{
- struct extent_state *state;
- struct extent_state *cached;
- struct extent_state *prealloc = NULL;
- struct rb_node *node;
- u64 last_end;
- int err;
- int clear = 0;
-
- btrfs_debug_check_extent_io_range(tree, start, end);
- trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
-
- if (bits & EXTENT_DELALLOC)
- bits |= EXTENT_NORESERVE;
-
- if (delete)
- bits |= ~EXTENT_CTLBITS;
-
- if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
- clear = 1;
-again:
- if (!prealloc && gfpflags_allow_blocking(mask)) {
- /*
- * Don't care for allocation failure here because we might end
- * up not needing the pre-allocated extent state at all, which
- * is the case if we only have in the tree extent states that
- * cover our input range and don't cover too any other range.
- * If we end up needing a new extent state we allocate it later.
- */
- prealloc = alloc_extent_state(mask);
- }
-
- spin_lock(&tree->lock);
- if (cached_state) {
- cached = *cached_state;
-
- if (clear) {
- *cached_state = NULL;
- cached_state = NULL;
- }
-
- if (cached && extent_state_in_tree(cached) &&
- cached->start <= start && cached->end > start) {
- if (clear)
- refcount_dec(&cached->refs);
- state = cached;
- goto hit_next;
- }
- if (clear)
- free_extent_state(cached);
- }
- /*
- * this search will find the extents that end after
- * our range starts
- */
- node = tree_search(tree, start);
- if (!node)
- goto out;
- state = rb_entry(node, struct extent_state, rb_node);
-hit_next:
- if (state->start > end)
- goto out;
- WARN_ON(state->end < start);
- last_end = state->end;
-
- /* the state doesn't have the wanted bits, go ahead */
- if (!(state->state & bits)) {
- state = next_state(state);
- goto next;
- }
-
- /*
- * | ---- desired range ---- |
- * | state | or
- * | ------------- state -------------- |
- *
- * We need to split the extent we found, and may flip
- * bits on second half.
- *
- * If the extent we found extends past our range, we
- * just split and search again. It'll get split again
- * the next time though.
- *
- * If the extent we found is inside our range, we clear
- * the desired bit on it.
- */
-
- if (state->start < start) {
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- err = split_state(tree, state, prealloc, start);
- if (err)
- extent_io_tree_panic(tree, err);
-
- prealloc = NULL;
- if (err)
- goto out;
- if (state->end <= end) {
- state = clear_state_bit(tree, state, bits, wake, changeset);
- goto next;
- }
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state |
- * We need to split the extent, and clear the bit
- * on the first half
- */
- if (state->start <= end && state->end > end) {
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- err = split_state(tree, state, prealloc, end + 1);
- if (err)
- extent_io_tree_panic(tree, err);
-
- if (wake)
- wake_up(&state->wq);
-
- clear_state_bit(tree, prealloc, bits, wake, changeset);
-
- prealloc = NULL;
- goto out;
- }
-
- state = clear_state_bit(tree, state, bits, wake, changeset);
-next:
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- if (start <= end && state && !need_resched())
- goto hit_next;
-
-search_again:
- if (start > end)
- goto out;
- spin_unlock(&tree->lock);
- if (gfpflags_allow_blocking(mask))
- cond_resched();
- goto again;
-
-out:
- spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
-
- return 0;
-
-}
-
-static void wait_on_state(struct extent_io_tree *tree,
- struct extent_state *state)
- __releases(tree->lock)
- __acquires(tree->lock)
-{
- DEFINE_WAIT(wait);
- prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
- spin_unlock(&tree->lock);
- schedule();
- spin_lock(&tree->lock);
- finish_wait(&state->wq, &wait);
-}
-
-/*
- * waits for one or more bits to clear on a range in the state tree.
- * The range [start, end] is inclusive.
- * The tree lock is taken by this function
- */
-static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits)
-{
- struct extent_state *state;
- struct rb_node *node;
-
- btrfs_debug_check_extent_io_range(tree, start, end);
-
- spin_lock(&tree->lock);
-again:
- while (1) {
- /*
- * this search will find all the extents that end after
- * our range starts
- */
- node = tree_search(tree, start);
-process_node:
- if (!node)
- break;
-
- state = rb_entry(node, struct extent_state, rb_node);
-
- if (state->start > end)
- goto out;
-
- if (state->state & bits) {
- start = state->start;
- refcount_inc(&state->refs);
- wait_on_state(tree, state);
- free_extent_state(state);
- goto again;
- }
- start = state->end + 1;
-
- if (start > end)
- break;
-
- if (!cond_resched_lock(&tree->lock)) {
- node = rb_next(node);
- goto process_node;
- }
- }
-out:
- spin_unlock(&tree->lock);
-}
-
-static void set_state_bits(struct extent_io_tree *tree,
- struct extent_state *state,
- u32 bits, struct extent_changeset *changeset)
-{
- u32 bits_to_set = bits & ~EXTENT_CTLBITS;
- int ret;
-
- if (tree->private_data && is_data_inode(tree->private_data))
- btrfs_set_delalloc_extent(tree->private_data, state, bits);
-
- if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
- u64 range = state->end - state->start + 1;
- tree->dirty_bytes += range;
- }
- ret = add_extent_changeset(state, bits_to_set, changeset, 1);
- BUG_ON(ret < 0);
- state->state |= bits_to_set;
-}
-
-static void cache_state_if_flags(struct extent_state *state,
- struct extent_state **cached_ptr,
- unsigned flags)
-{
- if (cached_ptr && !(*cached_ptr)) {
- if (!flags || (state->state & flags)) {
- *cached_ptr = state;
- refcount_inc(&state->refs);
- }
- }
-}
-
-static void cache_state(struct extent_state *state,
- struct extent_state **cached_ptr)
-{
- return cache_state_if_flags(state, cached_ptr,
- EXTENT_LOCKED | EXTENT_BOUNDARY);
-}
-
-/*
- * set some bits on a range in the tree. This may require allocations or
- * sleeping, so the gfp mask is used to indicate what is allowed.
- *
- * If any of the exclusive bits are set, this will fail with -EEXIST if some
- * part of the range already has the desired bits set. The start of the
- * existing range is returned in failed_start in this case.
- *
- * [start, end] is inclusive This takes the tree lock.
- */
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
- u32 exclusive_bits, u64 *failed_start,
- struct extent_state **cached_state, gfp_t mask,
- struct extent_changeset *changeset)
-{
- struct extent_state *state;
- struct extent_state *prealloc = NULL;
- struct rb_node *node;
- struct rb_node **p;
- struct rb_node *parent;
- int err = 0;
- u64 last_start;
- u64 last_end;
-
- btrfs_debug_check_extent_io_range(tree, start, end);
- trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
-
- if (exclusive_bits)
- ASSERT(failed_start);
- else
- ASSERT(failed_start == NULL);
-again:
- if (!prealloc && gfpflags_allow_blocking(mask)) {
- /*
- * Don't care for allocation failure here because we might end
- * up not needing the pre-allocated extent state at all, which
- * is the case if we only have in the tree extent states that
- * cover our input range and don't cover too any other range.
- * If we end up needing a new extent state we allocate it later.
- */
- prealloc = alloc_extent_state(mask);
- }
-
- spin_lock(&tree->lock);
- if (cached_state && *cached_state) {
- state = *cached_state;
- if (state->start <= start && state->end > start &&
- extent_state_in_tree(state)) {
- node = &state->rb_node;
- goto hit_next;
- }
- }
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search_for_insert(tree, start, &p, &parent);
- if (!node) {
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- prealloc->start = start;
- prealloc->end = end;
- insert_state_fast(tree, prealloc, p, parent, bits, changeset);
- cache_state(prealloc, cached_state);
- prealloc = NULL;
- goto out;
- }
- state = rb_entry(node, struct extent_state, rb_node);
-hit_next:
- last_start = state->start;
- last_end = state->end;
-
- /*
- * | ---- desired range ---- |
- * | state |
- *
- * Just lock what we found and keep going
- */
- if (state->start == start && state->end <= end) {
- if (state->state & exclusive_bits) {
- *failed_start = state->start;
- err = -EEXIST;
- goto out;
- }
-
- set_state_bits(tree, state, bits, changeset);
- cache_state(state, cached_state);
- merge_state(tree, state);
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- state = next_state(state);
- if (start < end && state && state->start == start &&
- !need_resched())
- goto hit_next;
- goto search_again;
- }
-
- /*
- * | ---- desired range ---- |
- * | state |
- * or
- * | ------------- state -------------- |
- *
- * We need to split the extent we found, and may flip bits on
- * second half.
- *
- * If the extent we found extends past our
- * range, we just split and search again. It'll get split
- * again the next time though.
- *
- * If the extent we found is inside our range, we set the
- * desired bit on it.
- */
- if (state->start < start) {
- if (state->state & exclusive_bits) {
- *failed_start = start;
- err = -EEXIST;
- goto out;
- }
-
- /*
- * If this extent already has all the bits we want set, then
- * skip it, not necessary to split it or do anything with it.
- */
- if ((state->state & bits) == bits) {
- start = state->end + 1;
- cache_state(state, cached_state);
- goto search_again;
- }
-
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- err = split_state(tree, state, prealloc, start);
- if (err)
- extent_io_tree_panic(tree, err);
-
- prealloc = NULL;
- if (err)
- goto out;
- if (state->end <= end) {
- set_state_bits(tree, state, bits, changeset);
- cache_state(state, cached_state);
- merge_state(tree, state);
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- state = next_state(state);
- if (start < end && state && state->start == start &&
- !need_resched())
- goto hit_next;
- }
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state | or | state |
- *
- * There's a hole, we need to insert something in it and
- * ignore the extent we found.
- */
- if (state->start > start) {
- u64 this_end;
- if (end < last_start)
- this_end = end;
- else
- this_end = last_start - 1;
-
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
-
- /*
- * Avoid to free 'prealloc' if it can be merged with
- * the later extent.
- */
- prealloc->start = start;
- prealloc->end = this_end;
- err = insert_state(tree, prealloc, bits, changeset);
- if (err)
- extent_io_tree_panic(tree, err);
-
- cache_state(prealloc, cached_state);
- prealloc = NULL;
- start = this_end + 1;
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state |
- * We need to split the extent, and set the bit
- * on the first half
- */
- if (state->start <= end && state->end > end) {
- if (state->state & exclusive_bits) {
- *failed_start = start;
- err = -EEXIST;
- goto out;
- }
-
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- err = split_state(tree, state, prealloc, end + 1);
- if (err)
- extent_io_tree_panic(tree, err);
-
- set_state_bits(tree, prealloc, bits, changeset);
- cache_state(prealloc, cached_state);
- merge_state(tree, prealloc);
- prealloc = NULL;
- goto out;
- }
-
-search_again:
- if (start > end)
- goto out;
- spin_unlock(&tree->lock);
- if (gfpflags_allow_blocking(mask))
- cond_resched();
- goto again;
-
-out:
- spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
-
- return err;
-
-}
-
-/**
- * convert_extent_bit - convert all bits in a given range from one bit to
- * another
- * @tree: the io tree to search
- * @start: the start offset in bytes
- * @end: the end offset in bytes (inclusive)
- * @bits: the bits to set in this range
- * @clear_bits: the bits to clear in this range
- * @cached_state: state that we're going to cache
- *
- * This will go through and set bits for the given range. If any states exist
- * already in this range they are set with the given bit and cleared of the
- * clear_bits. This is only meant to be used by things that are mergeable, ie
- * converting from say DELALLOC to DIRTY. This is not meant to be used with
- * boundary bits like LOCK.
- *
- * All allocations are done with GFP_NOFS.
- */
-int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, u32 clear_bits,
- struct extent_state **cached_state)
-{
- struct extent_state *state;
- struct extent_state *prealloc = NULL;
- struct rb_node *node;
- struct rb_node **p;
- struct rb_node *parent;
- int err = 0;
- u64 last_start;
- u64 last_end;
- bool first_iteration = true;
-
- btrfs_debug_check_extent_io_range(tree, start, end);
- trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
- clear_bits);
-
-again:
- if (!prealloc) {
- /*
- * Best effort, don't worry if extent state allocation fails
- * here for the first iteration. We might have a cached state
- * that matches exactly the target range, in which case no
- * extent state allocations are needed. We'll only know this
- * after locking the tree.
- */
- prealloc = alloc_extent_state(GFP_NOFS);
- if (!prealloc && !first_iteration)
- return -ENOMEM;
- }
-
- spin_lock(&tree->lock);
- if (cached_state && *cached_state) {
- state = *cached_state;
- if (state->start <= start && state->end > start &&
- extent_state_in_tree(state)) {
- node = &state->rb_node;
- goto hit_next;
- }
- }
-
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search_for_insert(tree, start, &p, &parent);
- if (!node) {
- prealloc = alloc_extent_state_atomic(prealloc);
- if (!prealloc) {
- err = -ENOMEM;
- goto out;
- }
- prealloc->start = start;
- prealloc->end = end;
- insert_state_fast(tree, prealloc, p, parent, bits, NULL);
- cache_state(prealloc, cached_state);
- prealloc = NULL;
- goto out;
- }
- state = rb_entry(node, struct extent_state, rb_node);
-hit_next:
- last_start = state->start;
- last_end = state->end;
-
- /*
- * | ---- desired range ---- |
- * | state |
- *
- * Just lock what we found and keep going
- */
- if (state->start == start && state->end <= end) {
- set_state_bits(tree, state, bits, NULL);
- cache_state(state, cached_state);
- state = clear_state_bit(tree, state, clear_bits, 0, NULL);
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- if (start < end && state && state->start == start &&
- !need_resched())
- goto hit_next;
- goto search_again;
- }
-
- /*
- * | ---- desired range ---- |
- * | state |
- * or
- * | ------------- state -------------- |
- *
- * We need to split the extent we found, and may flip bits on
- * second half.
- *
- * If the extent we found extends past our
- * range, we just split and search again. It'll get split
- * again the next time though.
- *
- * If the extent we found is inside our range, we set the
- * desired bit on it.
- */
- if (state->start < start) {
- prealloc = alloc_extent_state_atomic(prealloc);
- if (!prealloc) {
- err = -ENOMEM;
- goto out;
- }
- err = split_state(tree, state, prealloc, start);
- if (err)
- extent_io_tree_panic(tree, err);
- prealloc = NULL;
- if (err)
- goto out;
- if (state->end <= end) {
- set_state_bits(tree, state, bits, NULL);
- cache_state(state, cached_state);
- state = clear_state_bit(tree, state, clear_bits, 0, NULL);
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- if (start < end && state && state->start == start &&
- !need_resched())
- goto hit_next;
- }
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state | or | state |
- *
- * There's a hole, we need to insert something in it and
- * ignore the extent we found.
- */
- if (state->start > start) {
- u64 this_end;
- if (end < last_start)
- this_end = end;
- else
- this_end = last_start - 1;
-
- prealloc = alloc_extent_state_atomic(prealloc);
- if (!prealloc) {
- err = -ENOMEM;
- goto out;
- }
-
- /*
- * Avoid to free 'prealloc' if it can be merged with
- * the later extent.
- */
- prealloc->start = start;
- prealloc->end = this_end;
- err = insert_state(tree, prealloc, bits, NULL);
- if (err)
- extent_io_tree_panic(tree, err);
- cache_state(prealloc, cached_state);
- prealloc = NULL;
- start = this_end + 1;
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state |
- * We need to split the extent, and set the bit
- * on the first half
- */
- if (state->start <= end && state->end > end) {
- prealloc = alloc_extent_state_atomic(prealloc);
- if (!prealloc) {
- err = -ENOMEM;
- goto out;
- }
-
- err = split_state(tree, state, prealloc, end + 1);
- if (err)
- extent_io_tree_panic(tree, err);
-
- set_state_bits(tree, prealloc, bits, NULL);
- cache_state(prealloc, cached_state);
- clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
- prealloc = NULL;
- goto out;
- }
-
-search_again:
- if (start > end)
- goto out;
- spin_unlock(&tree->lock);
- cond_resched();
- first_iteration = false;
- goto again;
-
-out:
- spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
-
- return err;
-}
-
-/* wrappers around set/clear extent bit */
-int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_changeset *changeset)
-{
- /*
- * We don't support EXTENT_LOCKED yet, as current changeset will
- * record any bits changed, so for EXTENT_LOCKED case, it will
- * either fail with -EEXIST or changeset will record the whole
- * range.
- */
- BUG_ON(bits & EXTENT_LOCKED);
-
- return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
- changeset);
-}
-
-int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits)
-{
- return set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
- GFP_NOWAIT, NULL);
-}
-
-int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int wake, int delete,
- struct extent_state **cached)
-{
- return __clear_extent_bit(tree, start, end, bits, wake, delete,
- cached, GFP_NOFS, NULL);
-}
-
-int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_changeset *changeset)
-{
- /*
- * Don't support EXTENT_LOCKED case, same reason as
- * set_record_extent_bits().
- */
- BUG_ON(bits & EXTENT_LOCKED);
-
- return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
- changeset);
-}
-
-/*
- * either insert or lock state struct between start and end use mask to tell
- * us if waiting is desired.
- */
-int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state)
-{
- int err;
- u64 failed_start;
-
- while (1) {
- err = set_extent_bit(tree, start, end, EXTENT_LOCKED,
- EXTENT_LOCKED, &failed_start,
- cached_state, GFP_NOFS, NULL);
- if (err == -EEXIST) {
- wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
- start = failed_start;
- } else
- break;
- WARN_ON(start > end);
- }
- return err;
-}
-
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
- int err;
- u64 failed_start;
-
- err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
- &failed_start, NULL, GFP_NOFS, NULL);
- if (err == -EEXIST) {
- if (failed_start > start)
- clear_extent_bit(tree, start, failed_start - 1,
- EXTENT_LOCKED, 1, 0, NULL);
- return 0;
- }
- return 1;
}
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
@@ -1554,295 +217,6 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
}
}
-/* find the first state struct with 'bits' set after 'start', and
- * return it. tree->lock must be held. NULL will returned if
- * nothing was found after 'start'
- */
-static struct extent_state *
-find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits)
-{
- struct rb_node *node;
- struct extent_state *state;
-
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search(tree, start);
- if (!node)
- goto out;
-
- while (1) {
- state = rb_entry(node, struct extent_state, rb_node);
- if (state->end >= start && (state->state & bits))
- return state;
-
- node = rb_next(node);
- if (!node)
- break;
- }
-out:
- return NULL;
-}
-
-/*
- * Find the first offset in the io tree with one or more @bits set.
- *
- * Note: If there are multiple bits set in @bits, any of them will match.
- *
- * Return 0 if we find something, and update @start_ret and @end_ret.
- * Return 1 if we found nothing.
- */
-int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits,
- struct extent_state **cached_state)
-{
- struct extent_state *state;
- int ret = 1;
-
- spin_lock(&tree->lock);
- if (cached_state && *cached_state) {
- state = *cached_state;
- if (state->end == start - 1 && extent_state_in_tree(state)) {
- while ((state = next_state(state)) != NULL) {
- if (state->state & bits)
- goto got_it;
- }
- free_extent_state(*cached_state);
- *cached_state = NULL;
- goto out;
- }
- free_extent_state(*cached_state);
- *cached_state = NULL;
- }
-
- state = find_first_extent_bit_state(tree, start, bits);
-got_it:
- if (state) {
- cache_state_if_flags(state, cached_state, 0);
- *start_ret = state->start;
- *end_ret = state->end;
- ret = 0;
- }
-out:
- spin_unlock(&tree->lock);
- return ret;
-}
-
-/**
- * Find a contiguous area of bits
- *
- * @tree: io tree to check
- * @start: offset to start the search from
- * @start_ret: the first offset we found with the bits set
- * @end_ret: the final contiguous range of the bits that were set
- * @bits: bits to look for
- *
- * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
- * to set bits appropriately, and then merge them again. During this time it
- * will drop the tree->lock, so use this helper if you want to find the actual
- * contiguous area for given bits. We will search to the first bit we find, and
- * then walk down the tree until we find a non-contiguous area. The area
- * returned will be the full contiguous area with the bits set.
- */
-int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits)
-{
- struct extent_state *state;
- int ret = 1;
-
- spin_lock(&tree->lock);
- state = find_first_extent_bit_state(tree, start, bits);
- if (state) {
- *start_ret = state->start;
- *end_ret = state->end;
- while ((state = next_state(state)) != NULL) {
- if (state->start > (*end_ret + 1))
- break;
- *end_ret = state->end;
- }
- ret = 0;
- }
- spin_unlock(&tree->lock);
- return ret;
-}
-
-/**
- * Find the first range that has @bits not set. This range could start before
- * @start.
- *
- * @tree: the tree to search
- * @start: offset at/after which the found extent should start
- * @start_ret: records the beginning of the range
- * @end_ret: records the end of the range (inclusive)
- * @bits: the set of bits which must be unset
- *
- * Since unallocated range is also considered one which doesn't have the bits
- * set it's possible that @end_ret contains -1, this happens in case the range
- * spans (last_range_end, end of device]. In this case it's up to the caller to
- * trim @end_ret to the appropriate size.
- */
-void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits)
-{
- struct extent_state *state;
- struct rb_node *node, *prev = NULL, *next;
-
- spin_lock(&tree->lock);
-
- /* Find first extent with bits cleared */
- while (1) {
- node = tree_search_prev_next(tree, start, &prev, &next);
- if (!node && !next && !prev) {
- /*
- * Tree is completely empty, send full range and let
- * caller deal with it
- */
- *start_ret = 0;
- *end_ret = -1;
- goto out;
- } else if (!node && !next) {
- /*
- * We are past the last allocated chunk, set start at
- * the end of the last extent.
- */
- state = rb_entry(prev, struct extent_state, rb_node);
- *start_ret = state->end + 1;
- *end_ret = -1;
- goto out;
- } else if (!node) {
- node = next;
- }
- /*
- * At this point 'node' either contains 'start' or start is
- * before 'node'
- */
- state = rb_entry(node, struct extent_state, rb_node);
-
- if (in_range(start, state->start, state->end - state->start + 1)) {
- if (state->state & bits) {
- /*
- * |--range with bits sets--|
- * |
- * start
- */
- start = state->end + 1;
- } else {
- /*
- * 'start' falls within a range that doesn't
- * have the bits set, so take its start as
- * the beginning of the desired range
- *
- * |--range with bits cleared----|
- * |
- * start
- */
- *start_ret = state->start;
- break;
- }
- } else {
- /*
- * |---prev range---|---hole/unset---|---node range---|
- * |
- * start
- *
- * or
- *
- * |---hole/unset--||--first node--|
- * 0 |
- * start
- */
- if (prev) {
- state = rb_entry(prev, struct extent_state,
- rb_node);
- *start_ret = state->end + 1;
- } else {
- *start_ret = 0;
- }
- break;
- }
- }
-
- /*
- * Find the longest stretch from start until an entry which has the
- * bits set
- */
- while (1) {
- state = rb_entry(node, struct extent_state, rb_node);
- if (state->end >= start && !(state->state & bits)) {
- *end_ret = state->end;
- } else {
- *end_ret = state->start - 1;
- break;
- }
-
- node = rb_next(node);
- if (!node)
- break;
- }
-out:
- spin_unlock(&tree->lock);
-}
-
-/*
- * find a contiguous range of bytes in the file marked as delalloc, not
- * more than 'max_bytes'. start and end are used to return the range,
- *
- * true is returned if we find something, false if nothing was in the tree
- */
-bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
- u64 *end, u64 max_bytes,
- struct extent_state **cached_state)
-{
- struct rb_node *node;
- struct extent_state *state;
- u64 cur_start = *start;
- bool found = false;
- u64 total_bytes = 0;
-
- spin_lock(&tree->lock);
-
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search(tree, cur_start);
- if (!node) {
- *end = (u64)-1;
- goto out;
- }
-
- while (1) {
- state = rb_entry(node, struct extent_state, rb_node);
- if (found && (state->start != cur_start ||
- (state->state & EXTENT_BOUNDARY))) {
- goto out;
- }
- if (!(state->state & EXTENT_DELALLOC)) {
- if (!found)
- *end = state->end;
- goto out;
- }
- if (!found) {
- *start = state->start;
- *cached_state = state;
- refcount_inc(&state->refs);
- }
- found = true;
- *end = state->end;
- cur_start = state->end + 1;
- node = rb_next(node);
- total_bytes += state->end - state->start + 1;
- if (total_bytes >= max_bytes)
- break;
- if (!node)
- break;
- }
-out:
- spin_unlock(&tree->lock);
- return found;
-}
-
/*
* Process one page for __process_pages_contig().
*
@@ -1900,9 +274,8 @@ static int __process_pages_contig(struct address_space *mapping,
pgoff_t start_index = start >> PAGE_SHIFT;
pgoff_t end_index = end >> PAGE_SHIFT;
pgoff_t index = start_index;
- unsigned long nr_pages = end_index - start_index + 1;
unsigned long pages_processed = 0;
- struct page *pages[16];
+ struct folio_batch fbatch;
int err = 0;
int i;
@@ -1911,16 +284,17 @@ static int __process_pages_contig(struct address_space *mapping,
ASSERT(processed_end && *processed_end == start);
}
- if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
+ if ((page_ops & PAGE_SET_ERROR) && start_index <= end_index)
mapping_set_error(mapping, -EIO);
- while (nr_pages > 0) {
- int found_pages;
+ folio_batch_init(&fbatch);
+ while (index <= end_index) {
+ int found_folios;
+
+ found_folios = filemap_get_folios_contig(mapping, &index,
+ end_index, &fbatch);
- found_pages = find_get_pages_contig(mapping, index,
- min_t(unsigned long,
- nr_pages, ARRAY_SIZE(pages)), pages);
- if (found_pages == 0) {
+ if (found_folios == 0) {
/*
* Only if we're going to lock these pages, we can find
* nothing at @index.
@@ -1930,23 +304,20 @@ static int __process_pages_contig(struct address_space *mapping,
goto out;
}
- for (i = 0; i < found_pages; i++) {
+ for (i = 0; i < found_folios; i++) {
int process_ret;
-
+ struct folio *folio = fbatch.folios[i];
process_ret = process_one_page(fs_info, mapping,
- pages[i], locked_page, page_ops,
+ &folio->page, locked_page, page_ops,
start, end);
if (process_ret < 0) {
- for (; i < found_pages; i++)
- put_page(pages[i]);
err = -EAGAIN;
+ folio_batch_release(&fbatch);
goto out;
}
- put_page(pages[i]);
- pages_processed++;
+ pages_processed += folio_nr_pages(folio);
}
- nr_pages -= found_pages;
- index += found_pages;
+ folio_batch_release(&fbatch);
cond_resched();
}
out:
@@ -2094,14 +465,14 @@ again:
}
/* step three, lock the state bits for the whole range */
- lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
+ lock_extent(tree, delalloc_start, delalloc_end, &cached_state);
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
EXTENT_DELALLOC, 1, cached_state);
if (!ret) {
- unlock_extent_cached(tree, delalloc_start, delalloc_end,
- &cached_state);
+ unlock_extent(tree, delalloc_start, delalloc_end,
+ &cached_state);
__unlock_for_delalloc(inode, locked_page,
delalloc_start, delalloc_end);
cond_resched();
@@ -2118,324 +489,46 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
u32 clear_bits, unsigned long page_ops)
{
- clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
+ clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL);
__process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
start, end, page_ops, NULL);
}
-/*
- * count the number of bytes in the tree that have a given bit(s)
- * set. This can be fairly slow, except for EXTENT_DIRTY which is
- * cached. The total number found is returned.
- */
-u64 count_range_bits(struct extent_io_tree *tree,
- u64 *start, u64 search_end, u64 max_bytes,
- u32 bits, int contig)
+static int insert_failrec(struct btrfs_inode *inode,
+ struct io_failure_record *failrec)
{
- struct rb_node *node;
- struct extent_state *state;
- u64 cur_start = *start;
- u64 total_bytes = 0;
- u64 last = 0;
- int found = 0;
+ struct rb_node *exist;
- if (WARN_ON(search_end <= cur_start))
- return 0;
+ spin_lock(&inode->io_failure_lock);
+ exist = rb_simple_insert(&inode->io_failure_tree, failrec->bytenr,
+ &failrec->rb_node);
+ spin_unlock(&inode->io_failure_lock);
- spin_lock(&tree->lock);
- if (cur_start == 0 && bits == EXTENT_DIRTY) {
- total_bytes = tree->dirty_bytes;
- goto out;
- }
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search(tree, cur_start);
- if (!node)
- goto out;
-
- while (1) {
- state = rb_entry(node, struct extent_state, rb_node);
- if (state->start > search_end)
- break;
- if (contig && found && state->start > last + 1)
- break;
- if (state->end >= cur_start && (state->state & bits) == bits) {
- total_bytes += min(search_end, state->end) + 1 -
- max(cur_start, state->start);
- if (total_bytes >= max_bytes)
- break;
- if (!found) {
- *start = max(cur_start, state->start);
- found = 1;
- }
- last = state->end;
- } else if (contig && found) {
- break;
- }
- node = rb_next(node);
- if (!node)
- break;
- }
-out:
- spin_unlock(&tree->lock);
- return total_bytes;
+ return (exist == NULL) ? 0 : -EEXIST;
}
-/*
- * set the private field for a given byte offset in the tree. If there isn't
- * an extent_state there already, this does nothing.
- */
-int set_state_failrec(struct extent_io_tree *tree, u64 start,
- struct io_failure_record *failrec)
+static struct io_failure_record *get_failrec(struct btrfs_inode *inode, u64 start)
{
struct rb_node *node;
- struct extent_state *state;
- int ret = 0;
+ struct io_failure_record *failrec = ERR_PTR(-ENOENT);
- spin_lock(&tree->lock);
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search(tree, start);
- if (!node) {
- ret = -ENOENT;
- goto out;
- }
- state = rb_entry(node, struct extent_state, rb_node);
- if (state->start != start) {
- ret = -ENOENT;
- goto out;
- }
- state->failrec = failrec;
-out:
- spin_unlock(&tree->lock);
- return ret;
-}
-
-struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start)
-{
- struct rb_node *node;
- struct extent_state *state;
- struct io_failure_record *failrec;
-
- spin_lock(&tree->lock);
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search(tree, start);
- if (!node) {
- failrec = ERR_PTR(-ENOENT);
- goto out;
- }
- state = rb_entry(node, struct extent_state, rb_node);
- if (state->start != start) {
- failrec = ERR_PTR(-ENOENT);
- goto out;
- }
-
- failrec = state->failrec;
-out:
- spin_unlock(&tree->lock);
+ spin_lock(&inode->io_failure_lock);
+ node = rb_simple_search(&inode->io_failure_tree, start);
+ if (node)
+ failrec = rb_entry(node, struct io_failure_record, rb_node);
+ spin_unlock(&inode->io_failure_lock);
return failrec;
}
-/*
- * searches a range in the state tree for a given mask.
- * If 'filled' == 1, this returns 1 only if every extent in the tree
- * has the bits set. Otherwise, 1 is returned if any bit in the
- * range is found set.
- */
-int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int filled, struct extent_state *cached)
+static void free_io_failure(struct btrfs_inode *inode,
+ struct io_failure_record *rec)
{
- struct extent_state *state = NULL;
- struct rb_node *node;
- int bitset = 0;
-
- spin_lock(&tree->lock);
- if (cached && extent_state_in_tree(cached) && cached->start <= start &&
- cached->end > start)
- node = &cached->rb_node;
- else
- node = tree_search(tree, start);
- while (node && start <= end) {
- state = rb_entry(node, struct extent_state, rb_node);
-
- if (filled && state->start > start) {
- bitset = 0;
- break;
- }
-
- if (state->start > end)
- break;
-
- if (state->state & bits) {
- bitset = 1;
- if (!filled)
- break;
- } else if (filled) {
- bitset = 0;
- break;
- }
-
- if (state->end == (u64)-1)
- break;
-
- start = state->end + 1;
- if (start > end)
- break;
- node = rb_next(node);
- if (!node) {
- if (filled)
- bitset = 0;
- break;
- }
- }
- spin_unlock(&tree->lock);
- return bitset;
-}
-
-int free_io_failure(struct extent_io_tree *failure_tree,
- struct extent_io_tree *io_tree,
- struct io_failure_record *rec)
-{
- int ret;
- int err = 0;
-
- set_state_failrec(failure_tree, rec->start, NULL);
- ret = clear_extent_bits(failure_tree, rec->start,
- rec->start + rec->len - 1,
- EXTENT_LOCKED | EXTENT_DIRTY);
- if (ret)
- err = ret;
-
- ret = clear_extent_bits(io_tree, rec->start,
- rec->start + rec->len - 1,
- EXTENT_DAMAGED);
- if (ret && !err)
- err = ret;
+ spin_lock(&inode->io_failure_lock);
+ rb_erase(&rec->rb_node, &inode->io_failure_tree);
+ spin_unlock(&inode->io_failure_lock);
kfree(rec);
- return err;
-}
-
-/*
- * this bypasses the standard btrfs submit functions deliberately, as
- * the standard behavior is to write all copies in a raid setup. here we only
- * want to write the one bad copy. so we do the mapping for ourselves and issue
- * submit_bio directly.
- * to avoid any synchronization issues, wait for the data after writing, which
- * actually prevents the read that triggered the error from finishing.
- * currently, there can be no more than two copies of every data bit. thus,
- * exactly one rewrite is required.
- */
-static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num)
-{
- struct btrfs_device *dev;
- struct bio_vec bvec;
- struct bio bio;
- u64 map_length = 0;
- u64 sector;
- struct btrfs_io_context *bioc = NULL;
- int ret = 0;
-
- ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
- BUG_ON(!mirror_num);
-
- if (btrfs_repair_one_zone(fs_info, logical))
- return 0;
-
- map_length = length;
-
- /*
- * Avoid races with device replace and make sure our bioc has devices
- * associated to its stripes that don't go away while we are doing the
- * read repair operation.
- */
- btrfs_bio_counter_inc_blocked(fs_info);
- if (btrfs_is_parity_mirror(fs_info, logical, length)) {
- /*
- * Note that we don't use BTRFS_MAP_WRITE because it's supposed
- * to update all raid stripes, but here we just want to correct
- * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
- * stripe's dev and sector.
- */
- ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
- &map_length, &bioc, 0);
- if (ret)
- goto out_counter_dec;
- ASSERT(bioc->mirror_num == 1);
- } else {
- ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
- &map_length, &bioc, mirror_num);
- if (ret)
- goto out_counter_dec;
- BUG_ON(mirror_num != bioc->mirror_num);
- }
-
- sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
- dev = bioc->stripes[bioc->mirror_num - 1].dev;
- btrfs_put_bioc(bioc);
-
- if (!dev || !dev->bdev ||
- !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
- ret = -EIO;
- goto out_counter_dec;
- }
-
- bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
- bio.bi_iter.bi_sector = sector;
- __bio_add_page(&bio, page, length, pg_offset);
-
- btrfsic_check_bio(&bio);
- ret = submit_bio_wait(&bio);
- if (ret) {
- /* try to remap that extent elsewhere? */
- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
- goto out_bio_uninit;
- }
-
- btrfs_info_rl_in_rcu(fs_info,
- "read error corrected: ino %llu off %llu (dev %s sector %llu)",
- ino, start,
- rcu_str_deref(dev->name), sector);
- ret = 0;
-
-out_bio_uninit:
- bio_uninit(&bio);
-out_counter_dec:
- btrfs_bio_counter_dec(fs_info);
- return ret;
-}
-
-int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
-{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- u64 start = eb->start;
- int i, num_pages = num_extent_pages(eb);
- int ret = 0;
-
- if (sb_rdonly(fs_info->sb))
- return -EROFS;
-
- for (i = 0; i < num_pages; i++) {
- struct page *p = eb->pages[i];
-
- ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
- start - page_offset(p), mirror_num);
- if (ret)
- break;
- start += PAGE_SIZE;
- }
-
- return ret;
}
static int next_mirror(const struct io_failure_record *failrec, int cur_mirror)
@@ -2456,24 +549,18 @@ static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror)
* each time an IO finishes, we do a fast check in the IO failure tree
* to see if we need to process or clean up an io_failure_record
*/
-int clean_io_failure(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *failure_tree,
- struct extent_io_tree *io_tree, u64 start,
- struct page *page, u64 ino, unsigned int pg_offset)
+int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start,
+ struct page *page, unsigned int pg_offset)
{
- u64 private;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ u64 ino = btrfs_ino(inode);
+ u64 locked_start, locked_end;
struct io_failure_record *failrec;
- struct extent_state *state;
int mirror;
int ret;
- private = 0;
- ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
- EXTENT_DIRTY, 0);
- if (!ret)
- return 0;
-
- failrec = get_state_failrec(failure_tree, start);
+ failrec = get_failrec(inode, start);
if (IS_ERR(failrec))
return 0;
@@ -2482,25 +569,21 @@ int clean_io_failure(struct btrfs_fs_info *fs_info,
if (sb_rdonly(fs_info->sb))
goto out;
- spin_lock(&io_tree->lock);
- state = find_first_extent_bit_state(io_tree,
- failrec->start,
- EXTENT_LOCKED);
- spin_unlock(&io_tree->lock);
-
- if (!state || state->start > failrec->start ||
- state->end < failrec->start + failrec->len - 1)
+ ret = find_first_extent_bit(io_tree, failrec->bytenr, &locked_start,
+ &locked_end, EXTENT_LOCKED, NULL);
+ if (ret || locked_start > failrec->bytenr ||
+ locked_end < failrec->bytenr + failrec->len - 1)
goto out;
mirror = failrec->this_mirror;
do {
mirror = prev_mirror(failrec, mirror);
- repair_io_failure(fs_info, ino, start, failrec->len,
+ btrfs_repair_io_failure(fs_info, ino, start, failrec->len,
failrec->logical, page, pg_offset, mirror);
} while (mirror != failrec->failed_mirror);
out:
- free_io_failure(failure_tree, io_tree, failrec);
+ free_io_failure(inode, failrec);
return 0;
}
@@ -2512,30 +595,26 @@ out:
*/
void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
{
- struct extent_io_tree *failure_tree = &inode->io_failure_tree;
struct io_failure_record *failrec;
- struct extent_state *state, *next;
+ struct rb_node *node, *next;
- if (RB_EMPTY_ROOT(&failure_tree->state))
+ if (RB_EMPTY_ROOT(&inode->io_failure_tree))
return;
- spin_lock(&failure_tree->lock);
- state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
- while (state) {
- if (state->start > end)
+ spin_lock(&inode->io_failure_lock);
+ node = rb_simple_search_first(&inode->io_failure_tree, start);
+ while (node) {
+ failrec = rb_entry(node, struct io_failure_record, rb_node);
+ if (failrec->bytenr > end)
break;
- ASSERT(state->end <= end);
-
- next = next_state(state);
-
- failrec = state->failrec;
- free_extent_state(state);
+ next = rb_next(node);
+ rb_erase(&failrec->rb_node, &inode->io_failure_tree);
kfree(failrec);
- state = next;
+ node = next;
}
- spin_unlock(&failure_tree->lock);
+ spin_unlock(&inode->io_failure_lock);
}
static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
@@ -2545,16 +624,14 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 start = bbio->file_offset + bio_offset;
struct io_failure_record *failrec;
- struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
const u32 sectorsize = fs_info->sectorsize;
int ret;
- failrec = get_state_failrec(failure_tree, start);
+ failrec = get_failrec(BTRFS_I(inode), start);
if (!IS_ERR(failrec)) {
btrfs_debug(fs_info,
"Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu",
- failrec->logical, failrec->start, failrec->len);
+ failrec->logical, failrec->bytenr, failrec->len);
/*
* when data can be on disk more than twice, add to failrec here
* (e.g. with a list for failed_mirror) to make
@@ -2569,7 +646,8 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
if (!failrec)
return ERR_PTR(-ENOMEM);
- failrec->start = start;
+ RB_CLEAR_NODE(&failrec->rb_node);
+ failrec->bytenr = start;
failrec->len = sectorsize;
failrec->failed_mirror = bbio->mirror_num;
failrec->this_mirror = bbio->mirror_num;
@@ -2594,14 +672,8 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
}
/* Set the bits in the private failure tree */
- ret = set_extent_bits(failure_tree, start, start + sectorsize - 1,
- EXTENT_LOCKED | EXTENT_DIRTY);
- if (ret >= 0) {
- ret = set_state_failrec(failure_tree, start, failrec);
- /* Set the bits in the inode's tree */
- ret = set_extent_bits(tree, start, start + sectorsize - 1,
- EXTENT_DAMAGED);
- } else if (ret < 0) {
+ ret = insert_failrec(BTRFS_I(inode), failrec);
+ if (ret) {
kfree(failrec);
return ERR_PTR(ret);
}
@@ -2609,15 +681,13 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
return failrec;
}
-int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
+int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio,
u32 bio_offset, struct page *page, unsigned int pgoff,
- submit_bio_hook_t *submit_bio_hook)
+ bool submit_buffered)
{
u64 start = failed_bbio->file_offset + bio_offset;
struct io_failure_record *failrec;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
- struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *failed_bio = &failed_bbio->bio;
const int icsum = bio_offset >> fs_info->sectorsize_bits;
struct bio *repair_bio;
@@ -2628,7 +698,7 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
- failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset);
+ failrec = btrfs_get_io_failure_record(&inode->vfs_inode, failed_bbio, bio_offset);
if (IS_ERR(failrec))
return PTR_ERR(failrec);
@@ -2639,24 +709,22 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
*
* Since we're only doing repair for one sector, we only need to get
* a good copy of the failed sector and if we succeed, we have setup
- * everything for repair_io_failure to do the rest for us.
+ * everything for btrfs_repair_io_failure to do the rest for us.
*/
failrec->this_mirror = next_mirror(failrec, failrec->this_mirror);
if (failrec->this_mirror == failrec->failed_mirror) {
btrfs_debug(fs_info,
"failed to repair num_copies %d this_mirror %d failed_mirror %d",
failrec->num_copies, failrec->this_mirror, failrec->failed_mirror);
- free_io_failure(failure_tree, tree, failrec);
+ free_io_failure(inode, failrec);
return -EIO;
}
- repair_bio = btrfs_bio_alloc(1);
+ repair_bio = btrfs_bio_alloc(1, REQ_OP_READ, failed_bbio->end_io,
+ failed_bbio->private);
repair_bbio = btrfs_bio(repair_bio);
repair_bbio->file_offset = start;
- repair_bio->bi_opf = REQ_OP_READ;
- repair_bio->bi_end_io = failed_bio->bi_end_io;
repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
- repair_bio->bi_private = failed_bio->bi_private;
if (failed_bbio->csum) {
const u32 csum_size = fs_info->csum_size;
@@ -2669,16 +737,21 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
bio_add_page(repair_bio, page, failrec->len, pgoff);
repair_bbio->iter = repair_bio->bi_iter;
- btrfs_debug(btrfs_sb(inode->i_sb),
+ btrfs_debug(fs_info,
"repair read error: submitting new read to mirror %d",
failrec->this_mirror);
/*
- * At this point we have a bio, so any errors from submit_bio_hook()
- * will be handled by the endio on the repair_bio, so we can't return an
+ * At this point we have a bio, so any errors from bio submission will
+ * be handled by the endio on the repair_bio, so we can't return an
* error here.
*/
- submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0);
+ if (submit_buffered)
+ btrfs_submit_data_read_bio(inode, repair_bio,
+ failrec->this_mirror, 0);
+ else
+ btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror);
+
return BLK_STS_OK;
}
@@ -2714,14 +787,9 @@ static void end_sector_io(struct page *page, u64 offset, bool uptodate)
{
struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
const u32 sectorsize = inode->root->fs_info->sectorsize;
- struct extent_state *cached = NULL;
end_page_read(page, uptodate, offset, sectorsize);
- if (uptodate)
- set_extent_uptodate(&inode->io_tree, offset,
- offset + sectorsize - 1, &cached, GFP_ATOMIC);
- unlock_extent_cached_atomic(&inode->io_tree, offset,
- offset + sectorsize - 1, &cached);
+ unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, NULL);
}
static void submit_data_read_repair(struct inode *inode,
@@ -2767,9 +835,9 @@ static void submit_data_read_repair(struct inode *inode,
goto next;
}
- ret = btrfs_repair_one_sector(inode, failed_bbio,
+ ret = btrfs_repair_one_sector(BTRFS_I(inode), failed_bbio,
bio_offset + offset, page, pgoff + offset,
- btrfs_submit_data_read_bio);
+ true);
if (!ret) {
/*
* We have submitted the read repair, the page release
@@ -2823,8 +891,9 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_writepage(struct bio *bio)
+static void end_bio_extent_writepage(struct btrfs_bio *bbio)
{
+ struct bio *bio = &bbio->bio;
int error = blk_status_to_errno(bio->bi_status);
struct bio_vec *bvec;
u64 start;
@@ -2924,11 +993,7 @@ static void endio_readpage_release_extent(struct processed_extent *processed,
* Now we don't have range contiguous to the processed range, release
* the processed range now.
*/
- if (processed->uptodate && tree->track_uptodate)
- set_extent_uptodate(tree, processed->start, processed->end,
- &cached, GFP_ATOMIC);
- unlock_extent_cached_atomic(tree, processed->start, processed->end,
- &cached);
+ unlock_extent(tree, processed->start, processed->end, &cached);
update:
/* Update processed to current range */
@@ -2988,11 +1053,10 @@ static struct extent_buffer *find_extent_buffer_readpage(
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_readpage(struct bio *bio)
+static void end_bio_extent_readpage(struct btrfs_bio *bbio)
{
+ struct bio *bio = &bbio->bio;
struct bio_vec *bvec;
- struct btrfs_bio *bbio = btrfs_bio(bio);
- struct extent_io_tree *tree, *failure_tree;
struct processed_extent processed = { 0 };
/*
* The offset to the beginning of a bio, since one bio can never be
@@ -3019,8 +1083,6 @@ static void end_bio_extent_readpage(struct bio *bio)
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
bio->bi_iter.bi_sector, bio->bi_status,
bbio->mirror_num);
- tree = &BTRFS_I(inode)->io_tree;
- failure_tree = &BTRFS_I(inode)->io_failure_tree;
/*
* We always issue full-sector reads, but if some block in a
@@ -3061,9 +1123,7 @@ static void end_bio_extent_readpage(struct bio *bio)
loff_t i_size = i_size_read(inode);
pgoff_t end_index = i_size >> PAGE_SHIFT;
- clean_io_failure(BTRFS_I(inode)->root->fs_info,
- failure_tree, tree, start, page,
- btrfs_ino(BTRFS_I(inode)), 0);
+ btrfs_clean_io_failure(BTRFS_I(inode), start, page, 0);
/*
* Zero out the remaining part if this range straddles
@@ -3126,7 +1186,7 @@ static void end_bio_extent_readpage(struct bio *bio)
bio_put(bio);
}
-/**
+/*
* Populate every free slot in a provided array with pages.
*
* @nr_pages: number of pages to allocate
@@ -3163,59 +1223,15 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
}
/*
- * Initialize the members up to but not including 'bio'. Use after allocating a
- * new bio by bio_alloc_bioset as it does not initialize the bytes outside of
- * 'bio' because use of __GFP_ZERO is not supported.
- */
-static inline void btrfs_bio_init(struct btrfs_bio *bbio)
-{
- memset(bbio, 0, offsetof(struct btrfs_bio, bio));
-}
-
-/*
- * Allocate a btrfs_io_bio, with @nr_iovecs as maximum number of iovecs.
+ * Attempt to add a page to bio.
*
- * The bio allocation is backed by bioset and does not fail.
- */
-struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
-{
- struct bio *bio;
-
- ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS);
- bio = bio_alloc_bioset(NULL, nr_iovecs, 0, GFP_NOFS, &btrfs_bioset);
- btrfs_bio_init(btrfs_bio(bio));
- return bio;
-}
-
-struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
-{
- struct bio *bio;
- struct btrfs_bio *bbio;
-
- ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
-
- /* this will never fail when it's backed by a bioset */
- bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
- ASSERT(bio);
-
- bbio = btrfs_bio(bio);
- btrfs_bio_init(bbio);
-
- bio_trim(bio, offset >> 9, size >> 9);
- bbio->iter = bio->bi_iter;
- return bio;
-}
-
-/**
- * Attempt to add a page to bio
- *
- * @bio_ctrl: record both the bio, and its bio_flags
- * @page: page to add to the bio
- * @disk_bytenr: offset of the new bio or to check whether we are adding
- * a contiguous page to the previous one
- * @size: portion of page that we want to write
- * @pg_offset: starting offset in the page
- * @compress_type: compression type of the current bio to see if we can merge them
+ * @bio_ctrl: record both the bio, and its bio_flags
+ * @page: page to add to the bio
+ * @disk_bytenr: offset of the new bio or to check whether we are adding
+ * a contiguous page to the previous one
+ * @size: portion of page that we want to write
+ * @pg_offset: starting offset in the page
+ * @compress_type: compression type of the current bio to see if we can merge them
*
* Attempt to add a page to bio considering stripe alignment etc.
*
@@ -3351,7 +1367,6 @@ static int alloc_new_bio(struct btrfs_inode *inode,
struct btrfs_bio_ctrl *bio_ctrl,
struct writeback_control *wbc,
blk_opf_t opf,
- bio_end_io_t end_io_func,
u64 disk_bytenr, u32 offset, u64 file_offset,
enum btrfs_compression_type compress_type)
{
@@ -3359,7 +1374,9 @@ static int alloc_new_bio(struct btrfs_inode *inode,
struct bio *bio;
int ret;
- bio = btrfs_bio_alloc(BIO_MAX_VECS);
+ ASSERT(bio_ctrl->end_io_func);
+
+ bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, bio_ctrl->end_io_func, NULL);
/*
* For compressed page range, its disk_bytenr is always @disk_bytenr
* passed in, no matter if we have added any range into previous bio.
@@ -3370,8 +1387,6 @@ static int alloc_new_bio(struct btrfs_inode *inode,
bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
bio_ctrl->bio = bio;
bio_ctrl->compress_type = compress_type;
- bio->bi_end_io = end_io_func;
- bio->bi_opf = opf;
ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
if (ret < 0)
goto error;
@@ -3410,31 +1425,30 @@ static int alloc_new_bio(struct btrfs_inode *inode,
return 0;
error:
bio_ctrl->bio = NULL;
- bio->bi_status = errno_to_blk_status(ret);
- bio_endio(bio);
+ btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
return ret;
}
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
* @wbc: optional writeback control for io accounting
- * @page: page to add to the bio
* @disk_bytenr: logical bytenr where the write will be
+ * @page: page to add to the bio
* @size: portion of page that we want to write to
* @pg_offset: offset of the new bio or to check whether we are adding
* a contiguous page to the previous one
- * @bio_ret: must be valid pointer, newly allocated bio will be stored there
- * @end_io_func: end_io callback for new bio
- * @mirror_num: desired mirror to read/write
- * @prev_bio_flags: flags of previous bio to see if we can merge the current one
* @compress_type: compress type for current bio
+ *
+ * The will either add the page into the existing @bio_ctrl->bio, or allocate a
+ * new one in @bio_ctrl->bio.
+ * The mirror number for this IO should already be initizlied in
+ * @bio_ctrl->mirror_num.
*/
static int submit_extent_page(blk_opf_t opf,
struct writeback_control *wbc,
struct btrfs_bio_ctrl *bio_ctrl,
- struct page *page, u64 disk_bytenr,
+ u64 disk_bytenr, struct page *page,
size_t size, unsigned long pg_offset,
- bio_end_io_t end_io_func,
enum btrfs_compression_type compress_type,
bool force_bio_submit)
{
@@ -3446,6 +1460,9 @@ static int submit_extent_page(blk_opf_t opf,
ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
pg_offset + size <= PAGE_SIZE);
+
+ ASSERT(bio_ctrl->end_io_func);
+
if (force_bio_submit)
submit_one_bio(bio_ctrl);
@@ -3456,7 +1473,7 @@ static int submit_extent_page(blk_opf_t opf,
/* Allocate new bio if needed */
if (!bio_ctrl->bio) {
ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
- end_io_func, disk_bytenr, offset,
+ disk_bytenr, offset,
page_offset(page) + cur,
compress_type);
if (ret < 0)
@@ -3613,7 +1630,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
u64 extent_offset;
u64 last_byte = i_size_read(inode);
u64 block_start;
- u64 cur_end;
struct extent_map *em;
int ret = 0;
size_t pg_offset = 0;
@@ -3623,7 +1639,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
ret = set_page_extent_mapped(page);
if (ret < 0) {
- unlock_extent(tree, start, end);
+ unlock_extent(tree, start, end, NULL);
btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
unlock_page(page);
goto out;
@@ -3637,6 +1653,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
memzero_page(page, zero_offset, iosize);
}
}
+ bio_ctrl->end_io_func = end_bio_extent_readpage;
begin_page_read(fs_info, page);
while (cur <= end) {
unsigned long this_bio_flag = 0;
@@ -3645,21 +1662,16 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
if (cur >= last_byte) {
- struct extent_state *cached = NULL;
-
iosize = PAGE_SIZE - pg_offset;
memzero_page(page, pg_offset, iosize);
- set_extent_uptodate(tree, cur, cur + iosize - 1,
- &cached, GFP_NOFS);
- unlock_extent_cached(tree, cur,
- cur + iosize - 1, &cached);
+ unlock_extent(tree, cur, cur + iosize - 1, NULL);
end_page_read(page, true, cur, iosize);
break;
}
em = __get_extent_map(inode, page, pg_offset, cur,
end - cur + 1, em_cached);
if (IS_ERR(em)) {
- unlock_extent(tree, cur, end);
+ unlock_extent(tree, cur, end, NULL);
end_page_read(page, false, cur, end + 1 - cur);
ret = PTR_ERR(em);
break;
@@ -3672,7 +1684,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
this_bio_flag = em->compress_type;
iosize = min(extent_map_end(em) - cur, end - cur + 1);
- cur_end = min(extent_map_end(em) - 1, end);
iosize = ALIGN(iosize, blocksize);
if (this_bio_flag != BTRFS_COMPRESS_NONE)
disk_bytenr = em->block_start;
@@ -3729,49 +1740,33 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
/* we've found a hole, just zero and go on */
if (block_start == EXTENT_MAP_HOLE) {
- struct extent_state *cached = NULL;
-
memzero_page(page, pg_offset, iosize);
- set_extent_uptodate(tree, cur, cur + iosize - 1,
- &cached, GFP_NOFS);
- unlock_extent_cached(tree, cur,
- cur + iosize - 1, &cached);
+ unlock_extent(tree, cur, cur + iosize - 1, NULL);
end_page_read(page, true, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
continue;
}
/* the get_extent function already copied into the page */
- if (test_range_bit(tree, cur, cur_end,
- EXTENT_UPTODATE, 1, NULL)) {
- unlock_extent(tree, cur, cur + iosize - 1);
- end_page_read(page, true, cur, iosize);
- cur = cur + iosize;
- pg_offset += iosize;
- continue;
- }
- /* we have an inline extent but it didn't get marked up
- * to date. Error out
- */
if (block_start == EXTENT_MAP_INLINE) {
- unlock_extent(tree, cur, cur + iosize - 1);
- end_page_read(page, false, cur, iosize);
+ unlock_extent(tree, cur, cur + iosize - 1, NULL);
+ end_page_read(page, true, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
continue;
}
ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
- bio_ctrl, page, disk_bytenr, iosize,
- pg_offset, end_bio_extent_readpage,
- this_bio_flag, force_bio_submit);
+ bio_ctrl, disk_bytenr, page, iosize,
+ pg_offset, this_bio_flag,
+ force_bio_submit);
if (ret) {
/*
* We have to unlock the remaining range, or the page
* will never be unlocked.
*/
- unlock_extent(tree, cur, end);
+ unlock_extent(tree, cur, end, NULL);
end_page_read(page, false, cur, end + 1 - cur);
goto out;
}
@@ -3952,7 +1947,7 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
struct page *page,
struct writeback_control *wbc,
- struct extent_page_data *epd,
+ struct btrfs_bio_ctrl *bio_ctrl,
loff_t i_size,
int *nr_ret)
{
@@ -3984,6 +1979,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
*/
wbc->nr_to_write--;
+ bio_ctrl->end_io_func = end_bio_extent_writepage;
while (cur <= end) {
u64 disk_bytenr;
u64 em_end;
@@ -4077,10 +2073,9 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
btrfs_page_clear_dirty(fs_info, page, cur, iosize);
ret = submit_extent_page(op | write_flags, wbc,
- &epd->bio_ctrl, page,
- disk_bytenr, iosize,
+ bio_ctrl, disk_bytenr,
+ page, iosize,
cur - page_offset(page),
- end_bio_extent_writepage,
0, false);
if (ret) {
has_error = true;
@@ -4118,7 +2113,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
* Return <0 for error.
*/
static int __extent_writepage(struct page *page, struct writeback_control *wbc,
- struct extent_page_data *epd)
+ struct btrfs_bio_ctrl *bio_ctrl)
{
struct folio *folio = page_folio(page);
struct inode *inode = page->mapping->host;
@@ -4155,7 +2150,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
goto done;
}
- if (!epd->extent_locked) {
+ if (!bio_ctrl->extent_locked) {
ret = writepage_delalloc(BTRFS_I(inode), page, wbc);
if (ret == 1)
return 0;
@@ -4163,7 +2158,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
goto done;
}
- ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
+ ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, bio_ctrl, i_size,
&nr);
if (ret == 1)
return 0;
@@ -4207,9 +2202,9 @@ done:
*/
if (PageError(page))
end_extent_writepage(page, ret, page_start, page_end);
- if (epd->extent_locked) {
+ if (bio_ctrl->extent_locked) {
/*
- * If epd->extent_locked, it's from extent_write_locked_range(),
+ * If bio_ctrl->extent_locked, it's from extent_write_locked_range(),
* the page can either be locked by lock_page() or
* process_one_page().
* Let btrfs_page_unlock_writer() handle both cases.
@@ -4248,7 +2243,7 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb)
* Return <0 if something went wrong, no page is locked.
*/
static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
- struct extent_page_data *epd)
+ struct btrfs_bio_ctrl *bio_ctrl)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
int i, num_pages;
@@ -4256,17 +2251,17 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
int ret = 0;
if (!btrfs_try_tree_write_lock(eb)) {
- submit_write_bio(epd, 0);
+ submit_write_bio(bio_ctrl, 0);
flush = 1;
btrfs_tree_lock(eb);
}
if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
btrfs_tree_unlock(eb);
- if (!epd->sync_io)
+ if (!bio_ctrl->sync_io)
return 0;
if (!flush) {
- submit_write_bio(epd, 0);
+ submit_write_bio(bio_ctrl, 0);
flush = 1;
}
while (1) {
@@ -4313,7 +2308,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
if (!trylock_page(p)) {
if (!flush) {
- submit_write_bio(epd, 0);
+ submit_write_bio(bio_ctrl, 0);
flush = 1;
}
lock_page(p);
@@ -4431,8 +2426,9 @@ static struct extent_buffer *find_extent_buffer_nolock(
* Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
* after all extent buffers in the page has finished their writeback.
*/
-static void end_bio_subpage_eb_writepage(struct bio *bio)
+static void end_bio_subpage_eb_writepage(struct btrfs_bio *bbio)
{
+ struct bio *bio = &bbio->bio;
struct btrfs_fs_info *fs_info;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
@@ -4488,8 +2484,9 @@ static void end_bio_subpage_eb_writepage(struct bio *bio)
bio_put(bio);
}
-static void end_bio_extent_buffer_writepage(struct bio *bio)
+static void end_bio_extent_buffer_writepage(struct btrfs_bio *bbio)
{
+ struct bio *bio = &bbio->bio;
struct bio_vec *bvec;
struct extent_buffer *eb;
int done;
@@ -4532,15 +2529,19 @@ static void prepare_eb_write(struct extent_buffer *eb)
/* Set btree blocks beyond nritems with 0 to avoid stale content */
nritems = btrfs_header_nritems(eb);
if (btrfs_header_level(eb) > 0) {
- end = btrfs_node_key_ptr_offset(nritems);
+ end = btrfs_node_key_ptr_offset(eb, nritems);
memzero_extent_buffer(eb, end, eb->len - end);
} else {
/*
* Leaf:
* header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
*/
- start = btrfs_item_nr_offset(nritems);
- end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
+ start = btrfs_item_nr_offset(eb, nritems);
+ end = btrfs_item_nr_offset(eb, 0);
+ if (nritems == 0)
+ end += BTRFS_LEAF_DATA_SIZE(eb->fs_info);
+ else
+ end += btrfs_item_offset(eb, nritems - 1);
memzero_extent_buffer(eb, start, end - start);
}
}
@@ -4551,7 +2552,7 @@ static void prepare_eb_write(struct extent_buffer *eb)
*/
static int write_one_subpage_eb(struct extent_buffer *eb,
struct writeback_control *wbc,
- struct extent_page_data *epd)
+ struct btrfs_bio_ctrl *bio_ctrl)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct page *page = eb->pages[0];
@@ -4571,10 +2572,11 @@ static int write_one_subpage_eb(struct extent_buffer *eb,
if (no_dirty_ebs)
clear_page_dirty_for_io(page);
+ bio_ctrl->end_io_func = end_bio_subpage_eb_writepage;
+
ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
- &epd->bio_ctrl, page, eb->start, eb->len,
- eb->start - page_offset(page),
- end_bio_subpage_eb_writepage, 0, false);
+ bio_ctrl, eb->start, page, eb->len,
+ eb->start - page_offset(page), 0, false);
if (ret) {
btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
set_btree_ioerr(page, eb);
@@ -4596,7 +2598,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb,
static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
struct writeback_control *wbc,
- struct extent_page_data *epd)
+ struct btrfs_bio_ctrl *bio_ctrl)
{
u64 disk_bytenr = eb->start;
int i, num_pages;
@@ -4605,6 +2607,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
prepare_eb_write(eb);
+ bio_ctrl->end_io_func = end_bio_extent_buffer_writepage;
+
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
struct page *p = eb->pages[i];
@@ -4612,10 +2616,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
clear_page_dirty_for_io(p);
set_page_writeback(p);
ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
- &epd->bio_ctrl, p, disk_bytenr,
- PAGE_SIZE, 0,
- end_bio_extent_buffer_writepage,
- 0, false);
+ bio_ctrl, disk_bytenr, p,
+ PAGE_SIZE, 0, 0, false);
if (ret) {
set_btree_ioerr(p, eb);
if (PageWriteback(p))
@@ -4657,7 +2659,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
*/
static int submit_eb_subpage(struct page *page,
struct writeback_control *wbc,
- struct extent_page_data *epd)
+ struct btrfs_bio_ctrl *bio_ctrl)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
int submitted = 0;
@@ -4710,7 +2712,7 @@ static int submit_eb_subpage(struct page *page,
if (!eb)
continue;
- ret = lock_extent_buffer_for_io(eb, epd);
+ ret = lock_extent_buffer_for_io(eb, bio_ctrl);
if (ret == 0) {
free_extent_buffer(eb);
continue;
@@ -4719,7 +2721,7 @@ static int submit_eb_subpage(struct page *page,
free_extent_buffer(eb);
goto cleanup;
}
- ret = write_one_subpage_eb(eb, wbc, epd);
+ ret = write_one_subpage_eb(eb, wbc, bio_ctrl);
free_extent_buffer(eb);
if (ret < 0)
goto cleanup;
@@ -4729,7 +2731,7 @@ static int submit_eb_subpage(struct page *page,
cleanup:
/* We hit error, end bio for the submitted extent buffers */
- submit_write_bio(epd, ret);
+ submit_write_bio(bio_ctrl, ret);
return ret;
}
@@ -4754,7 +2756,7 @@ cleanup:
* Return <0 for fatal error.
*/
static int submit_eb_page(struct page *page, struct writeback_control *wbc,
- struct extent_page_data *epd,
+ struct btrfs_bio_ctrl *bio_ctrl,
struct extent_buffer **eb_context)
{
struct address_space *mapping = page->mapping;
@@ -4766,7 +2768,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
return 0;
if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
- return submit_eb_subpage(page, wbc, epd);
+ return submit_eb_subpage(page, wbc, bio_ctrl);
spin_lock(&mapping->private_lock);
if (!PagePrivate(page)) {
@@ -4809,7 +2811,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
*eb_context = eb;
- ret = lock_extent_buffer_for_io(eb, epd);
+ ret = lock_extent_buffer_for_io(eb, bio_ctrl);
if (ret <= 0) {
btrfs_revert_meta_write_pointer(cache, eb);
if (cache)
@@ -4824,7 +2826,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
btrfs_schedule_zone_finish_bg(cache, eb);
btrfs_put_block_group(cache);
}
- ret = write_one_eb(eb, wbc, epd);
+ ret = write_one_eb(eb, wbc, bio_ctrl);
free_extent_buffer(eb);
if (ret < 0)
return ret;
@@ -4835,10 +2837,9 @@ int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct extent_buffer *eb_context = NULL;
- struct extent_page_data epd = {
- .bio_ctrl = { 0 },
+ struct btrfs_bio_ctrl bio_ctrl = {
.extent_locked = 0,
- .sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ .sync_io = (wbc->sync_mode == WB_SYNC_ALL),
};
struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
int ret = 0;
@@ -4881,7 +2882,7 @@ retry:
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- ret = submit_eb_page(page, wbc, &epd, &eb_context);
+ ret = submit_eb_page(page, wbc, &bio_ctrl, &eb_context);
if (ret == 0)
continue;
if (ret < 0) {
@@ -4942,18 +2943,18 @@ retry:
ret = 0;
if (!ret && BTRFS_FS_ERROR(fs_info))
ret = -EROFS;
- submit_write_bio(&epd, ret);
+ submit_write_bio(&bio_ctrl, ret);
btrfs_zoned_meta_io_unlock(fs_info);
return ret;
}
-/**
+/*
* Walk the list of dirty pages of the given address space and write all of them.
*
- * @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @epd: holds context for the write, namely the bio
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @bio_ctrl: holds context for the write, namely the bio
*
* If a page is already under I/O, write_cache_pages() skips it, even
* if it's dirty. This is desirable behaviour for memory-cleaning writeback,
@@ -4965,7 +2966,7 @@ retry:
*/
static int extent_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc,
- struct extent_page_data *epd)
+ struct btrfs_bio_ctrl *bio_ctrl)
{
struct inode *inode = mapping->host;
int ret = 0;
@@ -5046,7 +3047,7 @@ retry:
* tmpfs file mapping
*/
if (!trylock_page(page)) {
- submit_write_bio(epd, 0);
+ submit_write_bio(bio_ctrl, 0);
lock_page(page);
}
@@ -5057,7 +3058,7 @@ retry:
if (wbc->sync_mode != WB_SYNC_NONE) {
if (PageWriteback(page))
- submit_write_bio(epd, 0);
+ submit_write_bio(bio_ctrl, 0);
wait_on_page_writeback(page);
}
@@ -5067,7 +3068,7 @@ retry:
continue;
}
- ret = __extent_writepage(page, wbc, epd);
+ ret = __extent_writepage(page, wbc, bio_ctrl);
if (ret < 0) {
done = 1;
break;
@@ -5097,14 +3098,14 @@ retry:
* page in our current bio, and thus deadlock, so flush the
* write bio here.
*/
- submit_write_bio(epd, 0);
+ submit_write_bio(bio_ctrl, 0);
goto retry;
}
if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
mapping->writeback_index = done_index;
- btrfs_add_delayed_iput(inode);
+ btrfs_add_delayed_iput(BTRFS_I(inode));
return ret;
}
@@ -5123,8 +3124,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
u64 cur = start;
unsigned long nr_pages;
const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize;
- struct extent_page_data epd = {
- .bio_ctrl = { 0 },
+ struct btrfs_bio_ctrl bio_ctrl = {
.extent_locked = 1,
.sync_io = 1,
};
@@ -5155,7 +3155,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
ASSERT(PageLocked(page));
ASSERT(PageDirty(page));
clear_page_dirty_for_io(page);
- ret = __extent_writepage(page, &wbc_writepages, &epd);
+ ret = __extent_writepage(page, &wbc_writepages, &bio_ctrl);
ASSERT(ret <= 0);
if (ret < 0) {
found_error = true;
@@ -5165,7 +3165,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
cur = cur_end + 1;
}
- submit_write_bio(&epd, found_error ? ret : 0);
+ submit_write_bio(&bio_ctrl, found_error ? ret : 0);
wbc_detach_inode(&wbc_writepages);
if (found_error)
@@ -5178,10 +3178,9 @@ int extent_writepages(struct address_space *mapping,
{
struct inode *inode = mapping->host;
int ret = 0;
- struct extent_page_data epd = {
- .bio_ctrl = { 0 },
+ struct btrfs_bio_ctrl bio_ctrl = {
.extent_locked = 0,
- .sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ .sync_io = (wbc->sync_mode == WB_SYNC_ALL),
};
/*
@@ -5189,8 +3188,8 @@ int extent_writepages(struct address_space *mapping,
* protect the write pointer updates.
*/
btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
- ret = extent_write_cache_pages(mapping, wbc, &epd);
- submit_write_bio(&epd, ret);
+ ret = extent_write_cache_pages(mapping, wbc, &bio_ctrl);
+ submit_write_bio(&bio_ctrl, ret);
btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
return ret;
}
@@ -5236,7 +3235,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
if (start > end)
return 0;
- lock_extent_bits(tree, start, end, &cached_state);
+ lock_extent(tree, start, end, &cached_state);
folio_wait_writeback(folio);
/*
@@ -5244,7 +3243,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
* so here we only need to unlock the extent range to free any
* existing extent state.
*/
- unlock_extent_cached(tree, start, end, &cached_state);
+ unlock_extent(tree, start, end, &cached_state);
return 0;
}
@@ -5263,15 +3262,17 @@ static int try_release_extent_state(struct extent_io_tree *tree,
if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
ret = 0;
} else {
+ u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM |
+ EXTENT_DELALLOC_NEW | EXTENT_CTLBITS);
+
/*
* At this point we can safely clear everything except the
* locked bit, the nodatasum bit and the delalloc new bit.
* The delalloc new bit will be cleared by ordered extent
* completion.
*/
- ret = __clear_extent_bit(tree, start, end,
- ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW),
- 0, 0, NULL, mask, NULL);
+ ret = __clear_extent_bit(tree, start, end, clear_bits, NULL,
+ mask, NULL);
/* if clear_extent_bit failed for enomem reasons,
* we can't allow the release to continue.
@@ -5370,42 +3371,6 @@ next:
}
/*
- * helper function for fiemap, which doesn't want to see any holes.
- * This maps until we find something past 'last'
- */
-static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
- u64 offset, u64 last)
-{
- u64 sectorsize = btrfs_inode_sectorsize(inode);
- struct extent_map *em;
- u64 len;
-
- if (offset >= last)
- return NULL;
-
- while (1) {
- len = last - offset;
- if (len == 0)
- break;
- len = ALIGN(len, sectorsize);
- em = btrfs_get_extent_fiemap(inode, offset, len);
- if (IS_ERR(em))
- return em;
-
- /* if this isn't a hole return it */
- if (em->block_start != EXTENT_MAP_HOLE)
- return em;
-
- /* this is a hole, advance to the next extent */
- offset = extent_map_end(em);
- free_extent_map(em);
- if (offset >= last)
- break;
- }
- return NULL;
-}
-
-/*
* To cache previous fiemap extent
*
* Will be used for merging fiemap extent
@@ -5434,6 +3399,9 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
{
int ret = 0;
+ /* Set at the end of extent_fiemap(). */
+ ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);
+
if (!cache->cached)
goto assign;
@@ -5457,16 +3425,13 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
* So truly compressed (physical size smaller than logical size)
* extents won't get merged with each other
*
- * 3) Share same flags except FIEMAP_EXTENT_LAST
- * So regular extent won't get merged with prealloc extent
+ * 3) Share same flags
*/
if (cache->offset + cache->len == offset &&
cache->phys + cache->len == phys &&
- (cache->flags & ~FIEMAP_EXTENT_LAST) ==
- (flags & ~FIEMAP_EXTENT_LAST)) {
+ cache->flags == flags) {
cache->len += len;
- cache->flags |= flags;
- goto try_submit_last;
+ return 0;
}
/* Not mergeable, need to submit cached one */
@@ -5481,13 +3446,8 @@ assign:
cache->phys = phys;
cache->len = len;
cache->flags = flags;
-try_submit_last:
- if (cache->flags & FIEMAP_EXTENT_LAST) {
- ret = fiemap_fill_next_extent(fieinfo, cache->offset,
- cache->phys, cache->len, cache->flags);
- cache->cached = false;
- }
- return ret;
+
+ return 0;
}
/*
@@ -5517,218 +3477,532 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
return ret;
}
-int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
- u64 start, u64 len)
+static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
{
- int ret = 0;
- u64 off;
- u64 max = start + len;
- u32 flags = 0;
- u32 found_type;
- u64 last;
- u64 last_for_get_extent = 0;
- u64 disko = 0;
- u64 isize = i_size_read(&inode->vfs_inode);
- struct btrfs_key found_key;
- struct extent_map *em = NULL;
- struct extent_state *cached_state = NULL;
- struct btrfs_path *path;
- struct btrfs_root *root = inode->root;
- struct fiemap_cache cache = { 0 };
- struct ulist *roots;
- struct ulist *tmp_ulist;
- int end = 0;
- u64 em_start = 0;
- u64 em_len = 0;
- u64 em_end = 0;
-
- if (len == 0)
- return -EINVAL;
+ struct extent_buffer *clone;
+ struct btrfs_key key;
+ int slot;
+ int ret;
- path = btrfs_alloc_path();
- if (!path)
+ path->slots[0]++;
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
+ return 0;
+
+ ret = btrfs_next_leaf(inode->root, path);
+ if (ret != 0)
+ return ret;
+
+ /*
+ * Don't bother with cloning if there are no more file extent items for
+ * our inode.
+ */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY)
+ return 1;
+
+ /* See the comment at fiemap_search_slot() about why we clone. */
+ clone = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!clone)
return -ENOMEM;
- roots = ulist_alloc(GFP_KERNEL);
- tmp_ulist = ulist_alloc(GFP_KERNEL);
- if (!roots || !tmp_ulist) {
- ret = -ENOMEM;
- goto out_free_ulist;
+ slot = path->slots[0];
+ btrfs_release_path(path);
+ path->nodes[0] = clone;
+ path->slots[0] = slot;
+
+ return 0;
+}
+
+/*
+ * Search for the first file extent item that starts at a given file offset or
+ * the one that starts immediately before that offset.
+ * Returns: 0 on success, < 0 on error, 1 if not found.
+ */
+static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
+ u64 file_offset)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *clone;
+ struct btrfs_key key;
+ int slot;
+ int ret;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = file_offset;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ if (ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret != 0)
+ return ret;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ return 1;
}
/*
- * We can't initialize that to 'start' as this could miss extents due
- * to extent item merging
+ * We clone the leaf and use it during fiemap. This is because while
+ * using the leaf we do expensive things like checking if an extent is
+ * shared, which can take a long time. In order to prevent blocking
+ * other tasks for too long, we use a clone of the leaf. We have locked
+ * the file range in the inode's io tree, so we know none of our file
+ * extent items can change. This way we avoid blocking other tasks that
+ * want to insert items for other inodes in the same leaf or b+tree
+ * rebalance operations (triggered for example when someone is trying
+ * to push items into this leaf when trying to insert an item in a
+ * neighbour leaf).
+ * We also need the private clone because holding a read lock on an
+ * extent buffer of the subvolume's b+tree will make lockdep unhappy
+ * when we call fiemap_fill_next_extent(), because that may cause a page
+ * fault when filling the user space buffer with fiemap data.
*/
- off = 0;
- start = round_down(start, btrfs_inode_sectorsize(inode));
- len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
+ clone = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!clone)
+ return -ENOMEM;
+
+ slot = path->slots[0];
+ btrfs_release_path(path);
+ path->nodes[0] = clone;
+ path->slots[0] = slot;
+
+ return 0;
+}
+
+/*
+ * Process a range which is a hole or a prealloc extent in the inode's subvolume
+ * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
+ * extent. The end offset (@end) is inclusive.
+ */
+static int fiemap_process_hole(struct btrfs_inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache,
+ struct extent_state **delalloc_cached_state,
+ struct btrfs_backref_share_check_ctx *backref_ctx,
+ u64 disk_bytenr, u64 extent_offset,
+ u64 extent_gen,
+ u64 start, u64 end)
+{
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+ u64 cur_offset = start;
+ u64 last_delalloc_end = 0;
+ u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
+ bool checked_extent_shared = false;
+ int ret;
/*
- * lookup the last file extent. We're not using i_size here
- * because there might be preallocation past i_size
+ * There can be no delalloc past i_size, so don't waste time looking for
+ * it beyond i_size.
*/
- ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
- 0);
- if (ret < 0) {
- goto out_free_ulist;
- } else {
- WARN_ON(!ret);
- if (ret == 1)
- ret = 0;
- }
+ while (cur_offset < end && cur_offset < i_size) {
+ u64 delalloc_start;
+ u64 delalloc_end;
+ u64 prealloc_start;
+ u64 prealloc_len = 0;
+ bool delalloc;
+
+ delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
+ delalloc_cached_state,
+ &delalloc_start,
+ &delalloc_end);
+ if (!delalloc)
+ break;
- path->slots[0]--;
- btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
- found_type = found_key.type;
-
- /* No extents, but there might be delalloc bits */
- if (found_key.objectid != btrfs_ino(inode) ||
- found_type != BTRFS_EXTENT_DATA_KEY) {
- /* have to trust i_size as the end */
- last = (u64)-1;
- last_for_get_extent = isize;
- } else {
/*
- * remember the start of the last extent. There are a
- * bunch of different factors that go into the length of the
- * extent, so its much less complex to remember where it started
+ * If this is a prealloc extent we have to report every section
+ * of it that has no delalloc.
*/
- last = found_key.offset;
- last_for_get_extent = last + 1;
+ if (disk_bytenr != 0) {
+ if (last_delalloc_end == 0) {
+ prealloc_start = start;
+ prealloc_len = delalloc_start - start;
+ } else {
+ prealloc_start = last_delalloc_end + 1;
+ prealloc_len = delalloc_start - prealloc_start;
+ }
+ }
+
+ if (prealloc_len > 0) {
+ if (!checked_extent_shared && fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(inode,
+ disk_bytenr,
+ extent_gen,
+ backref_ctx);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ prealloc_flags |= FIEMAP_EXTENT_SHARED;
+
+ checked_extent_shared = true;
+ }
+ ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
+ disk_bytenr + extent_offset,
+ prealloc_len, prealloc_flags);
+ if (ret)
+ return ret;
+ extent_offset += prealloc_len;
+ }
+
+ ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
+ delalloc_end + 1 - delalloc_start,
+ FIEMAP_EXTENT_DELALLOC |
+ FIEMAP_EXTENT_UNKNOWN);
+ if (ret)
+ return ret;
+
+ last_delalloc_end = delalloc_end;
+ cur_offset = delalloc_end + 1;
+ extent_offset += cur_offset - delalloc_start;
+ cond_resched();
}
- btrfs_release_path(path);
/*
- * we might have some extents allocated but more delalloc past those
- * extents. so, we trust isize unless the start of the last extent is
- * beyond isize
+ * Either we found no delalloc for the whole prealloc extent or we have
+ * a prealloc extent that spans i_size or starts at or after i_size.
+ */
+ if (disk_bytenr != 0 && last_delalloc_end < end) {
+ u64 prealloc_start;
+ u64 prealloc_len;
+
+ if (last_delalloc_end == 0) {
+ prealloc_start = start;
+ prealloc_len = end + 1 - start;
+ } else {
+ prealloc_start = last_delalloc_end + 1;
+ prealloc_len = end + 1 - prealloc_start;
+ }
+
+ if (!checked_extent_shared && fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(inode,
+ disk_bytenr,
+ extent_gen,
+ backref_ctx);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ prealloc_flags |= FIEMAP_EXTENT_SHARED;
+ }
+ ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
+ disk_bytenr + extent_offset,
+ prealloc_len, prealloc_flags);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ u64 *last_extent_end_ret)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *ei;
+ struct btrfs_key key;
+ u64 disk_bytenr;
+ int ret;
+
+ /*
+ * Lookup the last file extent. We're not using i_size here because
+ * there might be preallocation past i_size.
+ */
+ ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
+ /* There can't be a file extent item at offset (u64)-1 */
+ ASSERT(ret != 0);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * For a non-existing key, btrfs_search_slot() always leaves us at a
+ * slot > 0, except if the btree is empty, which is impossible because
+ * at least it has the inode item for this inode and all the items for
+ * the root inode 256.
*/
- if (last < isize) {
- last = (u64)-1;
- last_for_get_extent = isize;
+ ASSERT(path->slots[0] > 0);
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+ /* No file extent items in the subvolume tree. */
+ *last_extent_end_ret = 0;
+ return 0;
}
- lock_extent_bits(&inode->io_tree, start, start + len - 1,
- &cached_state);
+ /*
+ * For an inline extent, the disk_bytenr is where inline data starts at,
+ * so first check if we have an inline extent item before checking if we
+ * have an implicit hole (disk_bytenr == 0).
+ */
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
+ *last_extent_end_ret = btrfs_file_extent_end(path);
+ return 0;
+ }
- em = get_extent_skip_holes(inode, start, last_for_get_extent);
- if (!em)
- goto out;
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
+ /*
+ * Find the last file extent item that is not a hole (when NO_HOLES is
+ * not enabled). This should take at most 2 iterations in the worst
+ * case: we have one hole file extent item at slot 0 of a leaf and
+ * another hole file extent item as the last item in the previous leaf.
+ * This is because we merge file extent items that represent holes.
+ */
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ while (disk_bytenr == 0) {
+ ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ /* No file extent items that are not holes. */
+ *last_extent_end_ret = 0;
+ return 0;
+ }
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ }
+
+ *last_extent_end_ret = btrfs_file_extent_end(path);
+ return 0;
+}
+
+int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct extent_state *cached_state = NULL;
+ struct extent_state *delalloc_cached_state = NULL;
+ struct btrfs_path *path;
+ struct fiemap_cache cache = { 0 };
+ struct btrfs_backref_share_check_ctx *backref_ctx;
+ u64 last_extent_end;
+ u64 prev_extent_end;
+ u64 lockstart;
+ u64 lockend;
+ bool stopped = false;
+ int ret;
+
+ backref_ctx = btrfs_alloc_backref_share_check_ctx();
+ path = btrfs_alloc_path();
+ if (!backref_ctx || !path) {
+ ret = -ENOMEM;
goto out;
}
- while (!end) {
- u64 offset_in_extent = 0;
+ lockstart = round_down(start, inode->root->fs_info->sectorsize);
+ lockend = round_up(start + len, inode->root->fs_info->sectorsize);
+ prev_extent_end = lockstart;
- /* break if the extent we found is outside the range */
- if (em->start >= max || extent_map_end(em) < off)
- break;
+ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
- /*
- * get_extent may return an extent that starts before our
- * requested range. We have to make sure the ranges
- * we return to fiemap always move forward and don't
- * overlap, so adjust the offsets here
- */
- em_start = max(em->start, off);
+ ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
+ if (ret < 0)
+ goto out_unlock;
+ btrfs_release_path(path);
+ path->reada = READA_FORWARD;
+ ret = fiemap_search_slot(inode, path, lockstart);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
/*
- * record the offset from the start of the extent
- * for adjusting the disk offset below. Only do this if the
- * extent isn't compressed since our in ram offset may be past
- * what we have actually allocated on disk.
+ * No file extent item found, but we may have delalloc between
+ * the current offset and i_size. So check for that.
*/
- if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
- offset_in_extent = em_start - em->start;
- em_end = extent_map_end(em);
- em_len = em_end - em_start;
- flags = 0;
- if (em->block_start < EXTENT_MAP_LAST_BYTE)
- disko = em->block_start + offset_in_extent;
- else
- disko = 0;
+ ret = 0;
+ goto check_eof_delalloc;
+ }
+
+ while (prev_extent_end < lockend) {
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_file_extent_item *ei;
+ struct btrfs_key key;
+ u64 extent_end;
+ u64 extent_len;
+ u64 extent_offset = 0;
+ u64 extent_gen;
+ u64 disk_bytenr = 0;
+ u64 flags = 0;
+ int extent_type;
+ u8 compression;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ extent_end = btrfs_file_extent_end(path);
/*
- * bump off for our next call to get_extent
+ * The first iteration can leave us at an extent item that ends
+ * before our range's start. Move to the next item.
*/
- off = extent_map_end(em);
- if (off >= max)
- end = 1;
-
- if (em->block_start == EXTENT_MAP_LAST_BYTE) {
- end = 1;
- flags |= FIEMAP_EXTENT_LAST;
- } else if (em->block_start == EXTENT_MAP_INLINE) {
- flags |= (FIEMAP_EXTENT_DATA_INLINE |
- FIEMAP_EXTENT_NOT_ALIGNED);
- } else if (em->block_start == EXTENT_MAP_DELALLOC) {
- flags |= (FIEMAP_EXTENT_DELALLOC |
- FIEMAP_EXTENT_UNKNOWN);
- } else if (fieinfo->fi_extents_max) {
- u64 bytenr = em->block_start -
- (em->start - em->orig_start);
+ if (extent_end <= lockstart)
+ goto next_item;
- /*
- * As btrfs supports shared space, this information
- * can be exported to userspace tools via
- * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0
- * then we're just getting a count and we can skip the
- * lookup stuff.
- */
- ret = btrfs_check_shared(root, btrfs_ino(inode),
- bytenr, roots, tmp_ulist);
- if (ret < 0)
- goto out_free;
- if (ret)
- flags |= FIEMAP_EXTENT_SHARED;
- ret = 0;
+ backref_ctx->curr_leaf_bytenr = leaf->start;
+
+ /* We have in implicit hole (NO_HOLES feature enabled). */
+ if (prev_extent_end < key.offset) {
+ const u64 range_end = min(key.offset, lockend) - 1;
+
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ &delalloc_cached_state,
+ backref_ctx, 0, 0, 0,
+ prev_extent_end, range_end);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* fiemap_fill_next_extent() told us to stop. */
+ stopped = true;
+ break;
+ }
+
+ /* We've reached the end of the fiemap range, stop. */
+ if (key.offset >= lockend) {
+ stopped = true;
+ break;
+ }
}
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+
+ extent_len = extent_end - key.offset;
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ compression = btrfs_file_extent_compression(leaf, ei);
+ extent_type = btrfs_file_extent_type(leaf, ei);
+ extent_gen = btrfs_file_extent_generation(leaf, ei);
+
+ if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ if (compression == BTRFS_COMPRESS_NONE)
+ extent_offset = btrfs_file_extent_offset(leaf, ei);
+ }
+
+ if (compression != BTRFS_COMPRESS_NONE)
flags |= FIEMAP_EXTENT_ENCODED;
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- flags |= FIEMAP_EXTENT_UNWRITTEN;
- free_extent_map(em);
- em = NULL;
- if ((em_start >= last) || em_len == (u64)-1 ||
- (last == (u64)-1 && isize <= em_end)) {
- flags |= FIEMAP_EXTENT_LAST;
- end = 1;
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ flags |= FIEMAP_EXTENT_DATA_INLINE;
+ flags |= FIEMAP_EXTENT_NOT_ALIGNED;
+ ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
+ extent_len, flags);
+ } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ &delalloc_cached_state,
+ backref_ctx,
+ disk_bytenr, extent_offset,
+ extent_gen, key.offset,
+ extent_end - 1);
+ } else if (disk_bytenr == 0) {
+ /* We have an explicit hole. */
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ &delalloc_cached_state,
+ backref_ctx, 0, 0, 0,
+ key.offset, extent_end - 1);
+ } else {
+ /* We have a regular extent. */
+ if (fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(inode,
+ disk_bytenr,
+ extent_gen,
+ backref_ctx);
+ if (ret < 0)
+ goto out_unlock;
+ else if (ret > 0)
+ flags |= FIEMAP_EXTENT_SHARED;
+ }
+
+ ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
+ disk_bytenr + extent_offset,
+ extent_len, flags);
}
- /* now scan forward to see if this is really the last extent. */
- em = get_extent_skip_holes(inode, off, last_for_get_extent);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out;
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* fiemap_fill_next_extent() told us to stop. */
+ stopped = true;
+ break;
}
- if (!em) {
- flags |= FIEMAP_EXTENT_LAST;
- end = 1;
+
+ prev_extent_end = extent_end;
+next_item:
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out_unlock;
}
- ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
- em_len, flags);
- if (ret) {
- if (ret == 1)
- ret = 0;
- goto out_free;
+
+ ret = fiemap_next_leaf_item(inode, path);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* No more file extent items for this inode. */
+ break;
}
+ cond_resched();
}
-out_free:
- if (!ret)
- ret = emit_last_fiemap_cache(fieinfo, &cache);
- free_extent_map(em);
-out:
- unlock_extent_cached(&inode->io_tree, start, start + len - 1,
- &cached_state);
-out_free_ulist:
+check_eof_delalloc:
+ /*
+ * Release (and free) the path before emitting any final entries to
+ * fiemap_fill_next_extent() to keep lockdep happy. This is because
+ * once we find no more file extent items exist, we may have a
+ * non-cloned leaf, and fiemap_fill_next_extent() can trigger page
+ * faults when copying data to the user space buffer.
+ */
+ btrfs_free_path(path);
+ path = NULL;
+
+ if (!stopped && prev_extent_end < lockend) {
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ &delalloc_cached_state, backref_ctx,
+ 0, 0, 0, prev_extent_end, lockend - 1);
+ if (ret < 0)
+ goto out_unlock;
+ prev_extent_end = lockend;
+ }
+
+ if (cache.cached && cache.offset + cache.len >= last_extent_end) {
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+
+ if (prev_extent_end < i_size) {
+ u64 delalloc_start;
+ u64 delalloc_end;
+ bool delalloc;
+
+ delalloc = btrfs_find_delalloc_in_range(inode,
+ prev_extent_end,
+ i_size - 1,
+ &delalloc_cached_state,
+ &delalloc_start,
+ &delalloc_end);
+ if (!delalloc)
+ cache.flags |= FIEMAP_EXTENT_LAST;
+ } else {
+ cache.flags |= FIEMAP_EXTENT_LAST;
+ }
+ }
+
+ ret = emit_last_fiemap_cache(fieinfo, &cache);
+
+out_unlock:
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+out:
+ free_extent_state(delalloc_cached_state);
+ btrfs_free_backref_share_ctx(backref_ctx);
btrfs_free_path(path);
- ulist_free(roots);
- ulist_free(tmp_ulist);
return ret;
}
@@ -5856,7 +4130,7 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
btrfs_release_extent_buffer_pages(eb);
- btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
+ btrfs_leak_debug_del_eb(eb);
__free_extent_buffer(eb);
}
@@ -5870,11 +4144,9 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
eb->start = start;
eb->len = len;
eb->fs_info = fs_info;
- eb->bflags = 0;
init_rwsem(&eb->lock);
- btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
- &fs_info->allocated_ebs);
+ btrfs_leak_debug_add_eb(eb);
INIT_LIST_HEAD(&eb->release_list);
spin_lock_init(&eb->refs_lock);
@@ -5904,7 +4176,6 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
*/
set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
- memset(new->pages, 0, sizeof(*new->pages) * num_pages);
ret = btrfs_alloc_page_array(num_pages, new->pages);
if (ret) {
btrfs_release_extent_buffer(new);
@@ -6342,7 +4613,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
spin_unlock(&eb->refs_lock);
}
- btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
+ btrfs_leak_debug_del_eb(eb);
/* Should be safe to release our pages at this point */
btrfs_release_extent_buffer_pages(eb);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -6362,18 +4633,16 @@ static int release_extent_buffer(struct extent_buffer *eb)
void free_extent_buffer(struct extent_buffer *eb)
{
int refs;
- int old;
if (!eb)
return;
+ refs = atomic_read(&eb->refs);
while (1) {
- refs = atomic_read(&eb->refs);
if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
|| (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
refs == 1))
break;
- old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
- if (old == refs)
+ if (atomic_try_cmpxchg(&eb->refs, &refs, refs - 1))
return;
}
@@ -6551,11 +4820,13 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb)
}
static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
- int mirror_num)
+ int mirror_num,
+ struct btrfs_tree_parent_check *check)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct extent_io_tree *io_tree;
struct page *page = eb->pages[0];
+ struct extent_state *cached_state = NULL;
struct btrfs_bio_ctrl bio_ctrl = {
.mirror_num = mirror_num,
};
@@ -6563,13 +4834,16 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
ASSERT(PagePrivate(page));
+ ASSERT(check);
io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
if (wait == WAIT_NONE) {
- if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1))
+ if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1,
+ &cached_state))
return -EAGAIN;
} else {
- ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
+ ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1,
+ &cached_state);
if (ret < 0)
return ret;
}
@@ -6579,7 +4853,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
PageUptodate(page) ||
btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- unlock_extent(io_tree, eb->start, eb->start + eb->len - 1);
+ unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
+ &cached_state);
return ret;
}
@@ -6587,13 +4862,14 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
eb->read_mirror = 0;
atomic_set(&eb->io_pages, 1);
check_buffer_tree_ref(eb);
+ bio_ctrl.end_io_func = end_bio_extent_readpage;
+
btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl,
- page, eb->start, eb->len,
- eb->start - page_offset(page),
- end_bio_extent_readpage, 0, true);
+ eb->start, page, eb->len,
+ eb->start - page_offset(page), 0, true);
if (ret) {
/*
* In the endio function, if we hit something wrong we will
@@ -6602,17 +4878,22 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
*/
atomic_dec(&eb->io_pages);
}
+ memcpy(&btrfs_bio(bio_ctrl.bio)->parent_check, check, sizeof(*check));
submit_one_bio(&bio_ctrl);
- if (ret || wait != WAIT_COMPLETE)
+ if (ret || wait != WAIT_COMPLETE) {
+ free_extent_state(cached_state);
return ret;
+ }
- wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED);
+ wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1,
+ EXTENT_LOCKED, &cached_state);
if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
ret = -EIO;
return ret;
}
-int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
+int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
+ struct btrfs_tree_parent_check *check)
{
int i;
struct page *page;
@@ -6638,7 +4919,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
return -EIO;
if (eb->fs_info->nodesize < PAGE_SIZE)
- return read_extent_buffer_subpage(eb, wait, mirror_num);
+ return read_extent_buffer_subpage(eb, wait, mirror_num, check);
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
@@ -6684,6 +4965,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
* set io_pages. See check_buffer_tree_ref for a more detailed comment.
*/
check_buffer_tree_ref(eb);
+ bio_ctrl.end_io_func = end_bio_extent_readpage;
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
@@ -6696,9 +4978,8 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
ClearPageError(page);
err = submit_extent_page(REQ_OP_READ, NULL,
- &bio_ctrl, page, page_offset(page),
- PAGE_SIZE, 0, end_bio_extent_readpage,
- 0, false);
+ &bio_ctrl, page_offset(page), page,
+ PAGE_SIZE, 0, 0, false);
if (err) {
/*
* We failed to submit the bio so it's the
@@ -6715,6 +4996,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
}
}
+ memcpy(&btrfs_bio(bio_ctrl.bio)->parent_check, check, sizeof(*check));
submit_one_bio(&bio_ctrl);
if (ret || wait != WAIT_COMPLETE)
@@ -7073,11 +5355,12 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
*page_offset = offset_in_page(offset);
}
-/**
- * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
- * @eb: the extent buffer
- * @start: offset of the bitmap item in the extent buffer
- * @nr: bit number to test
+/*
+ * Determine whether a bit in a bitmap item is set.
+ *
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @nr: bit number to test
*/
int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
unsigned long nr)
@@ -7094,12 +5377,13 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
}
-/**
- * extent_buffer_bitmap_set - set an area of a bitmap
- * @eb: the extent buffer
- * @start: offset of the bitmap item in the extent buffer
- * @pos: bit number of the first bit
- * @len: number of bits to set
+/*
+ * Set an area of a bitmap to 1.
+ *
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @pos: bit number of the first bit
+ * @len: number of bits to set
*/
void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
unsigned long pos, unsigned long len)
@@ -7136,12 +5420,13 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star
}
-/**
- * extent_buffer_bitmap_clear - clear an area of a bitmap
- * @eb: the extent buffer
- * @start: offset of the bitmap item in the extent buffer
- * @pos: bit number of the first bit
- * @len: number of bits to clear
+/*
+ * Clear an area of a bitmap.
+ *
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @pos: bit number of the first bit
+ * @len: number of bits to clear
*/
void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
unsigned long start, unsigned long pos,
@@ -7447,6 +5732,11 @@ int try_release_extent_buffer(struct page *page)
void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root, u64 gen, int level)
{
+ struct btrfs_tree_parent_check check = {
+ .has_first_key = 0,
+ .level = level,
+ .transid = gen
+ };
struct extent_buffer *eb;
int ret;
@@ -7459,7 +5749,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
return;
}
- ret = read_extent_buffer_pages(eb, WAIT_NONE, 0);
+ ret = read_extent_buffer_pages(eb, WAIT_NONE, 0, &check);
if (ret < 0)
free_extent_buffer_stale(eb);
else
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 4bc72a87b9a9..a2c82448b2e0 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -9,6 +9,7 @@
#include <linux/btrfs_tree.h>
#include "compression.h"
#include "ulist.h"
+#include "misc.h"
enum {
EXTENT_BUFFER_UPTODATE,
@@ -29,13 +30,15 @@ enum {
};
/* these are flags for __process_pages_contig */
-#define PAGE_UNLOCK (1 << 0)
-/* Page starts writeback, clear dirty bit and set writeback bit */
-#define PAGE_START_WRITEBACK (1 << 1)
-#define PAGE_END_WRITEBACK (1 << 2)
-#define PAGE_SET_ORDERED (1 << 3)
-#define PAGE_SET_ERROR (1 << 4)
-#define PAGE_LOCK (1 << 5)
+enum {
+ ENUM_BIT(PAGE_UNLOCK),
+ /* Page starts writeback, clear dirty bit and set writeback bit */
+ ENUM_BIT(PAGE_START_WRITEBACK),
+ ENUM_BIT(PAGE_END_WRITEBACK),
+ ENUM_BIT(PAGE_SET_ORDERED),
+ ENUM_BIT(PAGE_SET_ERROR),
+ ENUM_BIT(PAGE_LOCK),
+};
/*
* page->private values. Every page that is controlled by the extent
@@ -60,17 +63,13 @@ enum {
struct btrfs_bio;
struct btrfs_root;
struct btrfs_inode;
-struct btrfs_io_bio;
struct btrfs_fs_info;
struct io_failure_record;
struct extent_io_tree;
+struct btrfs_tree_parent_check;
-typedef void (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
- int mirror_num,
- enum btrfs_compression_type compress_type);
-
-typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode,
- struct bio *bio, u64 dio_file_offset);
+int __init extent_buffer_init_cachep(void);
+void __cold extent_buffer_free_cachep(void);
#define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE)
struct extent_buffer {
@@ -97,6 +96,39 @@ struct extent_buffer {
};
/*
+ * Get the correct offset inside the page of extent buffer.
+ *
+ * @eb: target extent buffer
+ * @start: offset inside the extent buffer
+ *
+ * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases.
+ */
+static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb,
+ unsigned long offset)
+{
+ /*
+ * For sectorsize == PAGE_SIZE case, eb->start will always be aligned
+ * to PAGE_SIZE, thus adding it won't cause any difference.
+ *
+ * For sectorsize < PAGE_SIZE, we must only read the data that belongs
+ * to the eb, thus we have to take the eb->start into consideration.
+ */
+ return offset_in_page(offset + eb->start);
+}
+
+static inline unsigned long get_eb_page_index(unsigned long offset)
+{
+ /*
+ * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough.
+ *
+ * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE,
+ * and have ensured that all tree blocks are contained in one page,
+ * thus we always get index == 0.
+ */
+ return offset >> PAGE_SHIFT;
+}
+
+/*
* Structure to record how many bytes and which ranges are set/cleared
*/
struct extent_changeset {
@@ -172,8 +204,8 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
#define WAIT_NONE 0
#define WAIT_COMPLETE 1
#define WAIT_PAGE_LOCK 2
-int read_extent_buffer_pages(struct extent_buffer *eb, int wait,
- int mirror_num);
+int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
+ struct btrfs_tree_parent_check *parent_check);
void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root, u64 gen, int level);
@@ -240,13 +272,12 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
u32 bits_to_clear, unsigned long page_ops);
+int extent_invalidate_folio(struct extent_io_tree *tree,
+ struct folio *folio, size_t offset);
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
-struct bio *btrfs_bio_alloc(unsigned int nr_iovecs);
-struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
-int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
/*
* When IO fails, either with EIO or csum verification fails, we
@@ -257,8 +288,12 @@ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
* bio end_io callback is called to indicate things have failed.
*/
struct io_failure_record {
+ /* Use rb_simple_node for search/insert */
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ };
struct page *page;
- u64 start;
u64 len;
u64 logical;
int this_mirror;
@@ -266,9 +301,12 @@ struct io_failure_record {
int num_copies;
};
-int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
+int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio,
u32 bio_offset, struct page *page, unsigned int pgoff,
- submit_bio_hook_t *submit_bio_hook);
+ bool submit_buffered);
+void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end);
+int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start,
+ struct page *page, unsigned int pg_offset);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 6fee14ce2e6b..be94030e1dfb 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -3,10 +3,12 @@
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
+#include "messages.h"
#include "ctree.h"
#include "volumes.h"
#include "extent_map.h"
#include "compression.h"
+#include "btrfs_inode.h"
static struct kmem_cache *extent_map_cache;
@@ -26,12 +28,9 @@ void __cold extent_map_exit(void)
kmem_cache_destroy(extent_map_cache);
}
-/**
- * extent_map_tree_init - initialize extent map tree
- * @tree: tree to initialize
- *
- * Initialize the extent tree @tree. Should be called for each new inode
- * or other user of the extent_map interface.
+/*
+ * Initialize the extent tree @tree. Should be called for each new inode or
+ * other user of the extent_map interface.
*/
void extent_map_tree_init(struct extent_map_tree *tree)
{
@@ -40,12 +39,9 @@ void extent_map_tree_init(struct extent_map_tree *tree)
rwlock_init(&tree->lock);
}
-/**
- * alloc_extent_map - allocate new extent map structure
- *
- * Allocate a new extent_map structure. The new structure is
- * returned with a reference count of one and needs to be
- * freed using free_extent_map()
+/*
+ * Allocate a new extent_map structure. The new structure is returned with a
+ * reference count of one and needs to be freed using free_extent_map()
*/
struct extent_map *alloc_extent_map(void)
{
@@ -54,26 +50,20 @@ struct extent_map *alloc_extent_map(void)
if (!em)
return NULL;
RB_CLEAR_NODE(&em->rb_node);
- em->flags = 0;
em->compress_type = BTRFS_COMPRESS_NONE;
- em->generation = 0;
refcount_set(&em->refs, 1);
INIT_LIST_HEAD(&em->list);
return em;
}
-/**
- * free_extent_map - drop reference count of an extent_map
- * @em: extent map being released
- *
- * Drops the reference out on @em by one and free the structure
- * if the reference count hits zero.
+/*
+ * Drop the reference out on @em by one and free the structure if the reference
+ * count hits zero.
*/
void free_extent_map(struct extent_map *em)
{
if (!em)
return;
- WARN_ON(refcount_read(&em->refs) == 0);
if (refcount_dec_and_test(&em->refs)) {
WARN_ON(extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
@@ -83,7 +73,7 @@ void free_extent_map(struct extent_map *em)
}
}
-/* simple helper to do math around the end of an extent, handling wrap */
+/* Do the math around the end of an extent, handling wrapping. */
static u64 range_end(u64 start, u64 len)
{
if (start + len < start)
@@ -139,12 +129,11 @@ static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
}
/*
- * search through the tree for an extent_map with a given offset. If
- * it can't be found, try to find some neighboring extents
+ * Search through the tree for an extent_map with a given offset. If it can't
+ * be found, try to find some neighboring extents
*/
static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
- struct rb_node **prev_ret,
- struct rb_node **next_ret)
+ struct rb_node **prev_or_next_ret)
{
struct rb_node *n = root->rb_node;
struct rb_node *prev = NULL;
@@ -152,6 +141,8 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
struct extent_map *entry;
struct extent_map *prev_entry = NULL;
+ ASSERT(prev_or_next_ret);
+
while (n) {
entry = rb_entry(n, struct extent_map, rb_node);
prev = n;
@@ -165,28 +156,33 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
return n;
}
- if (prev_ret) {
- orig_prev = prev;
- while (prev && offset >= extent_map_end(prev_entry)) {
- prev = rb_next(prev);
- prev_entry = rb_entry(prev, struct extent_map, rb_node);
- }
- *prev_ret = prev;
- prev = orig_prev;
+ orig_prev = prev;
+ while (prev && offset >= extent_map_end(prev_entry)) {
+ prev = rb_next(prev);
+ prev_entry = rb_entry(prev, struct extent_map, rb_node);
+ }
+
+ /*
+ * Previous extent map found, return as in this case the caller does not
+ * care about the next one.
+ */
+ if (prev) {
+ *prev_or_next_ret = prev;
+ return NULL;
}
- if (next_ret) {
+ prev = orig_prev;
+ prev_entry = rb_entry(prev, struct extent_map, rb_node);
+ while (prev && offset < prev_entry->start) {
+ prev = rb_prev(prev);
prev_entry = rb_entry(prev, struct extent_map, rb_node);
- while (prev && offset < prev_entry->start) {
- prev = rb_prev(prev);
- prev_entry = rb_entry(prev, struct extent_map, rb_node);
- }
- *next_ret = prev;
}
+ *prev_or_next_ret = prev;
+
return NULL;
}
-/* check to see if two extent_map structs are adjacent and safe to merge */
+/* Check to see if two extent_map structs are adjacent and safe to merge. */
static int mergable_maps(struct extent_map *prev, struct extent_map *next)
{
if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
@@ -284,8 +280,9 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
}
}
-/**
- * unpin_extent_cache - unpin an extent from the cache
+/*
+ * Unpin an extent from the cache.
+ *
* @tree: tree to unpin the extent in
* @start: logical offset in the file
* @len: length of the extent
@@ -336,6 +333,8 @@ out:
void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
{
+ lockdep_assert_held_write(&tree->lock);
+
clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
if (extent_map_in_tree(em))
try_merge_map(tree, em);
@@ -382,11 +381,11 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
__clear_extent_bit(&device->alloc_state, stripe->physical,
stripe->physical + stripe_size - 1, bits,
- 0, 0, NULL, GFP_NOWAIT, NULL);
+ NULL, GFP_NOWAIT, NULL);
}
}
-/**
+/*
* Add new extent map to the extent tree
*
* @tree: tree to insert new map in
@@ -425,16 +424,13 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
{
struct extent_map *em;
struct rb_node *rb_node;
- struct rb_node *prev = NULL;
- struct rb_node *next = NULL;
+ struct rb_node *prev_or_next = NULL;
u64 end = range_end(start, len);
- rb_node = __tree_search(&tree->map.rb_root, start, &prev, &next);
+ rb_node = __tree_search(&tree->map.rb_root, start, &prev_or_next);
if (!rb_node) {
- if (prev)
- rb_node = prev;
- else if (next)
- rb_node = next;
+ if (prev_or_next)
+ rb_node = prev_or_next;
else
return NULL;
}
@@ -448,8 +444,9 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
return em;
}
-/**
- * lookup_extent_mapping - lookup extent_map
+/*
+ * Lookup extent_map that intersects @start + @len range.
+ *
* @tree: tree to lookup in
* @start: byte offset to start the search
* @len: length of the lookup range
@@ -465,8 +462,9 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
return __lookup_extent_mapping(tree, start, len, 1);
}
-/**
- * search_extent_mapping - find a nearby extent map
+/*
+ * Find a nearby extent map intersecting @start + @len (not an exact search).
+ *
* @tree: tree to lookup in
* @start: byte offset to start the search
* @len: length of the lookup range
@@ -482,13 +480,14 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
return __lookup_extent_mapping(tree, start, len, 0);
}
-/**
- * remove_extent_mapping - removes an extent_map from the extent tree
+/*
+ * Remove an extent_map from the extent tree.
+ *
* @tree: extent tree to remove from
* @em: extent map being removed
*
- * Removes @em from @tree. No reference counts are dropped, and no checks
- * are done to see if the range is in use
+ * Remove @em from @tree. No reference counts are dropped, and no checks
+ * are done to see if the range is in use.
*/
void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
{
@@ -520,7 +519,7 @@ void replace_extent_mapping(struct extent_map_tree *tree,
setup_extent_mapping(tree, new, modified);
}
-static struct extent_map *next_extent_map(struct extent_map *em)
+static struct extent_map *next_extent_map(const struct extent_map *em)
{
struct rb_node *next;
@@ -582,8 +581,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
return add_extent_mapping(em_tree, em, 0);
}
-/**
- * Add extent mapping into em_tree
+/*
+ * Add extent mapping into em_tree.
*
* @fs_info: the filesystem
* @em_tree: extent tree into which we want to insert the extent mapping
@@ -610,6 +609,13 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
int ret;
struct extent_map *em = *em_in;
+ /*
+ * Tree-checker should have rejected any inline extent with non-zero
+ * file offset. Here just do a sanity check.
+ */
+ if (em->block_start == EXTENT_MAP_INLINE)
+ ASSERT(em->start == 0);
+
ret = add_extent_mapping(em_tree, em, 0);
/* it is possible that someone inserted the extent into the tree
* while we had the lock dropped. It is also possible that
@@ -658,3 +664,293 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
ASSERT(ret == 0 || ret == -EEXIST);
return ret;
}
+
+/*
+ * Drop all extent maps from a tree in the fastest possible way, rescheduling
+ * if needed. This avoids searching the tree, from the root down to the first
+ * extent map, before each deletion.
+ */
+static void drop_all_extent_maps_fast(struct extent_map_tree *tree)
+{
+ write_lock(&tree->lock);
+ while (!RB_EMPTY_ROOT(&tree->map.rb_root)) {
+ struct extent_map *em;
+ struct rb_node *node;
+
+ node = rb_first_cached(&tree->map);
+ em = rb_entry(node, struct extent_map, rb_node);
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ remove_extent_mapping(tree, em);
+ free_extent_map(em);
+ cond_resched_rwlock_write(&tree->lock);
+ }
+ write_unlock(&tree->lock);
+}
+
+/*
+ * Drop all extent maps in a given range.
+ *
+ * @inode: The target inode.
+ * @start: Start offset of the range.
+ * @end: End offset of the range (inclusive value).
+ * @skip_pinned: Indicate if pinned extent maps should be ignored or not.
+ *
+ * This drops all the extent maps that intersect the given range [@start, @end].
+ * Extent maps that partially overlap the range and extend behind or beyond it,
+ * are split.
+ * The caller should have locked an appropriate file range in the inode's io
+ * tree before calling this function.
+ */
+void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
+ bool skip_pinned)
+{
+ struct extent_map *split;
+ struct extent_map *split2;
+ struct extent_map *em;
+ struct extent_map_tree *em_tree = &inode->extent_tree;
+ u64 len = end - start + 1;
+
+ WARN_ON(end < start);
+ if (end == (u64)-1) {
+ if (start == 0 && !skip_pinned) {
+ drop_all_extent_maps_fast(em_tree);
+ return;
+ }
+ len = (u64)-1;
+ } else {
+ /* Make end offset exclusive for use in the loop below. */
+ end++;
+ }
+
+ /*
+ * It's ok if we fail to allocate the extent maps, see the comment near
+ * the bottom of the loop below. We only need two spare extent maps in
+ * the worst case, where the first extent map that intersects our range
+ * starts before the range and the last extent map that intersects our
+ * range ends after our range (and they might be the same extent map),
+ * because we need to split those two extent maps at the boundaries.
+ */
+ split = alloc_extent_map();
+ split2 = alloc_extent_map();
+
+ write_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+
+ while (em) {
+ /* extent_map_end() returns exclusive value (last byte + 1). */
+ const u64 em_end = extent_map_end(em);
+ struct extent_map *next_em = NULL;
+ u64 gen;
+ unsigned long flags;
+ bool modified;
+ bool compressed;
+
+ if (em_end < end) {
+ next_em = next_extent_map(em);
+ if (next_em) {
+ if (next_em->start < end)
+ refcount_inc(&next_em->refs);
+ else
+ next_em = NULL;
+ }
+ }
+
+ if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
+ start = em_end;
+ if (end != (u64)-1)
+ len = start + len - em_end;
+ goto next;
+ }
+
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ clear_bit(EXTENT_FLAG_LOGGING, &flags);
+ modified = !list_empty(&em->list);
+
+ /*
+ * The extent map does not cross our target range, so no need to
+ * split it, we can remove it directly.
+ */
+ if (em->start >= start && em_end <= end)
+ goto remove_em;
+
+ flags = em->flags;
+ gen = em->generation;
+ compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+
+ if (em->start < start) {
+ if (!split) {
+ split = split2;
+ split2 = NULL;
+ if (!split)
+ goto remove_em;
+ }
+ split->start = em->start;
+ split->len = start - em->start;
+
+ if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ split->orig_start = em->orig_start;
+ split->block_start = em->block_start;
+
+ if (compressed)
+ split->block_len = em->block_len;
+ else
+ split->block_len = split->len;
+ split->orig_block_len = max(split->block_len,
+ em->orig_block_len);
+ split->ram_bytes = em->ram_bytes;
+ } else {
+ split->orig_start = split->start;
+ split->block_len = 0;
+ split->block_start = em->block_start;
+ split->orig_block_len = 0;
+ split->ram_bytes = split->len;
+ }
+
+ split->generation = gen;
+ split->flags = flags;
+ split->compress_type = em->compress_type;
+ replace_extent_mapping(em_tree, em, split, modified);
+ free_extent_map(split);
+ split = split2;
+ split2 = NULL;
+ }
+ if (em_end > end) {
+ if (!split) {
+ split = split2;
+ split2 = NULL;
+ if (!split)
+ goto remove_em;
+ }
+ split->start = start + len;
+ split->len = em_end - (start + len);
+ split->block_start = em->block_start;
+ split->flags = flags;
+ split->compress_type = em->compress_type;
+ split->generation = gen;
+
+ if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ split->orig_block_len = max(em->block_len,
+ em->orig_block_len);
+
+ split->ram_bytes = em->ram_bytes;
+ if (compressed) {
+ split->block_len = em->block_len;
+ split->orig_start = em->orig_start;
+ } else {
+ const u64 diff = start + len - em->start;
+
+ split->block_len = split->len;
+ split->block_start += diff;
+ split->orig_start = em->orig_start;
+ }
+ } else {
+ split->ram_bytes = split->len;
+ split->orig_start = split->start;
+ split->block_len = 0;
+ split->orig_block_len = 0;
+ }
+
+ if (extent_map_in_tree(em)) {
+ replace_extent_mapping(em_tree, em, split,
+ modified);
+ } else {
+ int ret;
+
+ ret = add_extent_mapping(em_tree, split,
+ modified);
+ /* Logic error, shouldn't happen. */
+ ASSERT(ret == 0);
+ if (WARN_ON(ret != 0) && modified)
+ btrfs_set_inode_full_sync(inode);
+ }
+ free_extent_map(split);
+ split = NULL;
+ }
+remove_em:
+ if (extent_map_in_tree(em)) {
+ /*
+ * If the extent map is still in the tree it means that
+ * either of the following is true:
+ *
+ * 1) It fits entirely in our range (doesn't end beyond
+ * it or starts before it);
+ *
+ * 2) It starts before our range and/or ends after our
+ * range, and we were not able to allocate the extent
+ * maps for split operations, @split and @split2.
+ *
+ * If we are at case 2) then we just remove the entire
+ * extent map - this is fine since if anyone needs it to
+ * access the subranges outside our range, will just
+ * load it again from the subvolume tree's file extent
+ * item. However if the extent map was in the list of
+ * modified extents, then we must mark the inode for a
+ * full fsync, otherwise a fast fsync will miss this
+ * extent if it's new and needs to be logged.
+ */
+ if ((em->start < start || em_end > end) && modified) {
+ ASSERT(!split);
+ btrfs_set_inode_full_sync(inode);
+ }
+ remove_extent_mapping(em_tree, em);
+ }
+
+ /*
+ * Once for the tree reference (we replaced or removed the
+ * extent map from the tree).
+ */
+ free_extent_map(em);
+next:
+ /* Once for us (for our lookup reference). */
+ free_extent_map(em);
+
+ em = next_em;
+ }
+
+ write_unlock(&em_tree->lock);
+
+ free_extent_map(split);
+ free_extent_map(split2);
+}
+
+/*
+ * Replace a range in the inode's extent map tree with a new extent map.
+ *
+ * @inode: The target inode.
+ * @new_em: The new extent map to add to the inode's extent map tree.
+ * @modified: Indicate if the new extent map should be added to the list of
+ * modified extents (for fast fsync tracking).
+ *
+ * Drops all the extent maps in the inode's extent map tree that intersect the
+ * range of the new extent map and adds the new extent map to the tree.
+ * The caller should have locked an appropriate file range in the inode's io
+ * tree before calling this function.
+ */
+int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
+ struct extent_map *new_em,
+ bool modified)
+{
+ const u64 end = new_em->start + new_em->len - 1;
+ struct extent_map_tree *tree = &inode->extent_tree;
+ int ret;
+
+ ASSERT(!extent_map_in_tree(new_em));
+
+ /*
+ * The caller has locked an appropriate file range in the inode's io
+ * tree, but getting -EEXIST when adding the new extent map can still
+ * happen in case there are extents that partially cover the range, and
+ * this is due to two tasks operating on different parts of the extent.
+ * See commit 18e83ac75bfe67 ("Btrfs: fix unexpected EEXIST from
+ * btrfs_get_extent") for an example and details.
+ */
+ do {
+ btrfs_drop_extent_map_range(inode, new_em->start, end, false);
+ write_lock(&tree->lock);
+ ret = add_extent_mapping(tree, new_em, modified);
+ write_unlock(&tree->lock);
+ } while (ret == -EEXIST);
+
+ return ret;
+}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index d2fa32ffe304..ad311864272a 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -63,6 +63,8 @@ struct extent_map_tree {
rwlock_t lock;
};
+struct btrfs_inode;
+
static inline int extent_map_in_tree(const struct extent_map *em)
{
return !RB_EMPTY_NODE(&em->rb_node);
@@ -104,5 +106,11 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
struct extent_map_tree *em_tree,
struct extent_map **em_in, u64 start, u64 len);
+void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
+ u64 start, u64 end,
+ bool skip_pinned);
+int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
+ struct extent_map *new_em,
+ bool modified);
#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index c828f971a346..5de73466b2ca 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -9,13 +9,18 @@
#include <linux/highmem.h>
#include <linux/sched/mm.h>
#include <crypto/hash.h>
+#include "messages.h"
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
-#include "volumes.h"
+#include "bio.h"
#include "print-tree.h"
#include "compression.h"
+#include "fs.h"
+#include "accessors.h"
+#include "file-item.h"
+#include "super.h"
#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
sizeof(struct btrfs_item) * 2) / \
@@ -24,8 +29,8 @@
#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
PAGE_SIZE))
-/**
- * Set inode's size according to filesystem options
+/*
+ * Set inode's size according to filesystem options.
*
* @inode: inode we want to update the disk_i_size for
* @new_i_size: i_size we want to set to, 0 if we use i_size
@@ -64,8 +69,8 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz
spin_unlock(&inode->lock);
}
-/**
- * Mark range within a file as having a new extent inserted
+/*
+ * Mark range within a file as having a new extent inserted.
*
* @inode: inode being modified
* @start: start file offset of the file extent we've inserted
@@ -92,8 +97,8 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
EXTENT_DIRTY);
}
-/**
- * Marks an inode range as not having a backing extent
+/*
+ * Mark an inode range as not having a backing extent.
*
* @inode: inode being modified
* @start: start file offset of the file extent we've inserted
@@ -118,23 +123,43 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
return 0;
return clear_extent_bit(&inode->file_extent_tree, start,
- start + len - 1, EXTENT_DIRTY, 0, 0, NULL);
+ start + len - 1, EXTENT_DIRTY, NULL);
+}
+
+static size_t bytes_to_csum_size(const struct btrfs_fs_info *fs_info, u32 bytes)
+{
+ ASSERT(IS_ALIGNED(bytes, fs_info->sectorsize));
+
+ return (bytes >> fs_info->sectorsize_bits) * fs_info->csum_size;
}
-static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
- u16 csum_size)
+static size_t csum_size_to_bytes(const struct btrfs_fs_info *fs_info, u32 csum_size)
{
- u32 ncsums = (PAGE_SIZE - sizeof(struct btrfs_ordered_sum)) / csum_size;
+ ASSERT(IS_ALIGNED(csum_size, fs_info->csum_size));
- return ncsums * fs_info->sectorsize;
+ return (csum_size / fs_info->csum_size) << fs_info->sectorsize_bits;
+}
+
+static inline u32 max_ordered_sum_bytes(const struct btrfs_fs_info *fs_info)
+{
+ u32 max_csum_size = round_down(PAGE_SIZE - sizeof(struct btrfs_ordered_sum),
+ fs_info->csum_size);
+
+ return csum_size_to_bytes(fs_info, max_csum_size);
+}
+
+/*
+ * Calculate the total size needed to allocate for an ordered sum structure
+ * spanning @bytes in the file.
+ */
+static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes)
+{
+ return sizeof(struct btrfs_ordered_sum) + bytes_to_csum_size(fs_info, bytes);
}
-int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
+int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 objectid, u64 pos,
- u64 disk_offset, u64 disk_num_bytes,
- u64 num_bytes, u64 offset, u64 ram_bytes,
- u8 compression, u8 encryption, u16 other_encoding)
+ u64 objectid, u64 pos, u64 num_bytes)
{
int ret = 0;
struct btrfs_file_extent_item *item;
@@ -157,16 +182,16 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
- btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
- btrfs_set_file_extent_offset(leaf, item, offset);
+ btrfs_set_file_extent_disk_bytenr(leaf, item, 0);
+ btrfs_set_file_extent_disk_num_bytes(leaf, item, 0);
+ btrfs_set_file_extent_offset(leaf, item, 0);
btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
- btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, item, num_bytes);
btrfs_set_file_extent_generation(leaf, item, trans->transid);
btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
- btrfs_set_file_extent_compression(leaf, item, compression);
- btrfs_set_file_extent_encryption(leaf, item, encryption);
- btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
+ btrfs_set_file_extent_compression(leaf, item, 0);
+ btrfs_set_file_extent_encryption(leaf, item, 0);
+ btrfs_set_file_extent_other_encoding(leaf, item, 0);
btrfs_mark_buffer_dirty(leaf);
out:
@@ -246,7 +271,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
/*
* Find checksums for logical bytenr range [disk_bytenr, disk_bytenr + len) and
- * estore the result to @dst.
+ * store the result to @dst.
*
* Return >0 for the number of sectors we found.
* Return 0 for the range [disk_bytenr, disk_bytenr + sectorsize) has no csum
@@ -352,15 +377,15 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode,
return ret;
}
-/**
+/*
* Lookup the checksum for the read bio in csum tree.
*
- * @inode: inode that the bio is for.
- * @bio: bio to look up.
- * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return
- * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If
- * NULL, the checksum buffer is allocated and returned in
- * btrfs_bio(bio)->csum instead.
+ * @inode: inode that the bio is for.
+ * @bio: bio to look up.
+ * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return
+ * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If
+ * NULL, the checksum buffer is allocated and returned in
+ * btrfs_bio(bio)->csum instead.
*
* Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise.
*/
@@ -502,8 +527,9 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
return ret;
}
-int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
- struct list_head *list, int search_commit)
+int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
+ struct list_head *list, int search_commit,
+ bool nowait)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
@@ -512,11 +538,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct btrfs_ordered_sum *sums;
struct btrfs_csum_item *item;
LIST_HEAD(tmplist);
- unsigned long offset;
int ret;
- size_t size;
- u64 csum_end;
- const u32 csum_size = fs_info->csum_size;
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
IS_ALIGNED(end + 1, fs_info->sectorsize));
@@ -525,6 +547,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
if (!path)
return -ENOMEM;
+ path->nowait = nowait;
if (search_commit) {
path->skip_locking = 1;
path->reada = READA_FORWARD;
@@ -541,16 +564,33 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
if (ret > 0 && path->slots[0] > 0) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
+
+ /*
+ * There are two cases we can hit here for the previous csum
+ * item:
+ *
+ * |<- search range ->|
+ * |<- csum item ->|
+ *
+ * Or
+ * |<- search range ->|
+ * |<- csum item ->|
+ *
+ * Check if the previous csum item covers the leading part of
+ * the search range. If so we have to start from previous csum
+ * item.
+ */
if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
key.type == BTRFS_EXTENT_CSUM_KEY) {
- offset = (start - key.offset) >> fs_info->sectorsize_bits;
- if (offset * csum_size <
+ if (bytes_to_csum_size(fs_info, start - key.offset) <
btrfs_item_size(leaf, path->slots[0] - 1))
path->slots[0]--;
}
}
while (start <= end) {
+ u64 csum_end;
+
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
@@ -570,8 +610,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
if (key.offset > start)
start = key.offset;
- size = btrfs_item_size(leaf, path->slots[0]);
- csum_end = key.offset + (size / csum_size) * fs_info->sectorsize;
+ csum_end = key.offset + csum_size_to_bytes(fs_info,
+ btrfs_item_size(leaf, path->slots[0]));
if (csum_end <= start) {
path->slots[0]++;
continue;
@@ -581,8 +621,11 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_csum_item);
while (start < csum_end) {
+ unsigned long offset;
+ size_t size;
+
size = min_t(size_t, csum_end - start,
- max_ordered_sum_bytes(fs_info, csum_size));
+ max_ordered_sum_bytes(fs_info));
sums = kzalloc(btrfs_ordered_sum_size(fs_info, size),
GFP_NOFS);
if (!sums) {
@@ -593,16 +636,14 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
sums->bytenr = start;
sums->len = (int)size;
- offset = (start - key.offset) >> fs_info->sectorsize_bits;
- offset *= csum_size;
- size >>= fs_info->sectorsize_bits;
+ offset = bytes_to_csum_size(fs_info, start - key.offset);
read_extent_buffer(path->nodes[0],
sums->sums,
((unsigned long)item) + offset,
- csum_size * size);
+ bytes_to_csum_size(fs_info, size));
- start += fs_info->sectorsize * size;
+ start += size;
list_add_tail(&sums->list, &tmplist);
}
path->slots[0]++;
@@ -620,8 +661,129 @@ fail:
return ret;
}
-/**
- * Calculate checksums of the data contained inside a bio
+/*
+ * Do the same work as btrfs_lookup_csums_list(), the difference is in how
+ * we return the result.
+ *
+ * This version will set the corresponding bits in @csum_bitmap to represent
+ * that there is a csum found.
+ * Each bit represents a sector. Thus caller should ensure @csum_buf passed
+ * in is large enough to contain all csums.
+ */
+int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end,
+ u8 *csum_buf, unsigned long *csum_bitmap)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_csum_item *item;
+ const u64 orig_start = start;
+ int ret;
+
+ ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
+ IS_ALIGNED(end + 1, fs_info->sectorsize));
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key.type = BTRFS_EXTENT_CSUM_KEY;
+ key.offset = start;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto fail;
+ if (ret > 0 && path->slots[0] > 0) {
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
+
+ /*
+ * There are two cases we can hit here for the previous csum
+ * item:
+ *
+ * |<- search range ->|
+ * |<- csum item ->|
+ *
+ * Or
+ * |<- search range ->|
+ * |<- csum item ->|
+ *
+ * Check if the previous csum item covers the leading part of
+ * the search range. If so we have to start from previous csum
+ * item.
+ */
+ if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
+ key.type == BTRFS_EXTENT_CSUM_KEY) {
+ if (bytes_to_csum_size(fs_info, start - key.offset) <
+ btrfs_item_size(leaf, path->slots[0] - 1))
+ path->slots[0]--;
+ }
+ }
+
+ while (start <= end) {
+ u64 csum_end;
+
+ leaf = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto fail;
+ if (ret > 0)
+ break;
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset > end)
+ break;
+
+ if (key.offset > start)
+ start = key.offset;
+
+ csum_end = key.offset + csum_size_to_bytes(fs_info,
+ btrfs_item_size(leaf, path->slots[0]));
+ if (csum_end <= start) {
+ path->slots[0]++;
+ continue;
+ }
+
+ csum_end = min(csum_end, end + 1);
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_csum_item);
+ while (start < csum_end) {
+ unsigned long offset;
+ size_t size;
+ u8 *csum_dest = csum_buf + bytes_to_csum_size(fs_info,
+ start - orig_start);
+
+ size = min_t(size_t, csum_end - start, end + 1 - start);
+
+ offset = bytes_to_csum_size(fs_info, start - key.offset);
+
+ read_extent_buffer(path->nodes[0], csum_dest,
+ ((unsigned long)item) + offset,
+ bytes_to_csum_size(fs_info, size));
+
+ bitmap_set(csum_bitmap,
+ (start - orig_start) >> fs_info->sectorsize_bits,
+ size >> fs_info->sectorsize_bits);
+
+ start += size;
+ }
+ path->slots[0]++;
+ }
+ ret = 0;
+fail:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Calculate checksums of the data contained inside a bio.
*
* @inode: Owner of the data inside the bio
* @bio: Contains the data to be checksummed
@@ -736,15 +898,16 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
}
/*
- * helper function for csum removal, this expects the
- * key to describe the csum pointed to by the path, and it expects
- * the csum to overlap the range [bytenr, len]
+ * Remove one checksum overlapping a range.
*
- * The csum should not be entirely contained in the range and the
- * range should not be entirely contained in the csum.
+ * This expects the key to describe the csum pointed to by the path, and it
+ * expects the csum to overlap the range [bytenr, len]
*
- * This calls btrfs_truncate_item with the correct args based on the
- * overlap, and fixes up the key as required.
+ * The csum should not be entirely contained in the range and the range should
+ * not be entirely contained in the csum.
+ *
+ * This calls btrfs_truncate_item with the correct args based on the overlap,
+ * and fixes up the key as required.
*/
static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
@@ -793,8 +956,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
}
/*
- * deletes the csum items from the csum tree for a given
- * range of bytes.
+ * Delete the csum items from the csum tree for a given range of bytes.
*/
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len)
@@ -1199,7 +1361,6 @@ out:
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
const struct btrfs_path *path,
struct btrfs_file_extent_item *fi,
- const bool new_inline,
struct extent_map *em)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1251,10 +1412,9 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
*/
em->orig_start = EXTENT_MAP_HOLE;
em->block_len = (u64)-1;
- if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) {
+ em->compress_type = compress_type;
+ if (compress_type != BTRFS_COMPRESS_NONE)
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- em->compress_type = compress_type;
- }
} else {
btrfs_err(fs_info,
"unknown file extent item type %d, inode %llu, offset %llu, "
diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h
new file mode 100644
index 000000000000..031225668434
--- /dev/null
+++ b/fs/btrfs/file-item.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_FILE_ITEM_H
+#define BTRFS_FILE_ITEM_H
+
+#include "accessors.h"
+
+#define BTRFS_FILE_EXTENT_INLINE_DATA_START \
+ (offsetof(struct btrfs_file_extent_item, disk_bytenr))
+
+static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_fs_info *info)
+{
+ return BTRFS_MAX_ITEM_SIZE(info) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
+}
+
+/*
+ * Return the number of bytes used by the item on disk, minus the size of any
+ * extent headers. If a file is compressed on disk, this is the compressed
+ * size.
+ */
+static inline u32 btrfs_file_extent_inline_item_len(
+ const struct extent_buffer *eb,
+ int nr)
+{
+ return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
+}
+
+static inline unsigned long btrfs_file_extent_inline_start(
+ const struct btrfs_file_extent_item *e)
+{
+ return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START;
+}
+
+static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
+{
+ return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize;
+}
+
+int btrfs_del_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr, u64 len);
+blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst);
+int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 objectid, u64 pos,
+ u64 num_bytes);
+int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid,
+ u64 bytenr, int mod);
+int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_ordered_sum *sums);
+blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
+ u64 offset, bool one_ordered);
+int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
+ struct list_head *list, int search_commit,
+ bool nowait);
+int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end,
+ u8 *csum_buf, unsigned long *csum_bitmap);
+void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
+ const struct btrfs_path *path,
+ struct btrfs_file_extent_item *fi,
+ struct extent_map *em);
+int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
+ u64 len);
+int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len);
+void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size);
+u64 btrfs_file_extent_end(const struct btrfs_path *path);
+
+#endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 5a3f6e0d9688..91b00eb2440e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -30,329 +30,13 @@
#include "delalloc-space.h"
#include "reflink.h"
#include "subpage.h"
-
-static struct kmem_cache *btrfs_inode_defrag_cachep;
-/*
- * when auto defrag is enabled we
- * queue up these defrag structs to remember which
- * inodes need defragging passes
- */
-struct inode_defrag {
- struct rb_node rb_node;
- /* objectid */
- u64 ino;
- /*
- * transid where the defrag was added, we search for
- * extents newer than this
- */
- u64 transid;
-
- /* root objectid */
- u64 root;
-
- /*
- * The extent size threshold for autodefrag.
- *
- * This value is different for compressed/non-compressed extents,
- * thus needs to be passed from higher layer.
- * (aka, inode_should_defrag())
- */
- u32 extent_thresh;
-};
-
-static int __compare_inode_defrag(struct inode_defrag *defrag1,
- struct inode_defrag *defrag2)
-{
- if (defrag1->root > defrag2->root)
- return 1;
- else if (defrag1->root < defrag2->root)
- return -1;
- else if (defrag1->ino > defrag2->ino)
- return 1;
- else if (defrag1->ino < defrag2->ino)
- return -1;
- else
- return 0;
-}
-
-/* pop a record for an inode into the defrag tree. The lock
- * must be held already
- *
- * If you're inserting a record for an older transid than an
- * existing record, the transid already in the tree is lowered
- *
- * If an existing record is found the defrag item you
- * pass in is freed
- */
-static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
- struct inode_defrag *defrag)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct inode_defrag *entry;
- struct rb_node **p;
- struct rb_node *parent = NULL;
- int ret;
-
- p = &fs_info->defrag_inodes.rb_node;
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct inode_defrag, rb_node);
-
- ret = __compare_inode_defrag(defrag, entry);
- if (ret < 0)
- p = &parent->rb_left;
- else if (ret > 0)
- p = &parent->rb_right;
- else {
- /* if we're reinserting an entry for
- * an old defrag run, make sure to
- * lower the transid of our existing record
- */
- if (defrag->transid < entry->transid)
- entry->transid = defrag->transid;
- entry->extent_thresh = min(defrag->extent_thresh,
- entry->extent_thresh);
- return -EEXIST;
- }
- }
- set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
- rb_link_node(&defrag->rb_node, parent, p);
- rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
- return 0;
-}
-
-static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
-{
- if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
- return 0;
-
- if (btrfs_fs_closing(fs_info))
- return 0;
-
- return 1;
-}
-
-/*
- * insert a defrag record for this inode if auto defrag is
- * enabled
- */
-int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, u32 extent_thresh)
-{
- struct btrfs_root *root = inode->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct inode_defrag *defrag;
- u64 transid;
- int ret;
-
- if (!__need_auto_defrag(fs_info))
- return 0;
-
- if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
- return 0;
-
- if (trans)
- transid = trans->transid;
- else
- transid = inode->root->last_trans;
-
- defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
- if (!defrag)
- return -ENOMEM;
-
- defrag->ino = btrfs_ino(inode);
- defrag->transid = transid;
- defrag->root = root->root_key.objectid;
- defrag->extent_thresh = extent_thresh;
-
- spin_lock(&fs_info->defrag_inodes_lock);
- if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
- /*
- * If we set IN_DEFRAG flag and evict the inode from memory,
- * and then re-read this inode, this new inode doesn't have
- * IN_DEFRAG flag. At the case, we may find the existed defrag.
- */
- ret = __btrfs_add_inode_defrag(inode, defrag);
- if (ret)
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
- } else {
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
- }
- spin_unlock(&fs_info->defrag_inodes_lock);
- return 0;
-}
-
-/*
- * pick the defragable inode that we want, if it doesn't exist, we will get
- * the next one.
- */
-static struct inode_defrag *
-btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
-{
- struct inode_defrag *entry = NULL;
- struct inode_defrag tmp;
- struct rb_node *p;
- struct rb_node *parent = NULL;
- int ret;
-
- tmp.ino = ino;
- tmp.root = root;
-
- spin_lock(&fs_info->defrag_inodes_lock);
- p = fs_info->defrag_inodes.rb_node;
- while (p) {
- parent = p;
- entry = rb_entry(parent, struct inode_defrag, rb_node);
-
- ret = __compare_inode_defrag(&tmp, entry);
- if (ret < 0)
- p = parent->rb_left;
- else if (ret > 0)
- p = parent->rb_right;
- else
- goto out;
- }
-
- if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
- parent = rb_next(parent);
- if (parent)
- entry = rb_entry(parent, struct inode_defrag, rb_node);
- else
- entry = NULL;
- }
-out:
- if (entry)
- rb_erase(parent, &fs_info->defrag_inodes);
- spin_unlock(&fs_info->defrag_inodes_lock);
- return entry;
-}
-
-void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
-{
- struct inode_defrag *defrag;
- struct rb_node *node;
-
- spin_lock(&fs_info->defrag_inodes_lock);
- node = rb_first(&fs_info->defrag_inodes);
- while (node) {
- rb_erase(node, &fs_info->defrag_inodes);
- defrag = rb_entry(node, struct inode_defrag, rb_node);
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
-
- cond_resched_lock(&fs_info->defrag_inodes_lock);
-
- node = rb_first(&fs_info->defrag_inodes);
- }
- spin_unlock(&fs_info->defrag_inodes_lock);
-}
-
-#define BTRFS_DEFRAG_BATCH 1024
-
-static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
- struct inode_defrag *defrag)
-{
- struct btrfs_root *inode_root;
- struct inode *inode;
- struct btrfs_ioctl_defrag_range_args range;
- int ret = 0;
- u64 cur = 0;
-
-again:
- if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
- goto cleanup;
- if (!__need_auto_defrag(fs_info))
- goto cleanup;
-
- /* get the inode */
- inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
- if (IS_ERR(inode_root)) {
- ret = PTR_ERR(inode_root);
- goto cleanup;
- }
-
- inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
- btrfs_put_root(inode_root);
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
- goto cleanup;
- }
-
- if (cur >= i_size_read(inode)) {
- iput(inode);
- goto cleanup;
- }
-
- /* do a chunk of defrag */
- clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
- memset(&range, 0, sizeof(range));
- range.len = (u64)-1;
- range.start = cur;
- range.extent_thresh = defrag->extent_thresh;
-
- sb_start_write(fs_info->sb);
- ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
- BTRFS_DEFRAG_BATCH);
- sb_end_write(fs_info->sb);
- iput(inode);
-
- if (ret < 0)
- goto cleanup;
-
- cur = max(cur + fs_info->sectorsize, range.start);
- goto again;
-
-cleanup:
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
- return ret;
-}
-
-/*
- * run through the list of inodes in the FS that need
- * defragging
- */
-int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
-{
- struct inode_defrag *defrag;
- u64 first_ino = 0;
- u64 root_objectid = 0;
-
- atomic_inc(&fs_info->defrag_running);
- while (1) {
- /* Pause the auto defragger. */
- if (test_bit(BTRFS_FS_STATE_REMOUNTING,
- &fs_info->fs_state))
- break;
-
- if (!__need_auto_defrag(fs_info))
- break;
-
- /* find an inode to defrag */
- defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
- first_ino);
- if (!defrag) {
- if (root_objectid || first_ino) {
- root_objectid = 0;
- first_ino = 0;
- continue;
- } else {
- break;
- }
- }
-
- first_ino = defrag->ino + 1;
- root_objectid = defrag->root;
-
- __btrfs_run_defrag_inode(fs_info, defrag);
- }
- atomic_dec(&fs_info->defrag_running);
-
- /*
- * during unmount, we use the transaction_wait queue to
- * wait for the defragger to stop
- */
- wake_up(&fs_info->transaction_wait);
- return 0;
-}
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "file-item.h"
+#include "ioctl.h"
+#include "file.h"
+#include "super.h"
/* simple helper to fault in pages and copy. This should go away
* and be replaced with calls into generic code.
@@ -473,7 +157,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
*/
clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, cached);
+ cached);
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
extra_bits, cached);
@@ -499,159 +183,6 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
}
/*
- * this drops all the extents in the cache that intersect the range
- * [start, end]. Existing extents are split as required.
- */
-void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
- int skip_pinned)
-{
- struct extent_map *em;
- struct extent_map *split = NULL;
- struct extent_map *split2 = NULL;
- struct extent_map_tree *em_tree = &inode->extent_tree;
- u64 len = end - start + 1;
- u64 gen;
- int ret;
- int testend = 1;
- unsigned long flags;
- int compressed = 0;
- bool modified;
-
- WARN_ON(end < start);
- if (end == (u64)-1) {
- len = (u64)-1;
- testend = 0;
- }
- while (1) {
- int no_splits = 0;
-
- modified = false;
- if (!split)
- split = alloc_extent_map();
- if (!split2)
- split2 = alloc_extent_map();
- if (!split || !split2)
- no_splits = 1;
-
- write_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
- if (!em) {
- write_unlock(&em_tree->lock);
- break;
- }
- flags = em->flags;
- gen = em->generation;
- if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
- if (testend && em->start + em->len >= start + len) {
- free_extent_map(em);
- write_unlock(&em_tree->lock);
- break;
- }
- start = em->start + em->len;
- if (testend)
- len = start + len - (em->start + em->len);
- free_extent_map(em);
- write_unlock(&em_tree->lock);
- continue;
- }
- compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
- clear_bit(EXTENT_FLAG_LOGGING, &flags);
- modified = !list_empty(&em->list);
- if (no_splits)
- goto next;
-
- if (em->start < start) {
- split->start = em->start;
- split->len = start - em->start;
-
- if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- split->orig_start = em->orig_start;
- split->block_start = em->block_start;
-
- if (compressed)
- split->block_len = em->block_len;
- else
- split->block_len = split->len;
- split->orig_block_len = max(split->block_len,
- em->orig_block_len);
- split->ram_bytes = em->ram_bytes;
- } else {
- split->orig_start = split->start;
- split->block_len = 0;
- split->block_start = em->block_start;
- split->orig_block_len = 0;
- split->ram_bytes = split->len;
- }
-
- split->generation = gen;
- split->flags = flags;
- split->compress_type = em->compress_type;
- replace_extent_mapping(em_tree, em, split, modified);
- free_extent_map(split);
- split = split2;
- split2 = NULL;
- }
- if (testend && em->start + em->len > start + len) {
- u64 diff = start + len - em->start;
-
- split->start = start + len;
- split->len = em->start + em->len - (start + len);
- split->flags = flags;
- split->compress_type = em->compress_type;
- split->generation = gen;
-
- if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- split->orig_block_len = max(em->block_len,
- em->orig_block_len);
-
- split->ram_bytes = em->ram_bytes;
- if (compressed) {
- split->block_len = em->block_len;
- split->block_start = em->block_start;
- split->orig_start = em->orig_start;
- } else {
- split->block_len = split->len;
- split->block_start = em->block_start
- + diff;
- split->orig_start = em->orig_start;
- }
- } else {
- split->ram_bytes = split->len;
- split->orig_start = split->start;
- split->block_len = 0;
- split->block_start = em->block_start;
- split->orig_block_len = 0;
- }
-
- if (extent_map_in_tree(em)) {
- replace_extent_mapping(em_tree, em, split,
- modified);
- } else {
- ret = add_extent_mapping(em_tree, split,
- modified);
- ASSERT(ret == 0); /* Logic error */
- }
- free_extent_map(split);
- split = NULL;
- }
-next:
- if (extent_map_in_tree(em))
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
-
- /* once for us */
- free_extent_map(em);
- /* once for the tree*/
- free_extent_map(em);
- }
- if (split)
- free_extent_map(split);
- if (split2)
- free_extent_map(split2);
-}
-
-/*
* this is very complex, but the basic idea is to drop all extents
* in the range start - end. hint_block is filled in with a block number
* that would be a good hint to the block allocator for this file.
@@ -708,7 +239,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
}
if (args->drop_cache)
- btrfs_drop_extent_cache(inode, args->start, args->end - 1, 0);
+ btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
if (args->start >= inode->disk_i_size && !args->replace_extent)
modify_tree = 0;
@@ -849,7 +380,10 @@ next_slot:
args->start - extent_offset,
0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
}
key.offset = args->start;
}
@@ -936,7 +470,10 @@ delete_extent_item:
key.offset - extent_offset, 0,
false);
ret = btrfs_free_extent(trans, &ref);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
args->bytes_found += extent_end - key.offset;
}
@@ -1339,26 +876,54 @@ static int prepare_uptodate_page(struct inode *inode,
return 0;
}
+static unsigned int get_prepare_fgp_flags(bool nowait)
+{
+ unsigned int fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
+
+ if (nowait)
+ fgp_flags |= FGP_NOWAIT;
+
+ return fgp_flags;
+}
+
+static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
+{
+ gfp_t gfp;
+
+ gfp = btrfs_alloc_write_mask(inode->i_mapping);
+ if (nowait) {
+ gfp &= ~__GFP_DIRECT_RECLAIM;
+ gfp |= GFP_NOWAIT;
+ }
+
+ return gfp;
+}
+
/*
* this just gets pages into the page cache and locks them down.
*/
static noinline int prepare_pages(struct inode *inode, struct page **pages,
size_t num_pages, loff_t pos,
- size_t write_bytes, bool force_uptodate)
+ size_t write_bytes, bool force_uptodate,
+ bool nowait)
{
int i;
unsigned long index = pos >> PAGE_SHIFT;
- gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+ gfp_t mask = get_prepare_gfp_flags(inode, nowait);
+ unsigned int fgp_flags = get_prepare_fgp_flags(nowait);
int err = 0;
int faili;
for (i = 0; i < num_pages; i++) {
again:
- pages[i] = find_or_create_page(inode->i_mapping, index + i,
- mask | __GFP_WRITE);
+ pages[i] = pagecache_get_page(inode->i_mapping, index + i,
+ fgp_flags, mask | __GFP_WRITE);
if (!pages[i]) {
faili = i - 1;
- err = -ENOMEM;
+ if (nowait)
+ err = -EAGAIN;
+ else
+ err = -ENOMEM;
goto fail;
}
@@ -1376,7 +941,7 @@ again:
pos + write_bytes, false);
if (err) {
put_page(pages[i]);
- if (err == -EAGAIN) {
+ if (!nowait && err == -EAGAIN) {
err = 0;
goto again;
}
@@ -1411,7 +976,7 @@ static noinline int
lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos,
size_t write_bytes,
- u64 *lockstart, u64 *lockend,
+ u64 *lockstart, u64 *lockend, bool nowait,
struct extent_state **cached_state)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1426,15 +991,28 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
if (start_pos < inode->vfs_inode.i_size) {
struct btrfs_ordered_extent *ordered;
- lock_extent_bits(&inode->io_tree, start_pos, last_pos,
- cached_state);
+ if (nowait) {
+ if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
+ cached_state)) {
+ for (i = 0; i < num_pages; i++) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ pages[i] = NULL;
+ }
+
+ return -EAGAIN;
+ }
+ } else {
+ lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);
+ }
+
ordered = btrfs_lookup_ordered_range(inode, start_pos,
last_pos - start_pos + 1);
if (ordered &&
ordered->file_offset + ordered->num_bytes > start_pos &&
ordered->file_offset <= last_pos) {
- unlock_extent_cached(&inode->io_tree, start_pos,
- last_pos, cached_state);
+ unlock_extent(&inode->io_tree, start_pos, last_pos,
+ cached_state);
for (i = 0; i < num_pages; i++) {
unlock_page(pages[i]);
put_page(pages[i]);
@@ -1481,10 +1059,11 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
* NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
*/
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
- size_t *write_bytes)
+ size_t *write_bytes, bool nowait)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
+ struct extent_state *cached_state = NULL;
u64 lockstart, lockend;
u64 num_bytes;
int ret;
@@ -1500,17 +1079,24 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
fs_info->sectorsize) - 1;
num_bytes = lockend - lockstart + 1;
- btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL);
+ if (nowait) {
+ if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
+ &cached_state)) {
+ btrfs_drew_write_unlock(&root->snapshot_lock);
+ return -EAGAIN;
+ }
+ } else {
+ btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
+ &cached_state);
+ }
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
- NULL, NULL, NULL, false);
- if (ret <= 0) {
- ret = 0;
+ NULL, NULL, NULL, nowait, false);
+ if (ret <= 0)
btrfs_drew_write_unlock(&root->snapshot_lock);
- } else {
+ else
*write_bytes = min_t(size_t, *write_bytes ,
num_bytes - pos + lockstart);
- }
- unlock_extent(&inode->io_tree, lockstart, lockend);
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
return ret;
}
@@ -1607,11 +1193,13 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
bool force_page_uptodate = false;
loff_t old_isize = i_size_read(inode);
unsigned int ilock_flags = 0;
+ const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
+ unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
- if (iocb->ki_flags & IOCB_NOWAIT)
+ if (nowait)
ilock_flags |= BTRFS_ILOCK_TRY;
- ret = btrfs_inode_lock(inode, ilock_flags);
+ ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
if (ret < 0)
return ret;
@@ -1664,18 +1252,29 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
extent_changeset_release(data_reserved);
ret = btrfs_check_data_free_space(BTRFS_I(inode),
&data_reserved, pos,
- write_bytes);
+ write_bytes, nowait);
if (ret < 0) {
+ int can_nocow;
+
+ if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) {
+ ret = -EAGAIN;
+ break;
+ }
+
/*
* If we don't have to COW at the offset, reserve
* metadata only. write_bytes may get smaller than
* requested here.
*/
- if (btrfs_check_nocow_lock(BTRFS_I(inode), pos,
- &write_bytes) > 0)
- only_release_metadata = true;
- else
+ can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos,
+ &write_bytes, nowait);
+ if (can_nocow < 0)
+ ret = can_nocow;
+ if (can_nocow > 0)
+ ret = 0;
+ if (ret)
break;
+ only_release_metadata = true;
}
num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
@@ -1685,7 +1284,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
WARN_ON(reserve_bytes == 0);
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
reserve_bytes,
- reserve_bytes, false);
+ reserve_bytes, nowait);
if (ret) {
if (!only_release_metadata)
btrfs_free_reserved_data_space(BTRFS_I(inode),
@@ -1693,19 +1292,27 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
write_bytes);
else
btrfs_check_nocow_unlock(BTRFS_I(inode));
+
+ if (nowait && ret == -ENOSPC)
+ ret = -EAGAIN;
break;
}
release_bytes = reserve_bytes;
again:
+ ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags);
+ if (ret) {
+ btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
+ break;
+ }
+
/*
* This is going to setup the pages array with the number of
* pages we want, so we don't really need to worry about the
* contents of pages from loop to loop
*/
ret = prepare_pages(inode, pages, num_pages,
- pos, write_bytes,
- force_page_uptodate);
+ pos, write_bytes, force_page_uptodate, false);
if (ret) {
btrfs_delalloc_release_extents(BTRFS_I(inode),
reserve_bytes);
@@ -1715,10 +1322,11 @@ again:
extents_locked = lock_and_cleanup_extent_if_need(
BTRFS_I(inode), pages,
num_pages, pos, write_bytes, &lockstart,
- &lockend, &cached_state);
+ &lockend, nowait, &cached_state);
if (extents_locked < 0) {
- if (extents_locked == -EAGAIN)
+ if (!nowait && extents_locked == -EAGAIN)
goto again;
+
btrfs_delalloc_release_extents(BTRFS_I(inode),
reserve_bytes);
ret = extents_locked;
@@ -1782,8 +1390,8 @@ again:
* possible cached extent state to avoid a memory leak.
*/
if (extents_locked)
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- lockstart, lockend, &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, &cached_state);
else
free_extent_state(cached_state);
@@ -1801,8 +1409,6 @@ again:
cond_resched();
- balance_dirty_pages_ratelimited(inode->i_mapping);
-
pos += copied;
num_written += copied;
}
@@ -1828,7 +1434,7 @@ again:
iocb->ki_pos += num_written;
}
out:
- btrfs_inode_unlock(inode, ilock_flags);
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
return num_written ? num_written : ret;
}
@@ -1858,6 +1464,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
loff_t endbyte;
ssize_t err;
unsigned int ilock_flags = 0;
+ struct iomap_dio *dio;
if (iocb->ki_flags & IOCB_NOWAIT)
ilock_flags |= BTRFS_ILOCK_TRY;
@@ -1867,19 +1474,19 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
ilock_flags |= BTRFS_ILOCK_SHARED;
relock:
- err = btrfs_inode_lock(inode, ilock_flags);
+ err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
if (err < 0)
return err;
err = generic_write_checks(iocb, from);
if (err <= 0) {
- btrfs_inode_unlock(inode, ilock_flags);
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
return err;
}
err = btrfs_write_check(iocb, from, err);
if (err < 0) {
- btrfs_inode_unlock(inode, ilock_flags);
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
goto out;
}
@@ -1890,13 +1497,13 @@ relock:
*/
if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
pos + iov_iter_count(from) > i_size_read(inode)) {
- btrfs_inode_unlock(inode, ilock_flags);
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
ilock_flags &= ~BTRFS_ILOCK_SHARED;
goto relock;
}
if (check_direct_IO(fs_info, from, pos)) {
- btrfs_inode_unlock(inode, ilock_flags);
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
goto buffered;
}
@@ -1918,11 +1525,22 @@ relock:
* So here we disable page faults in the iov_iter and then retry if we
* got -EFAULT, faulting in the pages before the retry.
*/
-again:
from->nofault = true;
- err = btrfs_dio_rw(iocb, from, written);
+ dio = btrfs_dio_write(iocb, from, written);
from->nofault = false;
+ /*
+ * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
+ * iocb, and that needs to lock the inode. So unlock it before calling
+ * iomap_dio_complete() to avoid a deadlock.
+ */
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+
+ if (IS_ERR_OR_NULL(dio))
+ err = PTR_ERR_OR_ZERO(dio);
+ else
+ err = iomap_dio_complete(dio);
+
/* No increment (+=) because iomap returns a cumulative value. */
if (err > 0)
written = err;
@@ -1948,12 +1566,10 @@ again:
} else {
fault_in_iov_iter_readable(from, left);
prev_left = left;
- goto again;
+ goto relock;
}
}
- btrfs_inode_unlock(inode, ilock_flags);
-
/*
* If 'err' is -ENOTBLK or we have not written all data, then it means
* we must fallback to buffered IO.
@@ -1965,8 +1581,8 @@ buffered:
/*
* If we are in a NOWAIT context, then return -EAGAIN to signal the caller
* it must retry the operation in a context where blocking is acceptable,
- * since we currently don't have NOWAIT semantics support for buffered IO
- * and may block there for many reasons (reserving space for example).
+ * because even if we end up not blocking during the buffered IO attempt
+ * below, we will block when flushing and waiting for the IO.
*/
if (iocb->ki_flags & IOCB_NOWAIT) {
err = -EAGAIN;
@@ -2006,7 +1622,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
loff_t count;
ssize_t ret;
- btrfs_inode_lock(inode, 0);
+ btrfs_inode_lock(BTRFS_I(inode), 0);
count = encoded->len;
ret = generic_write_checks_count(iocb, &count);
if (ret == 0 && count != encoded->len) {
@@ -2025,7 +1641,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
ret = btrfs_do_encoded_write(iocb, from, encoded);
out:
- btrfs_inode_unlock(inode, 0);
+ btrfs_inode_unlock(BTRFS_I(inode), 0);
return ret;
}
@@ -2045,7 +1661,7 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
if (BTRFS_FS_ERROR(inode->root->fs_info))
return -EROFS;
- if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+ if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
return -EOPNOTSUPP;
if (sync)
@@ -2086,10 +1702,12 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
{
struct btrfs_file_private *private = filp->private_data;
- if (private && private->filldir_buf)
+ if (private) {
kfree(private->filldir_buf);
- kfree(private);
- filp->private_data = NULL;
+ free_extent_state(private->llseek_cached_state);
+ kfree(private);
+ filp->private_data = NULL;
+ }
/*
* Set by setattr when we are about to truncate a file from a non-zero
@@ -2196,19 +1814,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (ret)
goto out;
- btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
atomic_inc(&root->log_batch);
/*
- * Always check for the full sync flag while holding the inode's lock,
- * to avoid races with other tasks. The flag must be either set all the
- * time during logging or always off all the time while logging.
- */
- full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
-
- /*
* Before we acquired the inode's lock and the mmap lock, someone may
* have dirtied more pages in the target range. We need to make sure
* that writeback for any such pages does not start while we are logging
@@ -2228,11 +1838,22 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = start_ordered_ops(inode, start, end);
if (ret) {
- btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
goto out;
}
/*
+ * Always check for the full sync flag while holding the inode's lock,
+ * to avoid races with other tasks. The flag must be either set all the
+ * time during logging or always off all the time while logging.
+ * We check the flag here after starting delalloc above, because when
+ * running delalloc the full sync flag may be set if we need to drop
+ * extra extent map ranges due to temporary memory allocation failures.
+ */
+ full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+
+ /*
* We have to do this here to avoid the priority inversion of waiting on
* IO of a lower priority task while holding a transaction open.
*
@@ -2320,7 +1941,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
- btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
if (ret == BTRFS_NO_LOG_SYNC) {
ret = btrfs_end_transaction(trans);
@@ -2380,6 +2001,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = btrfs_commit_transaction(trans);
out:
ASSERT(list_empty(&ctx.list));
+ ASSERT(list_empty(&ctx.conflict_inodes));
err = file_check_and_advance_wb_err(file);
if (!ret)
ret = err;
@@ -2387,7 +2009,7 @@ out:
out_release_extents:
btrfs_release_log_ctx_extents(&ctx);
- btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
goto out;
}
@@ -2448,7 +2070,6 @@ static int fill_holes(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
struct extent_map *hole_em;
- struct extent_map_tree *em_tree = &inode->extent_tree;
struct btrfs_key key;
int ret;
@@ -2505,8 +2126,8 @@ static int fill_holes(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
- offset, 0, 0, end - offset, 0, end - offset, 0, 0, 0);
+ ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset,
+ end - offset);
if (ret)
return ret;
@@ -2515,7 +2136,7 @@ out:
hole_em = alloc_extent_map();
if (!hole_em) {
- btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+ btrfs_drop_extent_map_range(inode, offset, end - 1, false);
btrfs_set_inode_full_sync(inode);
} else {
hole_em->start = offset;
@@ -2529,12 +2150,7 @@ out:
hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = trans->transid;
- do {
- btrfs_drop_extent_cache(inode, offset, end - 1, 0);
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, hole_em, 1);
- write_unlock(&em_tree->lock);
- } while (ret == -EEXIST);
+ ret = btrfs_replace_extent_map_range(inode, hole_em, true);
free_extent_map(hole_em);
if (ret)
btrfs_set_inode_full_sync(inode);
@@ -2591,8 +2207,8 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,
while (1) {
truncate_pagecache_range(inode, lockstart, lockend);
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- cached_state);
+ lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ cached_state);
/*
* We can't have ordered extents in the range, nor dirty/writeback
* pages, because we have locked the inode's VFS lock in exclusive
@@ -2607,8 +2223,8 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,
page_lockend))
break;
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ cached_state);
}
btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
@@ -2988,7 +2604,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
bool truncated_block = false;
bool updated_inode = false;
- btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
ret = btrfs_wait_ordered_range(inode, offset, len);
if (ret)
@@ -3008,9 +2624,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
if (ret)
goto out_only_mutex;
- lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
- lockend = round_down(offset + len,
- btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
+ lockstart = round_up(offset, fs_info->sectorsize);
+ lockend = round_down(offset + len, fs_info->sectorsize) - 1;
same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
/*
@@ -3037,7 +2652,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
truncated_block = true;
ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
if (ret) {
- btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
return ret;
}
}
@@ -3108,8 +2723,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
out:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
out_only_mutex:
if (!updated_inode && truncated_block && !ret) {
/*
@@ -3136,7 +2751,7 @@ out_only_mutex:
ret = ret2;
}
}
- btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
return ret;
}
@@ -3212,7 +2827,7 @@ enum {
static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
u64 offset)
{
- const u64 sectorsize = btrfs_inode_sectorsize(inode);
+ const u64 sectorsize = inode->root->fs_info->sectorsize;
struct extent_map *em;
int ret;
@@ -3242,7 +2857,7 @@ static int btrfs_zero_range(struct inode *inode,
struct extent_changeset *data_reserved = NULL;
int ret;
u64 alloc_hint = 0;
- const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode));
+ const u64 sectorsize = fs_info->sectorsize;
u64 alloc_start = round_down(offset, sectorsize);
u64 alloc_end = round_up(offset + len, sectorsize);
u64 bytes_to_reserve = 0;
@@ -3382,16 +2997,16 @@ reserve_space:
ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
alloc_start, bytes_to_reserve);
if (ret) {
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, &cached_state);
goto out;
}
ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
alloc_end - alloc_start,
i_blocksize(inode),
offset + len, &alloc_hint);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
/* btrfs_prealloc_file_range releases reserved space on error */
if (ret) {
space_reserved = false;
@@ -3428,7 +3043,7 @@ static long btrfs_fallocate(struct file *file, int mode,
u64 data_space_reserved = 0;
u64 qgroup_reserved = 0;
struct extent_map *em;
- int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
+ int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
int ret;
/* Do not allow fallocate in ZONED mode */
@@ -3447,7 +3062,7 @@ static long btrfs_fallocate(struct file *file, int mode,
if (mode & FALLOC_FL_PUNCH_HOLE)
return btrfs_punch_hole(file, offset, len);
- btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
ret = inode_newsize_ok(inode, offset + len);
@@ -3497,13 +3112,13 @@ static long btrfs_fallocate(struct file *file, int mode,
if (mode & FALLOC_FL_ZERO_RANGE) {
ret = btrfs_zero_range(inode, offset, len, mode);
- btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
return ret;
}
locked_end = alloc_end - 1;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
- &cached_state);
+ lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state);
btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
@@ -3592,31 +3207,303 @@ static long btrfs_fallocate(struct file *file, int mode,
*/
ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
out_unlock:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
- &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state);
out:
- btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
extent_changeset_free(data_reserved);
return ret;
}
-static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
- int whence)
+/*
+ * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range
+ * that has unflushed and/or flushing delalloc. There might be other adjacent
+ * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
+ * looping while it gets adjacent subranges, and merging them together.
+ */
+static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
+ struct extent_state **cached_state,
+ bool *search_io_tree,
+ u64 *delalloc_start_ret, u64 *delalloc_end_ret)
+{
+ u64 len = end + 1 - start;
+ u64 delalloc_len = 0;
+ struct btrfs_ordered_extent *oe;
+ u64 oe_start;
+ u64 oe_end;
+
+ /*
+ * Search the io tree first for EXTENT_DELALLOC. If we find any, it
+ * means we have delalloc (dirty pages) for which writeback has not
+ * started yet.
+ */
+ if (*search_io_tree) {
+ spin_lock(&inode->lock);
+ if (inode->delalloc_bytes > 0) {
+ spin_unlock(&inode->lock);
+ *delalloc_start_ret = start;
+ delalloc_len = count_range_bits(&inode->io_tree,
+ delalloc_start_ret, end,
+ len, EXTENT_DELALLOC, 1,
+ cached_state);
+ } else {
+ spin_unlock(&inode->lock);
+ }
+ }
+
+ if (delalloc_len > 0) {
+ /*
+ * If delalloc was found then *delalloc_start_ret has a sector size
+ * aligned value (rounded down).
+ */
+ *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;
+
+ if (*delalloc_start_ret == start) {
+ /* Delalloc for the whole range, nothing more to do. */
+ if (*delalloc_end_ret == end)
+ return true;
+ /* Else trim our search range for ordered extents. */
+ start = *delalloc_end_ret + 1;
+ len = end + 1 - start;
+ }
+ } else {
+ /* No delalloc, future calls don't need to search again. */
+ *search_io_tree = false;
+ }
+
+ /*
+ * Now also check if there's any ordered extent in the range.
+ * We do this because:
+ *
+ * 1) When delalloc is flushed, the file range is locked, we clear the
+ * EXTENT_DELALLOC bit from the io tree and create an extent map and
+ * an ordered extent for the write. So we might just have been called
+ * after delalloc is flushed and before the ordered extent completes
+ * and inserts the new file extent item in the subvolume's btree;
+ *
+ * 2) We may have an ordered extent created by flushing delalloc for a
+ * subrange that starts before the subrange we found marked with
+ * EXTENT_DELALLOC in the io tree.
+ *
+ * We could also use the extent map tree to find such delalloc that is
+ * being flushed, but using the ordered extents tree is more efficient
+ * because it's usually much smaller as ordered extents are removed from
+ * the tree once they complete. With the extent maps, we mau have them
+ * in the extent map tree for a very long time, and they were either
+ * created by previous writes or loaded by read operations.
+ */
+ oe = btrfs_lookup_first_ordered_range(inode, start, len);
+ if (!oe)
+ return (delalloc_len > 0);
+
+ /* The ordered extent may span beyond our search range. */
+ oe_start = max(oe->file_offset, start);
+ oe_end = min(oe->file_offset + oe->num_bytes - 1, end);
+
+ btrfs_put_ordered_extent(oe);
+
+ /* Don't have unflushed delalloc, return the ordered extent range. */
+ if (delalloc_len == 0) {
+ *delalloc_start_ret = oe_start;
+ *delalloc_end_ret = oe_end;
+ return true;
+ }
+
+ /*
+ * We have both unflushed delalloc (io_tree) and an ordered extent.
+ * If the ranges are adjacent returned a combined range, otherwise
+ * return the leftmost range.
+ */
+ if (oe_start < *delalloc_start_ret) {
+ if (oe_end < *delalloc_start_ret)
+ *delalloc_end_ret = oe_end;
+ *delalloc_start_ret = oe_start;
+ } else if (*delalloc_end_ret + 1 == oe_start) {
+ *delalloc_end_ret = oe_end;
+ }
+
+ return true;
+}
+
+/*
+ * Check if there's delalloc in a given range.
+ *
+ * @inode: The inode.
+ * @start: The start offset of the range. It does not need to be
+ * sector size aligned.
+ * @end: The end offset (inclusive value) of the search range.
+ * It does not need to be sector size aligned.
+ * @cached_state: Extent state record used for speeding up delalloc
+ * searches in the inode's io_tree. Can be NULL.
+ * @delalloc_start_ret: Output argument, set to the start offset of the
+ * subrange found with delalloc (may not be sector size
+ * aligned).
+ * @delalloc_end_ret: Output argument, set to he end offset (inclusive value)
+ * of the subrange found with delalloc.
+ *
+ * Returns true if a subrange with delalloc is found within the given range, and
+ * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
+ * end offsets of the subrange.
+ */
+bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
+ struct extent_state **cached_state,
+ u64 *delalloc_start_ret, u64 *delalloc_end_ret)
+{
+ u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
+ u64 prev_delalloc_end = 0;
+ bool search_io_tree = true;
+ bool ret = false;
+
+ while (cur_offset < end) {
+ u64 delalloc_start;
+ u64 delalloc_end;
+ bool delalloc;
+
+ delalloc = find_delalloc_subrange(inode, cur_offset, end,
+ cached_state, &search_io_tree,
+ &delalloc_start,
+ &delalloc_end);
+ if (!delalloc)
+ break;
+
+ if (prev_delalloc_end == 0) {
+ /* First subrange found. */
+ *delalloc_start_ret = max(delalloc_start, start);
+ *delalloc_end_ret = delalloc_end;
+ ret = true;
+ } else if (delalloc_start == prev_delalloc_end + 1) {
+ /* Subrange adjacent to the previous one, merge them. */
+ *delalloc_end_ret = delalloc_end;
+ } else {
+ /* Subrange not adjacent to the previous one, exit. */
+ break;
+ }
+
+ prev_delalloc_end = delalloc_end;
+ cur_offset = delalloc_end + 1;
+ cond_resched();
+ }
+
+ return ret;
+}
+
+/*
+ * Check if there's a hole or delalloc range in a range representing a hole (or
+ * prealloc extent) found in the inode's subvolume btree.
+ *
+ * @inode: The inode.
+ * @whence: Seek mode (SEEK_DATA or SEEK_HOLE).
+ * @start: Start offset of the hole region. It does not need to be sector
+ * size aligned.
+ * @end: End offset (inclusive value) of the hole region. It does not
+ * need to be sector size aligned.
+ * @start_ret: Return parameter, used to set the start of the subrange in the
+ * hole that matches the search criteria (seek mode), if such
+ * subrange is found (return value of the function is true).
+ * The value returned here may not be sector size aligned.
+ *
+ * Returns true if a subrange matching the given seek mode is found, and if one
+ * is found, it updates @start_ret with the start of the subrange.
+ */
+static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
+ struct extent_state **cached_state,
+ u64 start, u64 end, u64 *start_ret)
{
+ u64 delalloc_start;
+ u64 delalloc_end;
+ bool delalloc;
+
+ delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state,
+ &delalloc_start, &delalloc_end);
+ if (delalloc && whence == SEEK_DATA) {
+ *start_ret = delalloc_start;
+ return true;
+ }
+
+ if (delalloc && whence == SEEK_HOLE) {
+ /*
+ * We found delalloc but it starts after out start offset. So we
+ * have a hole between our start offset and the delalloc start.
+ */
+ if (start < delalloc_start) {
+ *start_ret = start;
+ return true;
+ }
+ /*
+ * Delalloc range starts at our start offset.
+ * If the delalloc range's length is smaller than our range,
+ * then it means we have a hole that starts where the delalloc
+ * subrange ends.
+ */
+ if (delalloc_end < end) {
+ *start_ret = delalloc_end + 1;
+ return true;
+ }
+
+ /* There's delalloc for the whole range. */
+ return false;
+ }
+
+ if (!delalloc && whence == SEEK_HOLE) {
+ *start_ret = start;
+ return true;
+ }
+
+ /*
+ * No delalloc in the range and we are seeking for data. The caller has
+ * to iterate to the next extent item in the subvolume btree.
+ */
+ return false;
+}
+
+static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
+{
+ struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
+ struct btrfs_file_private *private = file->private_data;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
- loff_t i_size = inode->vfs_inode.i_size;
+ struct extent_state **delalloc_cached_state;
+ const loff_t i_size = i_size_read(&inode->vfs_inode);
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *root = inode->root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ u64 last_extent_end;
u64 lockstart;
u64 lockend;
u64 start;
- u64 len;
- int ret = 0;
+ int ret;
+ bool found = false;
if (i_size == 0 || offset >= i_size)
return -ENXIO;
/*
+ * Quick path. If the inode has no prealloc extents and its number of
+ * bytes used matches its i_size, then it can not have holes.
+ */
+ if (whence == SEEK_HOLE &&
+ !(inode->flags & BTRFS_INODE_PREALLOC) &&
+ inode_get_bytes(&inode->vfs_inode) == i_size)
+ return i_size;
+
+ if (!private) {
+ private = kzalloc(sizeof(*private), GFP_KERNEL);
+ /*
+ * No worries if memory allocation failed.
+ * The private structure is used only for speeding up multiple
+ * lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
+ * so everything will still be correct.
+ */
+ file->private_data = private;
+ }
+
+ if (private)
+ delalloc_cached_state = &private->llseek_cached_state;
+ else
+ delalloc_cached_state = NULL;
+
+ /*
* offset can be negative, in this case we start finding DATA/HOLE from
* the very start of the file.
*/
@@ -3627,45 +3514,167 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
if (lockend <= lockstart)
lockend = lockstart + fs_info->sectorsize;
lockend--;
- len = lockend - lockstart + 1;
- lock_extent_bits(&inode->io_tree, lockstart, lockend, &cached_state);
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = READA_FORWARD;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+
+ last_extent_end = lockstart;
+
+ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
while (start < i_size) {
- em = btrfs_get_extent_fiemap(inode, start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- em = NULL;
- break;
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_file_extent_item *extent;
+ u64 extent_end;
+
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+
+ leaf = path->nodes[0];
}
- if (whence == SEEK_HOLE &&
- (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
- break;
- else if (whence == SEEK_DATA &&
- (em->block_start != EXTENT_MAP_HOLE &&
- !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
break;
- start = em->start + em->len;
- free_extent_map(em);
- em = NULL;
+ extent_end = btrfs_file_extent_end(path);
+
+ /*
+ * In the first iteration we may have a slot that points to an
+ * extent that ends before our start offset, so skip it.
+ */
+ if (extent_end <= start) {
+ path->slots[0]++;
+ continue;
+ }
+
+ /* We have an implicit hole, NO_HOLES feature is likely set. */
+ if (last_extent_end < key.offset) {
+ u64 search_start = last_extent_end;
+ u64 found_start;
+
+ /*
+ * First iteration, @start matches @offset and it's
+ * within the hole.
+ */
+ if (start == offset)
+ search_start = offset;
+
+ found = find_desired_extent_in_hole(inode, whence,
+ delalloc_cached_state,
+ search_start,
+ key.offset - 1,
+ &found_start);
+ if (found) {
+ start = found_start;
+ break;
+ }
+ /*
+ * Didn't find data or a hole (due to delalloc) in the
+ * implicit hole range, so need to analyze the extent.
+ */
+ }
+
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_disk_bytenr(leaf, extent) == 0 ||
+ btrfs_file_extent_type(leaf, extent) ==
+ BTRFS_FILE_EXTENT_PREALLOC) {
+ /*
+ * Explicit hole or prealloc extent, search for delalloc.
+ * A prealloc extent is treated like a hole.
+ */
+ u64 search_start = key.offset;
+ u64 found_start;
+
+ /*
+ * First iteration, @start matches @offset and it's
+ * within the hole.
+ */
+ if (start == offset)
+ search_start = offset;
+
+ found = find_desired_extent_in_hole(inode, whence,
+ delalloc_cached_state,
+ search_start,
+ extent_end - 1,
+ &found_start);
+ if (found) {
+ start = found_start;
+ break;
+ }
+ /*
+ * Didn't find data or a hole (due to delalloc) in the
+ * implicit hole range, so need to analyze the next
+ * extent item.
+ */
+ } else {
+ /*
+ * Found a regular or inline extent.
+ * If we are seeking for data, adjust the start offset
+ * and stop, we're done.
+ */
+ if (whence == SEEK_DATA) {
+ start = max_t(u64, key.offset, offset);
+ found = true;
+ break;
+ }
+ /*
+ * Else, we are seeking for a hole, check the next file
+ * extent item.
+ */
+ }
+
+ start = extent_end;
+ last_extent_end = extent_end;
+ path->slots[0]++;
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
cond_resched();
}
- free_extent_map(em);
- unlock_extent_cached(&inode->io_tree, lockstart, lockend,
- &cached_state);
- if (ret) {
- offset = ret;
- } else {
- if (whence == SEEK_DATA && start >= i_size)
- offset = -ENXIO;
- else
- offset = min_t(loff_t, start, i_size);
+
+ /* We have an implicit hole from the last extent found up to i_size. */
+ if (!found && start < i_size) {
+ found = find_desired_extent_in_hole(inode, whence,
+ delalloc_cached_state, start,
+ i_size - 1, &start);
+ if (!found)
+ start = i_size;
}
- return offset;
+out:
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+ btrfs_free_path(path);
+
+ if (ret < 0)
+ return ret;
+
+ if (whence == SEEK_DATA && start >= i_size)
+ return -ENXIO;
+
+ return min_t(loff_t, start, i_size);
}
static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
@@ -3677,9 +3686,9 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
return generic_file_llseek(file, offset, whence);
case SEEK_DATA:
case SEEK_HOLE:
- btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
- offset = find_desired_extent(BTRFS_I(inode), offset, whence);
- btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
+ offset = find_desired_extent(file, offset, whence);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
break;
}
@@ -3693,7 +3702,7 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
{
int ret;
- filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
+ filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
ret = fsverity_file_open(inode, filp);
if (ret)
@@ -3734,7 +3743,7 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
return 0;
- btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
again:
/*
* This is similar to what we do for direct IO writes, see the comment
@@ -3753,7 +3762,7 @@ again:
*/
pagefault_disable();
to->nofault = true;
- ret = btrfs_dio_rw(iocb, to, read);
+ ret = btrfs_dio_read(iocb, to, read);
to->nofault = false;
pagefault_enable();
@@ -3783,7 +3792,7 @@ again:
goto again;
}
}
- btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
return ret < 0 ? ret : read;
}
@@ -3810,6 +3819,7 @@ const struct file_operations btrfs_file_operations = {
.mmap = btrfs_file_mmap,
.open = btrfs_file_open,
.release = btrfs_release_file,
+ .get_unmapped_area = thp_get_unmapped_area,
.fsync = btrfs_sync_file,
.fallocate = btrfs_fallocate,
.unlocked_ioctl = btrfs_ioctl,
@@ -3819,23 +3829,6 @@ const struct file_operations btrfs_file_operations = {
.remap_file_range = btrfs_remap_file_range,
};
-void __cold btrfs_auto_defrag_exit(void)
-{
- kmem_cache_destroy(btrfs_inode_defrag_cachep);
-}
-
-int __init btrfs_auto_defrag_init(void)
-{
- btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
- sizeof(struct inode_defrag), 0,
- SLAB_MEM_SPREAD,
- NULL);
- if (!btrfs_inode_defrag_cachep)
- return -ENOMEM;
-
- return 0;
-}
-
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
{
int ret;
diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h
new file mode 100644
index 000000000000..82b34fbb295f
--- /dev/null
+++ b/fs/btrfs/file.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_FILE_H
+#define BTRFS_FILE_H
+
+extern const struct file_operations btrfs_file_operations;
+
+int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_drop_extents_args *args);
+int btrfs_replace_file_extents(struct btrfs_inode *inode,
+ struct btrfs_path *path, const u64 start,
+ const u64 end,
+ struct btrfs_replace_extent_info *extent_info,
+ struct btrfs_trans_handle **trans_out);
+int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode, u64 start, u64 end);
+ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded);
+int btrfs_release_file(struct inode *inode, struct file *file);
+int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
+ size_t num_pages, loff_t pos, size_t write_bytes,
+ struct extent_state **cached, bool noreserve);
+int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
+int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes, bool nowait);
+void btrfs_check_nocow_unlock(struct btrfs_inode *inode);
+bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
+ struct extent_state **cached_state,
+ u64 *delalloc_start_ret, u64 *delalloc_end_ret);
+
+#endif
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 996da650ecdc..0d250d052487 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -11,8 +11,10 @@
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
-#include "misc.h"
#include "ctree.h"
+#include "fs.h"
+#include "messages.h"
+#include "misc.h"
#include "free-space-cache.h"
#include "transaction.h"
#include "disk-io.h"
@@ -24,11 +26,18 @@
#include "discard.h"
#include "subpage.h"
#include "inode-item.h"
+#include "accessors.h"
+#include "file-item.h"
+#include "file.h"
+#include "super.h"
#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG SZ_64K
#define FORCE_EXTENT_THRESHOLD SZ_1M
+static struct kmem_cache *btrfs_free_space_cachep;
+static struct kmem_cache *btrfs_free_space_bitmap_cachep;
+
struct btrfs_trim_range {
u64 start;
u64 bytes;
@@ -48,6 +57,24 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset,
u64 bytes, bool update_stats);
+static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
+{
+ struct btrfs_free_space *info;
+ struct rb_node *node;
+
+ while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
+ info = rb_entry(node, struct btrfs_free_space, offset_index);
+ if (!info->bitmap) {
+ unlink_free_space(ctl, info, true);
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ } else {
+ free_bitmap(ctl, info);
+ }
+
+ cond_resched_lock(&ctl->tree_lock);
+ }
+}
+
static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_path *path,
u64 offset)
@@ -126,10 +153,8 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
block_group->disk_cache_state = BTRFS_DC_CLEAR;
}
- if (!block_group->iref) {
+ if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags))
block_group->inode = igrab(inode);
- block_group->iref = 1;
- }
spin_unlock(&block_group->lock);
return inode;
@@ -235,14 +260,13 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
}
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
if (ret) {
- btrfs_add_delayed_iput(inode);
+ btrfs_add_delayed_iput(BTRFS_I(inode));
goto out;
}
clear_nlink(inode);
/* One for the block groups ref */
spin_lock(&block_group->lock);
- if (block_group->iref) {
- block_group->iref = 0;
+ if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) {
block_group->inode = NULL;
spin_unlock(&block_group->lock);
iput(inode);
@@ -250,7 +274,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
spin_unlock(&block_group->lock);
}
/* One for the lookup ref */
- btrfs_add_delayed_iput(inode);
+ btrfs_add_delayed_iput(BTRFS_I(inode));
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
key.type = 0;
@@ -333,8 +357,8 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
btrfs_i_size_write(inode, 0);
truncate_pagecache(vfs_inode, 0);
- lock_extent_bits(&inode->io_tree, 0, (u64)-1, &cached_state);
- btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+ lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
+ btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
/*
* We skip the throttling logic for free space cache inodes, so we don't
@@ -345,7 +369,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
- unlock_extent_cached(&inode->io_tree, 0, (u64)-1, &cached_state);
+ unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
if (ret)
goto fail;
@@ -693,6 +717,12 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
max_bitmaps = max_t(u64, max_bitmaps, 1);
+ if (ctl->total_bitmaps > max_bitmaps)
+ btrfs_err(block_group->fs_info,
+"invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu",
+ block_group->start, block_group->length,
+ ctl->total_bitmaps, ctl->unit, max_bitmaps,
+ bytes_per_bg);
ASSERT(ctl->total_bitmaps <= max_bitmaps);
/*
@@ -875,7 +905,10 @@ out:
return ret;
free_cache:
io_ctl_drop_pages(&io_ctl);
+
+ spin_lock(&ctl->tree_lock);
__btrfs_remove_free_space_cache(ctl);
+ spin_unlock(&ctl->tree_lock);
goto out;
}
@@ -914,6 +947,8 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group,
return ret;
}
+static struct lock_class_key btrfs_free_space_inode_key;
+
int load_free_space_cache(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -983,6 +1018,14 @@ int load_free_space_cache(struct btrfs_block_group *block_group)
}
spin_unlock(&block_group->lock);
+ /*
+ * Reinitialize the class of struct inode's mapping->invalidate_lock for
+ * free space inodes to prevent false positives related to locks for normal
+ * inodes.
+ */
+ lockdep_set_class(&(&inode->i_data)->invalidate_lock,
+ &btrfs_free_space_inode_key);
+
ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
path, block_group->start);
btrfs_free_path(path);
@@ -1001,7 +1044,13 @@ int load_free_space_cache(struct btrfs_block_group *block_group)
if (ret == 0)
ret = 1;
} else {
+ /*
+ * We need to call the _locked variant so we don't try to update
+ * the discard counters.
+ */
+ spin_lock(&tmp_ctl.tree_lock);
__btrfs_remove_free_space_cache(&tmp_ctl);
+ spin_unlock(&tmp_ctl.tree_lock);
btrfs_warn(fs_info,
"block group %llu has wrong amount of free space",
block_group->start);
@@ -1123,7 +1172,7 @@ update_cache_item(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
- EXTENT_DELALLOC, 0, 0, NULL);
+ EXTENT_DELALLOC, NULL);
goto fail;
}
leaf = path->nodes[0];
@@ -1135,8 +1184,8 @@ update_cache_item(struct btrfs_trans_handle *trans,
if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
found_key.offset != offset) {
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
- inode->i_size - 1, EXTENT_DELALLOC, 0,
- 0, NULL);
+ inode->i_size - 1, EXTENT_DELALLOC,
+ NULL);
btrfs_release_path(path);
goto fail;
}
@@ -1232,7 +1281,7 @@ static int flush_dirty_cache(struct inode *inode)
ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
- EXTENT_DELALLOC, 0, 0, NULL);
+ EXTENT_DELALLOC, NULL);
return ret;
}
@@ -1252,8 +1301,8 @@ cleanup_write_cache_enospc(struct inode *inode,
struct extent_state **cached_state)
{
io_ctl_drop_pages(io_ctl);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
- i_size_read(inode) - 1, cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ cached_state);
}
static int __btrfs_wait_cache_io(struct btrfs_root *root,
@@ -1323,8 +1372,8 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
path, block_group->start);
}
-/**
- * Write out cached info to an inode
+/*
+ * Write out cached info to an inode.
*
* @root: root the inode belongs to
* @inode: freespace inode we are writing out
@@ -1378,8 +1427,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
if (ret)
goto out_unlock;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
- &cached_state);
+ lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ &cached_state);
io_ctl_set_generation(io_ctl, trans->transid);
@@ -1434,8 +1483,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
io_ctl_drop_pages(io_ctl);
io_ctl_free(io_ctl);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
- i_size_read(inode) - 1, &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ &cached_state);
/*
* at this point the pages are under IO and we're happy,
@@ -2677,8 +2726,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
btrfs_mark_bg_unused(block_group);
} else if (bg_reclaim_threshold &&
reclaimable_unusable >=
- div_factor_fine(block_group->zone_capacity,
- bg_reclaim_threshold)) {
+ mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) {
btrfs_mark_bg_to_reclaim(block_group);
}
@@ -2860,7 +2908,8 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group,
if (btrfs_is_zoned(fs_info)) {
btrfs_info(fs_info, "free space %llu active %d",
block_group->zone_capacity - block_group->alloc_offset,
- block_group->zone_is_active);
+ test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &block_group->runtime_flags));
return;
}
@@ -2964,34 +3013,6 @@ static void __btrfs_return_cluster_to_free_space(
btrfs_put_block_group(block_group);
}
-static void __btrfs_remove_free_space_cache_locked(
- struct btrfs_free_space_ctl *ctl)
-{
- struct btrfs_free_space *info;
- struct rb_node *node;
-
- while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
- info = rb_entry(node, struct btrfs_free_space, offset_index);
- if (!info->bitmap) {
- unlink_free_space(ctl, info, true);
- kmem_cache_free(btrfs_free_space_cachep, info);
- } else {
- free_bitmap(ctl, info);
- }
-
- cond_resched_lock(&ctl->tree_lock);
- }
-}
-
-void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
-{
- spin_lock(&ctl->tree_lock);
- __btrfs_remove_free_space_cache_locked(ctl);
- if (ctl->block_group)
- btrfs_discard_update_discardable(ctl->block_group);
- spin_unlock(&ctl->tree_lock);
-}
-
void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
@@ -3009,16 +3030,13 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
cond_resched_lock(&ctl->tree_lock);
}
- __btrfs_remove_free_space_cache_locked(ctl);
+ __btrfs_remove_free_space_cache(ctl);
btrfs_discard_update_discardable(block_group);
spin_unlock(&ctl->tree_lock);
}
-/**
- * btrfs_is_free_space_trimmed - see if everything is trimmed
- * @block_group: block_group of interest
- *
+/*
* Walk @block_group's free space rb_tree to determine if everything is trimmed.
*/
bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
@@ -3992,7 +4010,7 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group,
*trimmed = 0;
spin_lock(&block_group->lock);
- if (block_group->removed) {
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
return 0;
}
@@ -4022,7 +4040,7 @@ int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
*trimmed = 0;
spin_lock(&block_group->lock);
- if (block_group->removed) {
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
return 0;
}
@@ -4044,7 +4062,7 @@ int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
*trimmed = 0;
spin_lock(&block_group->lock);
- if (block_group->removed) {
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
return 0;
}
@@ -4119,6 +4137,31 @@ out:
return ret;
}
+int __init btrfs_free_space_init(void)
+{
+ btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
+ sizeof(struct btrfs_free_space), 0,
+ SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_free_space_cachep)
+ return -ENOMEM;
+
+ btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
+ PAGE_SIZE, PAGE_SIZE,
+ SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_free_space_bitmap_cachep) {
+ kmem_cache_destroy(btrfs_free_space_cachep);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+void __cold btrfs_free_space_exit(void)
+{
+ kmem_cache_destroy(btrfs_free_space_cachep);
+ kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
+}
+
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
* Use this if you need to make a bitmap or extent entry specifically, it
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 15591b299895..a855e0483e03 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -43,6 +43,17 @@ static inline bool btrfs_free_space_trimming_bitmap(
return (info->trim_state == BTRFS_TRIM_STATE_TRIMMING);
}
+/*
+ * Deltas are an effective way to populate global statistics. Give macro names
+ * to make it clear what we're doing. An example is discard_extents in
+ * btrfs_free_space_ctl.
+ */
+enum {
+ BTRFS_STAT_CURR,
+ BTRFS_STAT_PREV,
+ BTRFS_STAT_NR_ENTRIES,
+};
+
struct btrfs_free_space_ctl {
spinlock_t tree_lock;
struct rb_root free_space_offset;
@@ -79,6 +90,8 @@ struct btrfs_io_ctl {
int bitmaps;
};
+int __init btrfs_free_space_init(void);
+void __cold btrfs_free_space_exit(void);
struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
struct btrfs_path *path);
int create_free_space_inode(struct btrfs_trans_handle *trans,
@@ -113,7 +126,6 @@ int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
int btrfs_remove_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
-void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl);
void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group);
bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group);
u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 1bf89aa67216..c667e878ef1a 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -5,12 +5,17 @@
#include <linux/kernel.h>
#include <linux/sched/mm.h>
+#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "locking.h"
#include "free-space-tree.h"
#include "transaction.h"
#include "block-group.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "root-tree.h"
static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
@@ -803,7 +808,7 @@ int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
u32 flags;
int ret;
- if (block_group->needs_free_space) {
+ if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
ret = __add_block_group_free_space(trans, block_group, path);
if (ret)
return ret;
@@ -996,7 +1001,7 @@ int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
u32 flags;
int ret;
- if (block_group->needs_free_space) {
+ if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
ret = __add_block_group_free_space(trans, block_group, path);
if (ret)
return ret;
@@ -1299,7 +1304,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
{
int ret;
- block_group->needs_free_space = 0;
+ clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags);
ret = add_new_free_space_info(trans, block_group, path);
if (ret)
@@ -1321,7 +1326,7 @@ int add_block_group_free_space(struct btrfs_trans_handle *trans,
return 0;
mutex_lock(&block_group->free_space_lock);
- if (!block_group->needs_free_space)
+ if (!test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags))
goto out;
path = btrfs_alloc_path();
@@ -1354,7 +1359,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
return 0;
- if (block_group->needs_free_space) {
+ if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
/* We never added this block group to the free space tree. */
return 0;
}
@@ -1453,8 +1458,6 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
ASSERT(key.objectid < end && key.objectid + key.offset <= end);
- caching_ctl->progress = key.objectid;
-
offset = key.objectid;
while (offset < key.objectid + key.offset) {
bit = free_space_test_bit(block_group, path, offset);
@@ -1490,8 +1493,6 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
goto out;
}
- caching_ctl->progress = (u64)-1;
-
ret = 0;
out:
return ret;
@@ -1531,8 +1532,6 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
ASSERT(key.objectid < end && key.objectid + key.offset <= end);
- caching_ctl->progress = key.objectid;
-
total_found += add_new_free_space(block_group, key.objectid,
key.objectid + key.offset);
if (total_found > CACHING_CTL_WAKE_UP) {
@@ -1552,8 +1551,6 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
goto out;
}
- caching_ctl->progress = (u64)-1;
-
ret = 0;
out:
return ret;
diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c
new file mode 100644
index 000000000000..5553e1f8afe8
--- /dev/null
+++ b/fs/btrfs/fs.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "messages.h"
+#include "ctree.h"
+#include "fs.h"
+#include "accessors.h"
+
+void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
+ const char *name)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_incompat_flags(disk_super);
+ if (!(features & flag)) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_incompat_flags(disk_super);
+ if (!(features & flag)) {
+ features |= flag;
+ btrfs_set_super_incompat_flags(disk_super, features);
+ btrfs_info(fs_info,
+ "setting incompat feature flag for %s (0x%llx)",
+ name, flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
+
+void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
+ const char *name)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_incompat_flags(disk_super);
+ if (features & flag) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_incompat_flags(disk_super);
+ if (features & flag) {
+ features &= ~flag;
+ btrfs_set_super_incompat_flags(disk_super, features);
+ btrfs_info(fs_info,
+ "clearing incompat feature flag for %s (0x%llx)",
+ name, flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
+
+void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
+ const char *name)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (!(features & flag)) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (!(features & flag)) {
+ features |= flag;
+ btrfs_set_super_compat_ro_flags(disk_super, features);
+ btrfs_info(fs_info,
+ "setting compat-ro feature flag for %s (0x%llx)",
+ name, flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
+
+void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
+ const char *name)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (features & flag) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (features & flag) {
+ features &= ~flag;
+ btrfs_set_super_compat_ro_flags(disk_super, features);
+ btrfs_info(fs_info,
+ "clearing compat-ro feature flag for %s (0x%llx)",
+ name, flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
new file mode 100644
index 000000000000..a749367e5ae2
--- /dev/null
+++ b/fs/btrfs/fs.h
@@ -0,0 +1,976 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_FS_H
+#define BTRFS_FS_H
+
+#include <linux/fs.h>
+#include <linux/btrfs_tree.h>
+#include <linux/sizes.h>
+#include "extent-io-tree.h"
+#include "extent_map.h"
+#include "async-thread.h"
+#include "block-rsv.h"
+
+#define BTRFS_MAX_EXTENT_SIZE SZ_128M
+
+#define BTRFS_OLDEST_GENERATION 0ULL
+
+#define BTRFS_EMPTY_DIR_SIZE 0
+
+#define BTRFS_DIRTY_METADATA_THRESH SZ_32M
+
+#define BTRFS_SUPER_INFO_OFFSET SZ_64K
+#define BTRFS_SUPER_INFO_SIZE 4096
+static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
+
+/*
+ * The reserved space at the beginning of each device. It covers the primary
+ * super block and leaves space for potential use by other tools like
+ * bootloaders or to lower potential damage of accidental overwrite.
+ */
+#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M)
+/*
+ * Runtime (in-memory) states of filesystem
+ */
+enum {
+ /* Global indicator of serious filesystem errors */
+ BTRFS_FS_STATE_ERROR,
+ /*
+ * Filesystem is being remounted, allow to skip some operations, like
+ * defrag
+ */
+ BTRFS_FS_STATE_REMOUNTING,
+ /* Filesystem in RO mode */
+ BTRFS_FS_STATE_RO,
+ /* Track if a transaction abort has been reported on this filesystem */
+ BTRFS_FS_STATE_TRANS_ABORTED,
+ /*
+ * Bio operations should be blocked on this filesystem because a source
+ * or target device is being destroyed as part of a device replace
+ */
+ BTRFS_FS_STATE_DEV_REPLACING,
+ /* The btrfs_fs_info created for self-tests */
+ BTRFS_FS_STATE_DUMMY_FS_INFO,
+
+ BTRFS_FS_STATE_NO_CSUMS,
+
+ /* Indicates there was an error cleaning up a log tree. */
+ BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
+
+ BTRFS_FS_STATE_COUNT
+};
+
+enum {
+ BTRFS_FS_CLOSING_START,
+ BTRFS_FS_CLOSING_DONE,
+ BTRFS_FS_LOG_RECOVERING,
+ BTRFS_FS_OPEN,
+ BTRFS_FS_QUOTA_ENABLED,
+ BTRFS_FS_UPDATE_UUID_TREE_GEN,
+ BTRFS_FS_CREATING_FREE_SPACE_TREE,
+ BTRFS_FS_BTREE_ERR,
+ BTRFS_FS_LOG1_ERR,
+ BTRFS_FS_LOG2_ERR,
+ BTRFS_FS_QUOTA_OVERRIDE,
+ /* Used to record internally whether fs has been frozen */
+ BTRFS_FS_FROZEN,
+ /*
+ * Indicate that balance has been set up from the ioctl and is in the
+ * main phase. The fs_info::balance_ctl is initialized.
+ */
+ BTRFS_FS_BALANCE_RUNNING,
+
+ /*
+ * Indicate that relocation of a chunk has started, it's set per chunk
+ * and is toggled between chunks.
+ */
+ BTRFS_FS_RELOC_RUNNING,
+
+ /* Indicate that the cleaner thread is awake and doing something. */
+ BTRFS_FS_CLEANER_RUNNING,
+
+ /*
+ * The checksumming has an optimized version and is considered fast,
+ * so we don't need to offload checksums to workqueues.
+ */
+ BTRFS_FS_CSUM_IMPL_FAST,
+
+ /* Indicate that the discard workqueue can service discards. */
+ BTRFS_FS_DISCARD_RUNNING,
+
+ /* Indicate that we need to cleanup space cache v1 */
+ BTRFS_FS_CLEANUP_SPACE_CACHE_V1,
+
+ /* Indicate that we can't trust the free space tree for caching yet */
+ BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED,
+
+ /* Indicate whether there are any tree modification log users */
+ BTRFS_FS_TREE_MOD_LOG_USERS,
+
+ /* Indicate that we want the transaction kthread to commit right now. */
+ BTRFS_FS_COMMIT_TRANS,
+
+ /* Indicate we have half completed snapshot deletions pending. */
+ BTRFS_FS_UNFINISHED_DROPS,
+
+ /* Indicate we have to finish a zone to do next allocation. */
+ BTRFS_FS_NEED_ZONE_FINISH,
+
+ /* Indicate that we want to commit the transaction. */
+ BTRFS_FS_NEED_TRANS_COMMIT,
+
+#if BITS_PER_LONG == 32
+ /* Indicate if we have error/warn message printed on 32bit systems */
+ BTRFS_FS_32BIT_ERROR,
+ BTRFS_FS_32BIT_WARN,
+#endif
+};
+
+/*
+ * Flags for mount options.
+ *
+ * Note: don't forget to add new options to btrfs_show_options()
+ */
+enum {
+ BTRFS_MOUNT_NODATASUM = (1UL << 0),
+ BTRFS_MOUNT_NODATACOW = (1UL << 1),
+ BTRFS_MOUNT_NOBARRIER = (1UL << 2),
+ BTRFS_MOUNT_SSD = (1UL << 3),
+ BTRFS_MOUNT_DEGRADED = (1UL << 4),
+ BTRFS_MOUNT_COMPRESS = (1UL << 5),
+ BTRFS_MOUNT_NOTREELOG = (1UL << 6),
+ BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7),
+ BTRFS_MOUNT_SSD_SPREAD = (1UL << 8),
+ BTRFS_MOUNT_NOSSD = (1UL << 9),
+ BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10),
+ BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11),
+ BTRFS_MOUNT_SPACE_CACHE = (1UL << 12),
+ BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13),
+ BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14),
+ BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15),
+ BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16),
+ BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17),
+ BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18),
+ BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19),
+ BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20),
+ BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21),
+ BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22),
+ BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23),
+ BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24),
+ BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25),
+ BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26),
+ BTRFS_MOUNT_REF_VERIFY = (1UL << 27),
+ BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28),
+ BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29),
+ BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30),
+ BTRFS_MOUNT_NODISCARD = (1UL << 31),
+};
+
+/*
+ * Compat flags that we support. If any incompat flags are set other than the
+ * ones specified below then we will fail to mount
+ */
+#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
+#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL
+#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL
+
+#define BTRFS_FEATURE_COMPAT_RO_SUPP \
+ (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \
+ BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \
+ BTRFS_FEATURE_COMPAT_RO_VERITY | \
+ BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE)
+
+#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
+#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
+
+#ifdef CONFIG_BTRFS_DEBUG
+/*
+ * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG
+ */
+#define BTRFS_FEATURE_INCOMPAT_SUPP \
+ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
+ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
+ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
+ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \
+ BTRFS_FEATURE_INCOMPAT_RAID56 | \
+ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
+ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
+ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
+ BTRFS_FEATURE_INCOMPAT_ZONED | \
+ BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)
+#else
+#define BTRFS_FEATURE_INCOMPAT_SUPP \
+ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
+ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
+ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
+ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \
+ BTRFS_FEATURE_INCOMPAT_RAID56 | \
+ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
+ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
+ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
+ BTRFS_FEATURE_INCOMPAT_ZONED)
+#endif
+
+#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
+ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
+#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL
+
+#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
+#define BTRFS_DEFAULT_MAX_INLINE (2048)
+
+struct btrfs_dev_replace {
+ /* See #define above */
+ u64 replace_state;
+ /* Seconds since 1-Jan-1970 */
+ time64_t time_started;
+ /* Seconds since 1-Jan-1970 */
+ time64_t time_stopped;
+ atomic64_t num_write_errors;
+ atomic64_t num_uncorrectable_read_errors;
+
+ u64 cursor_left;
+ u64 committed_cursor_left;
+ u64 cursor_left_last_write_of_item;
+ u64 cursor_right;
+
+ /* See #define above */
+ u64 cont_reading_from_srcdev_mode;
+
+ int is_valid;
+ int item_needs_writeback;
+ struct btrfs_device *srcdev;
+ struct btrfs_device *tgtdev;
+
+ struct mutex lock_finishing_cancel_unmount;
+ struct rw_semaphore rwsem;
+
+ struct btrfs_scrub_progress scrub_progress;
+
+ struct percpu_counter bio_counter;
+ wait_queue_head_t replace_wait;
+};
+
+/*
+ * Free clusters are used to claim free space in relatively large chunks,
+ * allowing us to do less seeky writes. They are used for all metadata
+ * allocations. In ssd_spread mode they are also used for data allocations.
+ */
+struct btrfs_free_cluster {
+ spinlock_t lock;
+ spinlock_t refill_lock;
+ struct rb_root root;
+
+ /* Largest extent in this cluster */
+ u64 max_size;
+
+ /* First extent starting offset */
+ u64 window_start;
+
+ /* We did a full search and couldn't create a cluster */
+ bool fragmented;
+
+ struct btrfs_block_group *block_group;
+ /*
+ * When a cluster is allocated from a block group, we put the cluster
+ * onto a list in the block group so that it can be freed before the
+ * block group is freed.
+ */
+ struct list_head block_group_list;
+};
+
+/* Discard control. */
+/*
+ * Async discard uses multiple lists to differentiate the discard filter
+ * parameters. Index 0 is for completely free block groups where we need to
+ * ensure the entire block group is trimmed without being lossy. Indices
+ * afterwards represent monotonically decreasing discard filter sizes to
+ * prioritize what should be discarded next.
+ */
+#define BTRFS_NR_DISCARD_LISTS 3
+#define BTRFS_DISCARD_INDEX_UNUSED 0
+#define BTRFS_DISCARD_INDEX_START 1
+
+struct btrfs_discard_ctl {
+ struct workqueue_struct *discard_workers;
+ struct delayed_work work;
+ spinlock_t lock;
+ struct btrfs_block_group *block_group;
+ struct list_head discard_list[BTRFS_NR_DISCARD_LISTS];
+ u64 prev_discard;
+ u64 prev_discard_time;
+ atomic_t discardable_extents;
+ atomic64_t discardable_bytes;
+ u64 max_discard_size;
+ u64 delay_ms;
+ u32 iops_limit;
+ u32 kbps_limit;
+ u64 discard_extent_bytes;
+ u64 discard_bitmap_bytes;
+ atomic64_t discard_bytes_saved;
+};
+
+/*
+ * Exclusive operations (device replace, resize, device add/remove, balance)
+ */
+enum btrfs_exclusive_operation {
+ BTRFS_EXCLOP_NONE,
+ BTRFS_EXCLOP_BALANCE_PAUSED,
+ BTRFS_EXCLOP_BALANCE,
+ BTRFS_EXCLOP_DEV_ADD,
+ BTRFS_EXCLOP_DEV_REMOVE,
+ BTRFS_EXCLOP_DEV_REPLACE,
+ BTRFS_EXCLOP_RESIZE,
+ BTRFS_EXCLOP_SWAP_ACTIVATE,
+};
+
+/* Store data about transaction commits, exported via sysfs. */
+struct btrfs_commit_stats {
+ /* Total number of commits */
+ u64 commit_count;
+ /* The maximum commit duration so far in ns */
+ u64 max_commit_dur;
+ /* The last commit duration in ns */
+ u64 last_commit_dur;
+ /* The total commit duration in ns */
+ u64 total_commit_dur;
+};
+
+struct btrfs_fs_info {
+ u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
+ unsigned long flags;
+ struct btrfs_root *tree_root;
+ struct btrfs_root *chunk_root;
+ struct btrfs_root *dev_root;
+ struct btrfs_root *fs_root;
+ struct btrfs_root *quota_root;
+ struct btrfs_root *uuid_root;
+ struct btrfs_root *data_reloc_root;
+ struct btrfs_root *block_group_root;
+
+ /* The log root tree is a directory of all the other log roots */
+ struct btrfs_root *log_root_tree;
+
+ /* The tree that holds the global roots (csum, extent, etc) */
+ rwlock_t global_root_lock;
+ struct rb_root global_root_tree;
+
+ spinlock_t fs_roots_radix_lock;
+ struct radix_tree_root fs_roots_radix;
+
+ /* Block group cache stuff */
+ rwlock_t block_group_cache_lock;
+ struct rb_root_cached block_group_cache_tree;
+
+ /* Keep track of unallocated space */
+ atomic64_t free_chunk_space;
+
+ /* Track ranges which are used by log trees blocks/logged data extents */
+ struct extent_io_tree excluded_extents;
+
+ /* logical->physical extent mapping */
+ struct extent_map_tree mapping_tree;
+
+ /*
+ * Block reservation for extent, checksum, root tree and delayed dir
+ * index item.
+ */
+ struct btrfs_block_rsv global_block_rsv;
+ /* Block reservation for metadata operations */
+ struct btrfs_block_rsv trans_block_rsv;
+ /* Block reservation for chunk tree */
+ struct btrfs_block_rsv chunk_block_rsv;
+ /* Block reservation for delayed operations */
+ struct btrfs_block_rsv delayed_block_rsv;
+ /* Block reservation for delayed refs */
+ struct btrfs_block_rsv delayed_refs_rsv;
+
+ struct btrfs_block_rsv empty_block_rsv;
+
+ u64 generation;
+ u64 last_trans_committed;
+ /*
+ * Generation of the last transaction used for block group relocation
+ * since the filesystem was last mounted (or 0 if none happened yet).
+ * Must be written and read while holding btrfs_fs_info::commit_root_sem.
+ */
+ u64 last_reloc_trans;
+ u64 avg_delayed_ref_runtime;
+
+ /*
+ * This is updated to the current trans every time a full commit is
+ * required instead of the faster short fsync log commits
+ */
+ u64 last_trans_log_full_commit;
+ unsigned long mount_opt;
+
+ unsigned long compress_type:4;
+ unsigned int compress_level;
+ u32 commit_interval;
+ /*
+ * It is a suggestive number, the read side is safe even it gets a
+ * wrong number because we will write out the data into a regular
+ * extent. The write side(mount/remount) is under ->s_umount lock,
+ * so it is also safe.
+ */
+ u64 max_inline;
+
+ struct btrfs_transaction *running_transaction;
+ wait_queue_head_t transaction_throttle;
+ wait_queue_head_t transaction_wait;
+ wait_queue_head_t transaction_blocked_wait;
+ wait_queue_head_t async_submit_wait;
+
+ /*
+ * Used to protect the incompat_flags, compat_flags, compat_ro_flags
+ * when they are updated.
+ *
+ * Because we do not clear the flags for ever, so we needn't use
+ * the lock on the read side.
+ *
+ * We also needn't use the lock when we mount the fs, because
+ * there is no other task which will update the flag.
+ */
+ spinlock_t super_lock;
+ struct btrfs_super_block *super_copy;
+ struct btrfs_super_block *super_for_commit;
+ struct super_block *sb;
+ struct inode *btree_inode;
+ struct mutex tree_log_mutex;
+ struct mutex transaction_kthread_mutex;
+ struct mutex cleaner_mutex;
+ struct mutex chunk_mutex;
+
+ /*
+ * This is taken to make sure we don't set block groups ro after the
+ * free space cache has been allocated on them.
+ */
+ struct mutex ro_block_group_mutex;
+
+ /*
+ * This is used during read/modify/write to make sure no two ios are
+ * trying to mod the same stripe at the same time.
+ */
+ struct btrfs_stripe_hash_table *stripe_hash_table;
+
+ /*
+ * This protects the ordered operations list only while we are
+ * processing all of the entries on it. This way we make sure the
+ * commit code doesn't find the list temporarily empty because another
+ * function happens to be doing non-waiting preflush before jumping
+ * into the main commit.
+ */
+ struct mutex ordered_operations_mutex;
+
+ struct rw_semaphore commit_root_sem;
+
+ struct rw_semaphore cleanup_work_sem;
+
+ struct rw_semaphore subvol_sem;
+
+ spinlock_t trans_lock;
+ /*
+ * The reloc mutex goes with the trans lock, it is taken during commit
+ * to protect us from the relocation code.
+ */
+ struct mutex reloc_mutex;
+
+ struct list_head trans_list;
+ struct list_head dead_roots;
+ struct list_head caching_block_groups;
+
+ spinlock_t delayed_iput_lock;
+ struct list_head delayed_iputs;
+ atomic_t nr_delayed_iputs;
+ wait_queue_head_t delayed_iputs_wait;
+
+ atomic64_t tree_mod_seq;
+
+ /* This protects tree_mod_log and tree_mod_seq_list */
+ rwlock_t tree_mod_log_lock;
+ struct rb_root tree_mod_log;
+ struct list_head tree_mod_seq_list;
+
+ atomic_t async_delalloc_pages;
+
+ /* This is used to protect the following list -- ordered_roots. */
+ spinlock_t ordered_root_lock;
+
+ /*
+ * All fs/file tree roots in which there are data=ordered extents
+ * pending writeback are added into this list.
+ *
+ * These can span multiple transactions and basically include every
+ * dirty data page that isn't from nodatacow.
+ */
+ struct list_head ordered_roots;
+
+ struct mutex delalloc_root_mutex;
+ spinlock_t delalloc_root_lock;
+ /* All fs/file tree roots that have delalloc inodes. */
+ struct list_head delalloc_roots;
+
+ /*
+ * There is a pool of worker threads for checksumming during writes and
+ * a pool for checksumming after reads. This is because readers can
+ * run with FS locks held, and the writers may be waiting for those
+ * locks. We don't want ordering in the pending list to cause
+ * deadlocks, and so the two are serviced separately.
+ *
+ * A third pool does submit_bio to avoid deadlocking with the other two.
+ */
+ struct btrfs_workqueue *workers;
+ struct btrfs_workqueue *hipri_workers;
+ struct btrfs_workqueue *delalloc_workers;
+ struct btrfs_workqueue *flush_workers;
+ struct workqueue_struct *endio_workers;
+ struct workqueue_struct *endio_meta_workers;
+ struct workqueue_struct *rmw_workers;
+ struct workqueue_struct *compressed_write_workers;
+ struct btrfs_workqueue *endio_write_workers;
+ struct btrfs_workqueue *endio_freespace_worker;
+ struct btrfs_workqueue *caching_workers;
+
+ /*
+ * Fixup workers take dirty pages that didn't properly go through the
+ * cow mechanism and make them safe to write. It happens for the
+ * sys_munmap function call path.
+ */
+ struct btrfs_workqueue *fixup_workers;
+ struct btrfs_workqueue *delayed_workers;
+
+ struct task_struct *transaction_kthread;
+ struct task_struct *cleaner_kthread;
+ u32 thread_pool_size;
+
+ struct kobject *space_info_kobj;
+ struct kobject *qgroups_kobj;
+ struct kobject *discard_kobj;
+
+ /* Used to keep from writing metadata until there is a nice batch */
+ struct percpu_counter dirty_metadata_bytes;
+ struct percpu_counter delalloc_bytes;
+ struct percpu_counter ordered_bytes;
+ s32 dirty_metadata_batch;
+ s32 delalloc_batch;
+
+ struct list_head dirty_cowonly_roots;
+
+ struct btrfs_fs_devices *fs_devices;
+
+ /*
+ * The space_info list is effectively read only after initial setup.
+ * It is populated at mount time and cleaned up after all block groups
+ * are removed. RCU is used to protect it.
+ */
+ struct list_head space_info;
+
+ struct btrfs_space_info *data_sinfo;
+
+ struct reloc_control *reloc_ctl;
+
+ /* data_alloc_cluster is only used in ssd_spread mode */
+ struct btrfs_free_cluster data_alloc_cluster;
+
+ /* All metadata allocations go through this cluster. */
+ struct btrfs_free_cluster meta_alloc_cluster;
+
+ /* Auto defrag inodes go here. */
+ spinlock_t defrag_inodes_lock;
+ struct rb_root defrag_inodes;
+ atomic_t defrag_running;
+
+ /* Used to protect avail_{data, metadata, system}_alloc_bits */
+ seqlock_t profiles_lock;
+ /*
+ * These three are in extended format (availability of single chunks is
+ * denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other types are denoted
+ * by corresponding BTRFS_BLOCK_GROUP_* bits)
+ */
+ u64 avail_data_alloc_bits;
+ u64 avail_metadata_alloc_bits;
+ u64 avail_system_alloc_bits;
+
+ /* Balance state */
+ spinlock_t balance_lock;
+ struct mutex balance_mutex;
+ atomic_t balance_pause_req;
+ atomic_t balance_cancel_req;
+ struct btrfs_balance_control *balance_ctl;
+ wait_queue_head_t balance_wait_q;
+
+ /* Cancellation requests for chunk relocation */
+ atomic_t reloc_cancel_req;
+
+ u32 data_chunk_allocations;
+ u32 metadata_ratio;
+
+ void *bdev_holder;
+
+ /* Private scrub information */
+ struct mutex scrub_lock;
+ atomic_t scrubs_running;
+ atomic_t scrub_pause_req;
+ atomic_t scrubs_paused;
+ atomic_t scrub_cancel_req;
+ wait_queue_head_t scrub_pause_wait;
+ /*
+ * The worker pointers are NULL iff the refcount is 0, ie. scrub is not
+ * running.
+ */
+ refcount_t scrub_workers_refcnt;
+ struct workqueue_struct *scrub_workers;
+ struct workqueue_struct *scrub_wr_completion_workers;
+ struct workqueue_struct *scrub_parity_workers;
+ struct btrfs_subpage_info *subpage_info;
+
+ struct btrfs_discard_ctl discard_ctl;
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ u32 check_integrity_print_mask;
+#endif
+ /* Is qgroup tracking in a consistent state? */
+ u64 qgroup_flags;
+
+ /* Holds configuration and tracking. Protected by qgroup_lock. */
+ struct rb_root qgroup_tree;
+ spinlock_t qgroup_lock;
+
+ /*
+ * Used to avoid frequently calling ulist_alloc()/ulist_free()
+ * when doing qgroup accounting, it must be protected by qgroup_lock.
+ */
+ struct ulist *qgroup_ulist;
+
+ /*
+ * Protect user change for quota operations. If a transaction is needed,
+ * it must be started before locking this lock.
+ */
+ struct mutex qgroup_ioctl_lock;
+
+ /* List of dirty qgroups to be written at next commit. */
+ struct list_head dirty_qgroups;
+
+ /* Used by qgroup for an efficient tree traversal. */
+ u64 qgroup_seq;
+
+ /* Qgroup rescan items. */
+ /* Protects the progress item */
+ struct mutex qgroup_rescan_lock;
+ struct btrfs_key qgroup_rescan_progress;
+ struct btrfs_workqueue *qgroup_rescan_workers;
+ struct completion qgroup_rescan_completion;
+ struct btrfs_work qgroup_rescan_work;
+ /* Protected by qgroup_rescan_lock */
+ bool qgroup_rescan_running;
+ u8 qgroup_drop_subtree_thres;
+
+ /* Filesystem state */
+ unsigned long fs_state;
+
+ struct btrfs_delayed_root *delayed_root;
+
+ /* Extent buffer radix tree */
+ spinlock_t buffer_lock;
+ /* Entries are eb->start / sectorsize */
+ struct radix_tree_root buffer_radix;
+
+ /* Next backup root to be overwritten */
+ int backup_root_index;
+
+ /* Device replace state */
+ struct btrfs_dev_replace dev_replace;
+
+ struct semaphore uuid_tree_rescan_sem;
+
+ /* Used to reclaim the metadata space in the background. */
+ struct work_struct async_reclaim_work;
+ struct work_struct async_data_reclaim_work;
+ struct work_struct preempt_reclaim_work;
+
+ /* Reclaim partially filled block groups in the background */
+ struct work_struct reclaim_bgs_work;
+ struct list_head reclaim_bgs;
+ int bg_reclaim_threshold;
+
+ spinlock_t unused_bgs_lock;
+ struct list_head unused_bgs;
+ struct mutex unused_bg_unpin_mutex;
+ /* Protect block groups that are going to be deleted */
+ struct mutex reclaim_bgs_lock;
+
+ /* Cached block sizes */
+ u32 nodesize;
+ u32 sectorsize;
+ /* ilog2 of sectorsize, use to avoid 64bit division */
+ u32 sectorsize_bits;
+ u32 csum_size;
+ u32 csums_per_leaf;
+ u32 stripesize;
+
+ /*
+ * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular
+ * filesystem, on zoned it depends on the device constraints.
+ */
+ u64 max_extent_size;
+
+ /* Block groups and devices containing active swapfiles. */
+ spinlock_t swapfile_pins_lock;
+ struct rb_root swapfile_pins;
+
+ struct crypto_shash *csum_shash;
+
+ /* Type of exclusive operation running, protected by super_lock */
+ enum btrfs_exclusive_operation exclusive_operation;
+
+ /*
+ * Zone size > 0 when in ZONED mode, otherwise it's used for a check
+ * if the mode is enabled
+ */
+ u64 zone_size;
+
+ /* Max size to emit ZONE_APPEND write command */
+ u64 max_zone_append_size;
+ struct mutex zoned_meta_io_lock;
+ spinlock_t treelog_bg_lock;
+ u64 treelog_bg;
+
+ /*
+ * Start of the dedicated data relocation block group, protected by
+ * relocation_bg_lock.
+ */
+ spinlock_t relocation_bg_lock;
+ u64 data_reloc_bg;
+ struct mutex zoned_data_reloc_io_lock;
+
+ u64 nr_global_roots;
+
+ spinlock_t zone_active_bgs_lock;
+ struct list_head zone_active_bgs;
+
+ /* Updates are not protected by any lock */
+ struct btrfs_commit_stats commit_stats;
+
+ /*
+ * Last generation where we dropped a non-relocation root.
+ * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen()
+ * to change it and to read it, respectively.
+ */
+ u64 last_root_drop_gen;
+
+ /*
+ * Annotations for transaction events (structures are empty when
+ * compiled without lockdep).
+ */
+ struct lockdep_map btrfs_trans_num_writers_map;
+ struct lockdep_map btrfs_trans_num_extwriters_map;
+ struct lockdep_map btrfs_state_change_map[4];
+ struct lockdep_map btrfs_trans_pending_ordered_map;
+ struct lockdep_map btrfs_ordered_extent_map;
+
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ spinlock_t ref_verify_lock;
+ struct rb_root block_tree;
+#endif
+
+#ifdef CONFIG_BTRFS_DEBUG
+ struct kobject *debug_kobj;
+ struct list_head allocated_roots;
+
+ spinlock_t eb_leak_lock;
+ struct list_head allocated_ebs;
+#endif
+};
+
+static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info,
+ u64 gen)
+{
+ WRITE_ONCE(fs_info->last_root_drop_gen, gen);
+}
+
+static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info)
+{
+ return READ_ONCE(fs_info->last_root_drop_gen);
+}
+
+/*
+ * Take the number of bytes to be checksummed and figure out how many leaves
+ * it would require to store the csums for that many bytes.
+ */
+static inline u64 btrfs_csum_bytes_to_leaves(
+ const struct btrfs_fs_info *fs_info, u64 csum_bytes)
+{
+ const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits;
+
+ return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf);
+}
+
+/*
+ * Use this if we would be adding new items, as we could split nodes as we cow
+ * down the tree.
+ */
+static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info,
+ unsigned num_items)
+{
+ return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
+}
+
+/*
+ * Doing a truncate or a modification won't result in new nodes or leaves, just
+ * what we need for COW.
+ */
+static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info,
+ unsigned num_items)
+{
+ return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
+}
+
+#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
+ sizeof(struct btrfs_item))
+
+static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
+{
+ return fs_info->zone_size > 0;
+}
+
+/*
+ * Count how many fs_info->max_extent_size cover the @size
+ */
+static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size)
+{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (!fs_info)
+ return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
+#endif
+
+ return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size);
+}
+
+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type);
+bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type);
+void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
+void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation op);
+
+/* Compatibility and incompatibility defines */
+void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
+ const char *name);
+void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
+ const char *name);
+void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
+ const char *name);
+void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
+ const char *name);
+
+#define __btrfs_fs_incompat(fs_info, flags) \
+ (!!(btrfs_super_incompat_flags((fs_info)->super_copy) & (flags)))
+
+#define __btrfs_fs_compat_ro(fs_info, flags) \
+ (!!(btrfs_super_compat_ro_flags((fs_info)->super_copy) & (flags)))
+
+#define btrfs_set_fs_incompat(__fs_info, opt) \
+ __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt)
+
+#define btrfs_clear_fs_incompat(__fs_info, opt) \
+ __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt)
+
+#define btrfs_fs_incompat(fs_info, opt) \
+ __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+
+#define btrfs_set_fs_compat_ro(__fs_info, opt) \
+ __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt)
+
+#define btrfs_clear_fs_compat_ro(__fs_info, opt) \
+ __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt)
+
+#define btrfs_fs_compat_ro(fs_info, opt) \
+ __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+
+#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
+#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
+#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt)
+#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \
+ BTRFS_MOUNT_##opt)
+
+#define btrfs_set_and_info(fs_info, opt, fmt, args...) \
+do { \
+ if (!btrfs_test_opt(fs_info, opt)) \
+ btrfs_info(fs_info, fmt, ##args); \
+ btrfs_set_opt(fs_info->mount_opt, opt); \
+} while (0)
+
+#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \
+do { \
+ if (btrfs_test_opt(fs_info, opt)) \
+ btrfs_info(fs_info, fmt, ##args); \
+ btrfs_clear_opt(fs_info->mount_opt, opt); \
+} while (0)
+
+static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
+{
+ /* Do it this way so we only ever do one test_bit in the normal case. */
+ if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) {
+ if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags))
+ return 2;
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * If we remount the fs to be R/O or umount the fs, the cleaner needn't do
+ * anything except sleeping. This function is used to check the status of
+ * the fs.
+ * We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount,
+ * since setting and checking for SB_RDONLY in the superblock's flags is not
+ * atomic.
+ */
+static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info)
+{
+ return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) ||
+ btrfs_fs_closing(fs_info);
+}
+
+static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
+{
+ clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
+}
+
+#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \
+ &(fs_info)->fs_state)))
+#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \
+ (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \
+ &(fs_info)->fs_state)))
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+
+#define EXPORT_FOR_TESTS
+
+static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
+{
+ return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+}
+
+void btrfs_test_destroy_inode(struct inode *inode);
+
+#else
+
+#define EXPORT_FOR_TESTS static
+
+static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
+{
+ return 0;
+}
+#endif
+
+#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 0eeb5ea87894..b65c45b5d681 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -4,14 +4,20 @@
*/
#include "ctree.h"
+#include "fs.h"
+#include "messages.h"
#include "inode-item.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
+#include "space-info.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "file-item.h"
struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
- int slot, const char *name,
- int name_len)
+ int slot,
+ const struct fscrypt_str *name)
{
struct btrfs_inode_ref *ref;
unsigned long ptr;
@@ -27,9 +33,10 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
len = btrfs_inode_ref_name_len(leaf, ref);
name_ptr = (unsigned long)(ref + 1);
cur_offset += len + sizeof(*ref);
- if (len != name_len)
+ if (len != name->len)
continue;
- if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
+ if (memcmp_extent_buffer(leaf, name->name, name_ptr,
+ name->len) == 0)
return ref;
}
return NULL;
@@ -37,7 +44,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
struct extent_buffer *leaf, int slot, u64 ref_objectid,
- const char *name, int name_len)
+ const struct fscrypt_str *name)
{
struct btrfs_inode_extref *extref;
unsigned long ptr;
@@ -60,9 +67,10 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
name_ptr = (unsigned long)(&extref->name);
ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
- if (ref_name_len == name_len &&
+ if (ref_name_len == name->len &&
btrfs_inode_extref_parent(leaf, extref) == ref_objectid &&
- (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0))
+ (memcmp_extent_buffer(leaf, name->name, name_ptr,
+ name->len) == 0))
return extref;
cur_offset += ref_name_len + sizeof(*extref);
@@ -75,7 +83,7 @@ struct btrfs_inode_extref *
btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- const char *name, int name_len,
+ const struct fscrypt_str *name,
u64 inode_objectid, u64 ref_objectid, int ins_len,
int cow)
{
@@ -84,7 +92,7 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
key.objectid = inode_objectid;
key.type = BTRFS_INODE_EXTREF_KEY;
- key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+ key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len);
ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
if (ret < 0)
@@ -92,13 +100,13 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
if (ret > 0)
return NULL;
return btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
- ref_objectid, name, name_len);
+ ref_objectid, name);
}
static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- const char *name, int name_len,
+ const struct fscrypt_str *name,
u64 inode_objectid, u64 ref_objectid,
u64 *index)
{
@@ -107,14 +115,14 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
struct btrfs_inode_extref *extref;
struct extent_buffer *leaf;
int ret;
- int del_len = name_len + sizeof(*extref);
+ int del_len = name->len + sizeof(*extref);
unsigned long ptr;
unsigned long item_start;
u32 item_size;
key.objectid = inode_objectid;
key.type = BTRFS_INODE_EXTREF_KEY;
- key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+ key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len);
path = btrfs_alloc_path();
if (!path)
@@ -132,7 +140,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
* readonly.
*/
extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
- ref_objectid, name, name_len);
+ ref_objectid, name);
if (!extref) {
btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL);
ret = -EROFS;
@@ -168,8 +176,7 @@ out:
}
int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
+ struct btrfs_root *root, const struct fscrypt_str *name,
u64 inode_objectid, u64 ref_objectid, u64 *index)
{
struct btrfs_path *path;
@@ -182,7 +189,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
u32 sub_item_len;
int ret;
int search_ext_refs = 0;
- int del_len = name_len + sizeof(*ref);
+ int del_len = name->len + sizeof(*ref);
key.objectid = inode_objectid;
key.offset = ref_objectid;
@@ -201,8 +208,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
goto out;
}
- ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name,
- name_len);
+ ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name);
if (!ref) {
ret = -ENOENT;
search_ext_refs = 1;
@@ -219,7 +225,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
goto out;
}
ptr = (unsigned long)ref;
- sub_item_len = name_len + sizeof(*ref);
+ sub_item_len = name->len + sizeof(*ref);
item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_size - (ptr + sub_item_len - item_start));
@@ -233,7 +239,7 @@ out:
* name in our ref array. Find and remove the extended
* inode ref then.
*/
- return btrfs_del_inode_extref(trans, root, name, name_len,
+ return btrfs_del_inode_extref(trans, root, name,
inode_objectid, ref_objectid, index);
}
@@ -247,12 +253,13 @@ out:
*/
static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, u64 index)
+ const struct fscrypt_str *name,
+ u64 inode_objectid, u64 ref_objectid,
+ u64 index)
{
struct btrfs_inode_extref *extref;
int ret;
- int ins_len = name_len + sizeof(*extref);
+ int ins_len = name->len + sizeof(*extref);
unsigned long ptr;
struct btrfs_path *path;
struct btrfs_key key;
@@ -260,7 +267,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
key.objectid = inode_objectid;
key.type = BTRFS_INODE_EXTREF_KEY;
- key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+ key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len);
path = btrfs_alloc_path();
if (!path)
@@ -272,7 +279,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
if (btrfs_find_name_in_ext_backref(path->nodes[0],
path->slots[0],
ref_objectid,
- name, name_len))
+ name))
goto out;
btrfs_extend_item(path, ins_len);
@@ -286,12 +293,12 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
ptr += btrfs_item_size(leaf, path->slots[0]) - ins_len;
extref = (struct btrfs_inode_extref *)ptr;
- btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
+ btrfs_set_inode_extref_name_len(path->nodes[0], extref, name->len);
btrfs_set_inode_extref_index(path->nodes[0], extref, index);
btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid);
ptr = (unsigned long)&extref->name;
- write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ write_extent_buffer(path->nodes[0], name->name, ptr, name->len);
btrfs_mark_buffer_dirty(path->nodes[0]);
out:
@@ -301,8 +308,7 @@ out:
/* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */
int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
+ struct btrfs_root *root, const struct fscrypt_str *name,
u64 inode_objectid, u64 ref_objectid, u64 index)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -311,7 +317,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_inode_ref *ref;
unsigned long ptr;
int ret;
- int ins_len = name_len + sizeof(*ref);
+ int ins_len = name->len + sizeof(*ref);
key.objectid = inode_objectid;
key.offset = ref_objectid;
@@ -327,7 +333,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
if (ret == -EEXIST) {
u32 old_size;
ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
- name, name_len);
+ name);
if (ref)
goto out;
@@ -336,7 +342,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_ref);
ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
- btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, name->len);
btrfs_set_inode_ref_index(path->nodes[0], ref, index);
ptr = (unsigned long)(ref + 1);
ret = 0;
@@ -344,7 +350,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
if (ret == -EOVERFLOW) {
if (btrfs_find_name_in_backref(path->nodes[0],
path->slots[0],
- name, name_len))
+ name))
ret = -EEXIST;
else
ret = -EMLINK;
@@ -353,11 +359,11 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
} else {
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_ref);
- btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, name->len);
btrfs_set_inode_ref_index(path->nodes[0], ref, index);
ptr = (unsigned long)(ref + 1);
}
- write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ write_extent_buffer(path->nodes[0], name->name, ptr, name->len);
btrfs_mark_buffer_dirty(path->nodes[0]);
out:
@@ -370,7 +376,6 @@ out:
if (btrfs_super_incompat_flags(disk_super)
& BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
ret = btrfs_insert_inode_extref(trans, root, name,
- name_len,
inode_objectid,
ref_objectid, index);
}
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
index a8fc16d0147f..b80aeb715701 100644
--- a/fs/btrfs/inode-item.h
+++ b/fs/btrfs/inode-item.h
@@ -64,33 +64,31 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_truncate_control *control);
int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
+ struct btrfs_root *root, const struct fscrypt_str *name,
u64 inode_objectid, u64 ref_objectid, u64 index);
int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, u64 *index);
+ struct btrfs_root *root, const struct fscrypt_str *name,
+ u64 inode_objectid, u64 ref_objectid, u64 *index);
int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid);
-int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path,
+int btrfs_lookup_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *location, int mod);
struct btrfs_inode_extref *btrfs_lookup_inode_extref(
struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- const char *name, int name_len,
+ const struct fscrypt_str *name,
u64 inode_objectid, u64 ref_objectid, int ins_len,
int cow);
struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
- int slot, const char *name,
- int name_len);
+ int slot,
+ const struct fscrypt_str *name);
struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
struct extent_buffer *leaf, int slot, u64 ref_objectid,
- const char *name, int name_len);
+ const struct fscrypt_str *name);
#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1372210869b1..8bcad9940154 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -43,7 +43,7 @@
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
-#include "volumes.h"
+#include "bio.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
@@ -55,6 +55,21 @@
#include "zoned.h"
#include "subpage.h"
#include "inode-item.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "root-tree.h"
+#include "defrag.h"
+#include "dir-item.h"
+#include "file-item.h"
+#include "uuid-tree.h"
+#include "ioctl.h"
+#include "file.h"
+#include "acl.h"
+#include "relocation.h"
+#include "verity.h"
+#include "super.h"
+#include "orphan.h"
struct btrfs_iget_args {
u64 ino;
@@ -69,7 +84,7 @@ struct btrfs_dio_data {
};
struct btrfs_dio_private {
- struct inode *inode;
+ struct btrfs_inode *inode;
/*
* Since DIO can use anonymous page, we cannot use page_offset() to
@@ -107,13 +122,9 @@ static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;
static struct kmem_cache *btrfs_inode_cachep;
-struct kmem_cache *btrfs_trans_handle_cachep;
-struct kmem_cache *btrfs_path_cachep;
-struct kmem_cache *btrfs_free_space_cachep;
-struct kmem_cache *btrfs_free_space_bitmap_cachep;
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
-static int btrfs_truncate(struct inode *inode, bool skip_writeback);
+static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback);
static noinline int cow_file_range(struct btrfs_inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
@@ -125,6 +136,32 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
u64 ram_bytes, int compress_type,
int type);
+static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
+ u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
+{
+ struct btrfs_root *root = inode->root;
+ const u32 csum_size = root->fs_info->csum_size;
+
+ /* Output without objectid, which is more meaningful */
+ if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) {
+ btrfs_warn_rl(root->fs_info,
+"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
+ root->root_key.objectid, btrfs_ino(inode),
+ logical_start,
+ CSUM_FMT_VALUE(csum_size, csum),
+ CSUM_FMT_VALUE(csum_size, csum_expected),
+ mirror_num);
+ } else {
+ btrfs_warn_rl(root->fs_info,
+"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
+ root->root_key.objectid, btrfs_ino(inode),
+ logical_start,
+ CSUM_FMT_VALUE(csum_size, csum),
+ CSUM_FMT_VALUE(csum_size, csum_expected),
+ mirror_num);
+ }
+}
+
/*
* btrfs_inode_lock - lock inode i_rwsem based on arguments passed
*
@@ -135,27 +172,27 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
* return -EAGAIN
* BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
*/
-int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
+int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
{
if (ilock_flags & BTRFS_ILOCK_SHARED) {
if (ilock_flags & BTRFS_ILOCK_TRY) {
- if (!inode_trylock_shared(inode))
+ if (!inode_trylock_shared(&inode->vfs_inode))
return -EAGAIN;
else
return 0;
}
- inode_lock_shared(inode);
+ inode_lock_shared(&inode->vfs_inode);
} else {
if (ilock_flags & BTRFS_ILOCK_TRY) {
- if (!inode_trylock(inode))
+ if (!inode_trylock(&inode->vfs_inode))
return -EAGAIN;
else
return 0;
}
- inode_lock(inode);
+ inode_lock(&inode->vfs_inode);
}
if (ilock_flags & BTRFS_ILOCK_MMAP)
- down_write(&BTRFS_I(inode)->i_mmap_lock);
+ down_write(&inode->i_mmap_lock);
return 0;
}
@@ -165,14 +202,14 @@ int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
* ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
* to decide whether the lock acquired is shared or exclusive.
*/
-void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags)
+void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
{
if (ilock_flags & BTRFS_ILOCK_MMAP)
- up_write(&BTRFS_I(inode)->i_mmap_lock);
+ up_write(&inode->i_mmap_lock);
if (ilock_flags & BTRFS_ILOCK_SHARED)
- inode_unlock_shared(inode);
+ inode_unlock_shared(&inode->vfs_inode);
else
- inode_unlock(inode);
+ inode_unlock(&inode->vfs_inode);
}
/*
@@ -249,7 +286,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
}
-static int btrfs_dirty_inode(struct inode *inode);
+static int btrfs_dirty_inode(struct btrfs_inode *inode);
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
struct btrfs_new_inode_args *args)
@@ -483,7 +520,7 @@ struct async_extent {
};
struct async_chunk {
- struct inode *inode;
+ struct btrfs_inode *inode;
struct page *locked_page;
u64 start;
u64 end;
@@ -611,8 +648,8 @@ static inline void inode_should_defrag(struct btrfs_inode *inode,
*/
static noinline int compress_file_range(struct async_chunk *async_chunk)
{
- struct inode *inode = async_chunk->inode;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_inode *inode = async_chunk->inode;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 blocksize = fs_info->sectorsize;
u64 start = async_chunk->start;
u64 end = async_chunk->end;
@@ -629,8 +666,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
int compressed_extents = 0;
int redirty = 0;
- inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
- SZ_16K);
+ inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
/*
* We need to save i_size before now because it could change in between
@@ -642,7 +678,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
* does that for us.
*/
barrier();
- i_size = i_size_read(inode);
+ i_size = i_size_read(&inode->vfs_inode);
barrier();
actual_end = min_t(u64, i_size, end + 1);
again:
@@ -671,7 +707,7 @@ again:
* isn't an inline extent, since it doesn't save disk space at all.
*/
if (total_compressed <= blocksize &&
- (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
+ (start > 0 || end + 1 < inode->disk_i_size))
goto cleanup_and_bail_uncompressed;
/*
@@ -695,7 +731,7 @@ again:
* inode has not been flagged as nocompress. This flag can
* change at any time if we discover bad compression ratios.
*/
- if (inode_need_compress(BTRFS_I(inode), start, end)) {
+ if (inode_need_compress(inode, start, end)) {
WARN_ON(pages);
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
@@ -704,10 +740,10 @@ again:
goto cont;
}
- if (BTRFS_I(inode)->defrag_compress)
- compress_type = BTRFS_I(inode)->defrag_compress;
- else if (BTRFS_I(inode)->prop_compress)
- compress_type = BTRFS_I(inode)->prop_compress;
+ if (inode->defrag_compress)
+ compress_type = inode->defrag_compress;
+ else if (inode->prop_compress)
+ compress_type = inode->prop_compress;
/*
* we need to call clear_page_dirty_for_io on each
@@ -722,14 +758,14 @@ again:
* has moved, the end is the original one.
*/
if (!redirty) {
- extent_range_clear_dirty_for_io(inode, start, end);
+ extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);
redirty = 1;
}
/* Compression level is applied here and only here */
ret = btrfs_compress_pages(
compress_type | (fs_info->compress_level << 4),
- inode->i_mapping, start,
+ inode->vfs_inode.i_mapping, start,
pages,
&nr_pages,
&total_in,
@@ -758,12 +794,12 @@ cont:
/* we didn't compress the entire range, try
* to make an uncompressed inline extent.
*/
- ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
+ ret = cow_file_range_inline(inode, actual_end,
0, BTRFS_COMPRESS_NONE,
NULL, false);
} else {
/* try making a compressed inline extent */
- ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
+ ret = cow_file_range_inline(inode, actual_end,
total_compressed,
compress_type, pages,
false);
@@ -786,7 +822,7 @@ cont:
* our outstanding extent for clearing delalloc for this
* range.
*/
- extent_clear_unlock_delalloc(BTRFS_I(inode), start, end,
+ extent_clear_unlock_delalloc(inode, start, end,
NULL,
clear_flags,
PAGE_UNLOCK |
@@ -861,8 +897,8 @@ cont:
/* flag the file so we don't compress in the future */
if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
- !(BTRFS_I(inode)->prop_compress)) {
- BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+ !(inode->prop_compress)) {
+ inode->flags |= BTRFS_INODE_NOCOMPRESS;
}
}
cleanup_and_bail_uncompressed:
@@ -880,7 +916,7 @@ cleanup_and_bail_uncompressed:
}
if (redirty)
- extent_range_redirty_for_io(inode, start, end);
+ extent_range_redirty_for_io(&inode->vfs_inode, start, end);
add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
BTRFS_COMPRESS_NONE);
compressed_extents++;
@@ -977,7 +1013,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
if (!(start >= locked_page_end || end <= locked_page_start))
locked_page = async_chunk->locked_page;
}
- lock_extent(io_tree, start, end);
+ lock_extent(io_tree, start, end, NULL);
/* We have fall back to uncompressed write */
if (!async_extent->pages)
@@ -1024,7 +1060,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
1 << BTRFS_ORDERED_COMPRESSED,
async_extent->compress_type);
if (ret) {
- btrfs_drop_extent_cache(inode, start, end, 0);
+ btrfs_drop_extent_map_range(inode, start, end, false);
goto out_free_reserve;
}
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
@@ -1076,7 +1112,7 @@ out_free:
*/
static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
{
- struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
+ struct btrfs_inode *inode = async_chunk->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct async_extent *async_extent;
u64 alloc_hint = 0;
@@ -1254,7 +1290,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
}
alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
- btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
/*
* Relocation relies on the relocated extents to have exactly the same
@@ -1319,8 +1354,9 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* skip current ordered extent.
*/
if (ret)
- btrfs_drop_extent_cache(inode, start,
- start + ram_size - 1, 0);
+ btrfs_drop_extent_map_range(inode, start,
+ start + ram_size - 1,
+ false);
}
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
@@ -1360,7 +1396,7 @@ out:
return ret;
out_drop_extent_cache:
- btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
+ btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false);
out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
@@ -1524,7 +1560,7 @@ static int cow_file_range_async(struct btrfs_inode *inode,
unsigned nofs_flag;
const blk_opf_t write_flags = wbc_to_write_flags(wbc);
- unlock_extent(&inode->io_tree, start, end);
+ unlock_extent(&inode->io_tree, start, end, NULL);
if (inode->flags & BTRFS_INODE_NOCOMPRESS &&
!btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
@@ -1565,7 +1601,7 @@ static int cow_file_range_async(struct btrfs_inode *inode,
*/
ihold(&inode->vfs_inode);
async_chunk[i].async_cow = ctx;
- async_chunk[i].inode = &inode->vfs_inode;
+ async_chunk[i].inode = inode;
async_chunk[i].start = start;
async_chunk[i].end = cur_end;
async_chunk[i].write_flags = write_flags;
@@ -1666,15 +1702,15 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
}
static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
- u64 bytenr, u64 num_bytes)
+ u64 bytenr, u64 num_bytes, bool nowait)
{
struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
struct btrfs_ordered_sum *sums;
int ret;
LIST_HEAD(list);
- ret = btrfs_lookup_csums_range(csum_root, bytenr,
- bytenr + num_bytes - 1, &list, 0);
+ ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1,
+ &list, 0, nowait);
if (ret == 0 && list_empty(&list))
return 0;
@@ -1732,7 +1768,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
* when starting writeback.
*/
count = count_range_bits(io_tree, &range_start, end, range_bytes,
- EXTENT_NORESERVE, 0);
+ EXTENT_NORESERVE, 0, NULL);
if (count > 0 || is_space_ino || is_reloc_ino) {
u64 bytes = count;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1747,7 +1783,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
if (count > 0)
clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
- 0, 0, NULL);
+ NULL);
}
return cow_file_range(inode, locked_page, start, end, page_started,
@@ -1800,6 +1836,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,
u8 extent_type;
int can_nocow = 0;
int ret = 0;
+ bool nowait = path->nowait;
fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
extent_type = btrfs_file_extent_type(leaf, fi);
@@ -1876,7 +1913,8 @@ static int can_nocow_file_extent(struct btrfs_path *path,
* Force COW if csums exist in the range. This ensures that csums for a
* given extent are either valid or do not exist.
*/
- ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes);
+ ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes,
+ nowait);
WARN_ON_ONCE(ret > 0 && is_freespace_inode);
if (ret != 0)
goto out;
@@ -2099,8 +2137,8 @@ out_check:
1 << BTRFS_ORDERED_PREALLOC,
BTRFS_COMPRESS_NONE);
if (ret) {
- btrfs_drop_extent_cache(inode, cur_offset,
- nocow_end, 0);
+ btrfs_drop_extent_map_range(inode, cur_offset,
+ nocow_end, false);
goto error;
}
} else {
@@ -2237,10 +2275,10 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
return ret;
}
-void btrfs_split_delalloc_extent(struct inode *inode,
+void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
struct extent_state *orig, u64 split)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 size;
/* not delalloc, ignore it */
@@ -2264,9 +2302,9 @@ void btrfs_split_delalloc_extent(struct inode *inode,
return;
}
- spin_lock(&BTRFS_I(inode)->lock);
- btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
- spin_unlock(&BTRFS_I(inode)->lock);
+ spin_lock(&inode->lock);
+ btrfs_mod_outstanding_extents(inode, 1);
+ spin_unlock(&inode->lock);
}
/*
@@ -2274,10 +2312,10 @@ void btrfs_split_delalloc_extent(struct inode *inode,
* that are just merged onto old extents, such as when we are doing sequential
* writes, so we can properly account for the metadata space we'll need.
*/
-void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
+void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new,
struct extent_state *other)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 new_size, old_size;
u32 num_extents;
@@ -2292,9 +2330,9 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
/* we're not bigger than the max, unreserve the space and go */
if (new_size <= fs_info->max_extent_size) {
- spin_lock(&BTRFS_I(inode)->lock);
- btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
- spin_unlock(&BTRFS_I(inode)->lock);
+ spin_lock(&inode->lock);
+ btrfs_mod_outstanding_extents(inode, -1);
+ spin_unlock(&inode->lock);
return;
}
@@ -2323,22 +2361,20 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
if (count_max_extents(fs_info, new_size) >= num_extents)
return;
- spin_lock(&BTRFS_I(inode)->lock);
- btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
- spin_unlock(&BTRFS_I(inode)->lock);
+ spin_lock(&inode->lock);
+ btrfs_mod_outstanding_extents(inode, -1);
+ spin_unlock(&inode->lock);
}
static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
- struct inode *inode)
+ struct btrfs_inode *inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
spin_lock(&root->delalloc_lock);
- if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
- list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
- &root->delalloc_inodes);
- set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &BTRFS_I(inode)->runtime_flags);
+ if (list_empty(&inode->delalloc_inodes)) {
+ list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
+ set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags);
root->nr_delalloc_inodes++;
if (root->nr_delalloc_inodes == 1) {
spin_lock(&fs_info->delalloc_root_lock);
@@ -2351,7 +2387,6 @@ static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
spin_unlock(&root->delalloc_lock);
}
-
void __btrfs_del_delalloc_inode(struct btrfs_root *root,
struct btrfs_inode *inode)
{
@@ -2384,10 +2419,10 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
* Properly track delayed allocation bytes in the inode and to maintain the
* list of inodes that have pending delalloc work to be done.
*/
-void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
+void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
u32 bits)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
WARN_ON(1);
@@ -2397,14 +2432,14 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
* bit, which is only set or cleared with irqs on
*/
if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
u64 len = state->end + 1 - state->start;
u32 num_extents = count_max_extents(fs_info, len);
- bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
+ bool do_list = !btrfs_is_free_space_inode(inode);
- spin_lock(&BTRFS_I(inode)->lock);
- btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
- spin_unlock(&BTRFS_I(inode)->lock);
+ spin_lock(&inode->lock);
+ btrfs_mod_outstanding_extents(inode, num_extents);
+ spin_unlock(&inode->lock);
/* For sanity tests */
if (btrfs_is_testing(fs_info))
@@ -2412,22 +2447,21 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
fs_info->delalloc_batch);
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->delalloc_bytes += len;
+ spin_lock(&inode->lock);
+ inode->delalloc_bytes += len;
if (bits & EXTENT_DEFRAG)
- BTRFS_I(inode)->defrag_bytes += len;
+ inode->defrag_bytes += len;
if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &BTRFS_I(inode)->runtime_flags))
+ &inode->runtime_flags))
btrfs_add_delalloc_inodes(root, inode);
- spin_unlock(&BTRFS_I(inode)->lock);
+ spin_unlock(&inode->lock);
}
if (!(state->state & EXTENT_DELALLOC_NEW) &&
(bits & EXTENT_DELALLOC_NEW)) {
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
- state->start;
- spin_unlock(&BTRFS_I(inode)->lock);
+ spin_lock(&inode->lock);
+ inode->new_delalloc_bytes += state->end + 1 - state->start;
+ spin_unlock(&inode->lock);
}
}
@@ -2435,11 +2469,10 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
* Once a range is no longer delalloc this function ensures that proper
* accounting happens.
*/
-void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
+void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
struct extent_state *state, u32 bits)
{
- struct btrfs_inode *inode = BTRFS_I(vfs_inode);
- struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 len = state->end + 1 - state->start;
u32 num_extents = count_max_extents(fs_info, len);
@@ -2510,10 +2543,9 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
* At IO completion time the cums attached on the ordered extent record
* are inserted into the btree
*/
-static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
- u64 dio_file_offset)
+blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio)
{
- return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
+ return btrfs_csum_one_bio(inode, bio, (u64)-1, false);
}
/*
@@ -2548,7 +2580,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
ASSERT(pre + post < len);
- lock_extent(&inode->io_tree, start, start + len - 1);
+ lock_extent(&inode->io_tree, start, start + len - 1, NULL);
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (!em) {
@@ -2622,7 +2654,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
out_unlock:
write_unlock(&em_tree->lock);
- unlock_extent(&inode->io_tree, start, start + len - 1);
+ unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
out:
free_extent_map(split_pre);
free_extent_map(split_mid);
@@ -2691,17 +2723,18 @@ out:
return errno_to_blk_status(ret);
}
-void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num)
+void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_inode *bi = BTRFS_I(inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
blk_status_t ret;
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- ret = extract_ordered_extent(bi, bio,
+ ret = extract_ordered_extent(inode, bio,
page_offset(bio_first_bvec_all(bio)->bv_page));
- if (ret)
- goto out;
+ if (ret) {
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
+ return;
+ }
}
/*
@@ -2712,31 +2745,26 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro
* Csum items for reloc roots have already been cloned at this point,
* so they are handled as part of the no-checksum case.
*/
- if (!(bi->flags & BTRFS_INODE_NODATASUM) &&
+ if (!(inode->flags & BTRFS_INODE_NODATASUM) &&
!test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
- !btrfs_is_data_reloc_root(bi->root)) {
- if (!atomic_read(&bi->sync_writers) &&
- btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
- btrfs_submit_bio_start))
+ !btrfs_is_data_reloc_root(inode->root)) {
+ if (!atomic_read(&inode->sync_writers) &&
+ btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA))
return;
- ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false);
- if (ret)
- goto out;
+ ret = btrfs_csum_one_bio(inode, bio, (u64)-1, false);
+ if (ret) {
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
+ return;
+ }
}
btrfs_submit_bio(fs_info, bio, mirror_num);
- return;
-out:
- if (ret) {
- bio->bi_status = ret;
- bio_endio(bio);
- }
}
-void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
+void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio,
int mirror_num, enum btrfs_compression_type compress_type)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
blk_status_t ret;
if (compress_type != BTRFS_COMPRESS_NONE) {
@@ -2744,7 +2772,7 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
* btrfs_submit_compressed_read will handle completing the bio
* if there were any errors, so just return here.
*/
- btrfs_submit_compressed_read(inode, bio, mirror_num);
+ btrfs_submit_compressed_read(&inode->vfs_inode, bio, mirror_num);
return;
}
@@ -2755,10 +2783,9 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
* Lookup bio sums does extra checks around whether we need to csum or
* not, which is why we ignore skip_sum here.
*/
- ret = btrfs_lookup_bio_sums(inode, bio, NULL);
+ ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL);
if (ret) {
- bio->bi_status = ret;
- bio_endio(bio);
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
return;
}
@@ -2818,8 +2845,8 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
ret = set_extent_bit(&inode->io_tree, search_start,
search_start + em_len - 1,
- EXTENT_DELALLOC_NEW, 0, NULL, cached_state,
- GFP_NOFS, NULL);
+ EXTENT_DELALLOC_NEW, cached_state,
+ GFP_NOFS);
next:
search_start = extent_map_end(em);
free_extent_map(em);
@@ -2859,7 +2886,7 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
/* see btrfs_writepage_start_hook for details on why this is required */
struct btrfs_writepage_fixup {
struct page *page;
- struct inode *inode;
+ struct btrfs_inode *inode;
struct btrfs_work work;
};
@@ -2878,7 +2905,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
fixup = container_of(work, struct btrfs_writepage_fixup, work);
page = fixup->page;
- inode = BTRFS_I(fixup->inode);
+ inode = fixup->inode;
page_start = page_offset(page);
page_end = page_offset(page) + PAGE_SIZE - 1;
@@ -2931,7 +2958,7 @@ again:
if (ret)
goto out_page;
- lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
+ lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
/* already ordered? We're done */
if (PageOrdered(page))
@@ -2939,8 +2966,8 @@ again:
ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
if (ordered) {
- unlock_extent_cached(&inode->io_tree, page_start, page_end,
- &cached_state);
+ unlock_extent(&inode->io_tree, page_start, page_end,
+ &cached_state);
unlock_page(page);
btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
@@ -2966,8 +2993,7 @@ out_reserved:
if (free_delalloc_space)
btrfs_delalloc_release_space(inode, data_reserved, page_start,
PAGE_SIZE, true);
- unlock_extent_cached(&inode->io_tree, page_start, page_end,
- &cached_state);
+ unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
out_page:
if (ret) {
/*
@@ -2989,7 +3015,7 @@ out_page:
* that could need flushing space. Recursing back to fixup worker would
* deadlock.
*/
- btrfs_add_delayed_iput(&inode->vfs_inode);
+ btrfs_add_delayed_iput(inode);
}
/*
@@ -3038,7 +3064,7 @@ int btrfs_writepage_cow_fixup(struct page *page)
get_page(page);
btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
fixup->page = page;
- fixup->inode = inode;
+ fixup->inode = BTRFS_I(inode);
btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
return -EAGAIN;
@@ -3225,6 +3251,8 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
clear_bits |= EXTENT_DELALLOC_NEW;
freespace_inode = btrfs_is_free_space_inode(inode);
+ if (!freespace_inode)
+ btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
ret = -EIO;
@@ -3269,7 +3297,7 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
}
clear_bits |= EXTENT_LOCKED;
- lock_extent_bits(io_tree, start, end, &cached_state);
+ lock_extent(io_tree, start, end, &cached_state);
if (freespace_inode)
trans = btrfs_join_transaction_spacecache(root);
@@ -3325,7 +3353,7 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
!test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
clear_extent_bit(&inode->io_tree, start, end,
EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
- 0, 0, &cached_state);
+ &cached_state);
btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode_fallback(trans, root, inode);
@@ -3336,7 +3364,6 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
ret = 0;
out:
clear_extent_bit(&inode->io_tree, start, end, clear_bits,
- (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0,
&cached_state);
if (trans)
@@ -3361,8 +3388,8 @@ out:
unwritten_start += logical_len;
clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
- /* Drop the cache for the part of the extent we didn't write. */
- btrfs_drop_extent_cache(inode, unwritten_start, end, 0);
+ /* Drop extent maps for the part of the extent we didn't write. */
+ btrfs_drop_extent_map_range(inode, unwritten_start, end, false);
/*
* If the ordered extent had an IOERR or something else went
@@ -3439,6 +3466,13 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
return 0;
}
+static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 offset)
+{
+ u64 offset_in_sectors = offset >> fs_info->sectorsize_bits;
+
+ return csums + offset_in_sectors * fs_info->csum_size;
+}
+
/*
* check_data_csum - verify checksum of one sector of uncompressed data
* @inode: inode
@@ -3452,10 +3486,10 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
* When csum mismatch is detected, we will also report the error and fill the
* corrupted range with zero. (Thus it needs the extra parameters)
*/
-int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
+int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio,
u32 bio_offset, struct page *page, u32 pgoff)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
u32 len = fs_info->sectorsize;
u8 *csum_expected;
u8 csum[BTRFS_CSUM_SIZE];
@@ -3469,8 +3503,7 @@ int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
return 0;
zeroit:
- btrfs_print_data_csum_error(BTRFS_I(inode),
- bbio->file_offset + bio_offset,
+ btrfs_print_data_csum_error(inode, bbio->file_offset + bio_offset,
csum, csum_expected, bbio->mirror_num);
if (bbio->device)
btrfs_dev_stat_inc_and_print(bbio->device,
@@ -3495,10 +3528,10 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
u32 bio_offset, struct page *page,
u64 start, u64 end)
{
- struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
const u32 sectorsize = root->fs_info->sectorsize;
u32 pg_off;
unsigned int result = 0;
@@ -3511,7 +3544,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
if (bbio->csum == NULL)
return 0;
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+ if (inode->flags & BTRFS_INODE_NODATASUM)
return 0;
if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)))
@@ -3556,18 +3589,17 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
* the inode to the delayed iput machinery. Delayed iputs are processed at
* transaction commit time/superblock commit/cleaner kthread.
*/
-void btrfs_add_delayed_iput(struct inode *inode)
+void btrfs_add_delayed_iput(struct btrfs_inode *inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_inode *binode = BTRFS_I(inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
- if (atomic_add_unless(&inode->i_count, -1, 1))
+ if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
return;
atomic_inc(&fs_info->nr_delayed_iputs);
spin_lock(&fs_info->delayed_iput_lock);
- ASSERT(list_empty(&binode->delayed_iput));
- list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
+ ASSERT(list_empty(&inode->delayed_iput));
+ list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
spin_unlock(&fs_info->delayed_iput_lock);
if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
wake_up_process(fs_info->cleaner_kthread);
@@ -3610,7 +3642,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->delayed_iput_lock);
}
-/**
+/*
* Wait for flushing all delayed iputs
*
* @fs_info: the filesystem
@@ -4255,7 +4287,7 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
- const char *name, int name_len,
+ const struct fscrypt_str *name,
struct btrfs_rename_ctx *rename_ctx)
{
struct btrfs_root *root = dir->root;
@@ -4273,8 +4305,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
goto out;
}
- di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
- name, name_len, -1);
+ di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1);
if (IS_ERR_OR_NULL(di)) {
ret = di ? PTR_ERR(di) : -ENOENT;
goto err;
@@ -4302,12 +4333,11 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
}
}
- ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
- dir_ino, &index);
+ ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
if (ret) {
btrfs_info(fs_info,
"failed to delete reference to %.*s, inode %llu parent %llu",
- name_len, name, ino, dir_ino);
+ name->len, name->name, ino, dir_ino);
btrfs_abort_transaction(trans, ret);
goto err;
}
@@ -4328,10 +4358,8 @@ skip_backref:
* operations on the log tree, increasing latency for applications.
*/
if (!rename_ctx) {
- btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
- dir_ino);
- btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
- index);
+ btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino);
+ btrfs_del_dir_entries_in_log(trans, root, name, dir, index);
}
/*
@@ -4349,7 +4377,7 @@ err:
if (ret)
goto out;
- btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2);
+ btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
inode_inc_iversion(&inode->vfs_inode);
inode_inc_iversion(&dir->vfs_inode);
inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
@@ -4362,10 +4390,11 @@ out:
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir, struct btrfs_inode *inode,
- const char *name, int name_len)
+ const struct fscrypt_str *name)
{
int ret;
- ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len, NULL);
+
+ ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL);
if (!ret) {
drop_nlink(&inode->vfs_inode);
ret = btrfs_update_inode(trans, inode->root, inode);
@@ -4381,9 +4410,9 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
* plenty of slack room in the global reserve to migrate, otherwise we cannot
* allow the unlink to occur.
*/
-static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
+static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir)
{
- struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *root = dir->root;
/*
* 1 for the possible orphan item
@@ -4401,47 +4430,62 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
struct btrfs_trans_handle *trans;
struct inode *inode = d_inode(dentry);
int ret;
+ struct fscrypt_name fname;
- trans = __unlink_start_trans(dir);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
+ if (ret)
+ return ret;
+
+ /* This needs to handle no-key deletions later on */
+
+ trans = __unlink_start_trans(BTRFS_I(dir));
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto fscrypt_free;
+ }
btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
0);
- ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
- BTRFS_I(d_inode(dentry)), dentry->d_name.name,
- dentry->d_name.len);
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
+ &fname.disk_name);
if (ret)
- goto out;
+ goto end_trans;
if (inode->i_nlink == 0) {
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
if (ret)
- goto out;
+ goto end_trans;
}
-out:
+end_trans:
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
+fscrypt_free:
+ fscrypt_free_filename(&fname);
return ret;
}
static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
- struct inode *dir, struct dentry *dentry)
+ struct btrfs_inode *dir, struct dentry *dentry)
{
- struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *root = dir->root;
struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_dir_item *di;
struct btrfs_key key;
- const char *name = dentry->d_name.name;
- int name_len = dentry->d_name.len;
u64 index;
int ret;
u64 objectid;
- u64 dir_ino = btrfs_ino(BTRFS_I(dir));
+ u64 dir_ino = btrfs_ino(dir);
+ struct fscrypt_name fname;
+
+ ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
+ if (ret)
+ return ret;
+
+ /* This needs to handle no-key deletions later on */
if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
objectid = inode->root->root_key.objectid;
@@ -4449,15 +4493,18 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
objectid = inode->location.objectid;
} else {
WARN_ON(1);
+ fscrypt_free_filename(&fname);
return -EINVAL;
}
path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
- name, name_len, -1);
+ &fname.disk_name, -1);
if (IS_ERR_OR_NULL(di)) {
ret = di ? PTR_ERR(di) : -ENOENT;
goto out;
@@ -4483,8 +4530,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
* call btrfs_del_root_ref, and it _shouldn't_ fail.
*/
if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
- di = btrfs_search_dir_index_item(root, path, dir_ino,
- name, name_len);
+ di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name);
if (IS_ERR_OR_NULL(di)) {
if (!di)
ret = -ENOENT;
@@ -4501,28 +4547,29 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
} else {
ret = btrfs_del_root_ref(trans, objectid,
root->root_key.objectid, dir_ino,
- &index, name, name_len);
+ &index, &fname.disk_name);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
}
- ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index);
+ ret = btrfs_delete_delayed_dir_index(trans, dir, index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
- inode_inc_iversion(dir);
- dir->i_mtime = current_time(dir);
- dir->i_ctime = dir->i_mtime;
- ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir));
+ btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
+ inode_inc_iversion(&dir->vfs_inode);
+ dir->vfs_inode.i_mtime = current_time(&dir->vfs_inode);
+ dir->vfs_inode.i_ctime = dir->vfs_inode.i_mtime;
+ ret = btrfs_update_inode_fallback(trans, root, dir);
if (ret)
btrfs_abort_transaction(trans, ret);
out:
btrfs_free_path(path);
+ fscrypt_free_filename(&fname);
return ret;
}
@@ -4536,6 +4583,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
struct btrfs_path *path;
struct btrfs_dir_item *di;
struct btrfs_key key;
+ struct fscrypt_str name = FSTR_INIT("default", 7);
u64 dir_id;
int ret;
@@ -4546,7 +4594,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
/* Make sure this root isn't set as the default subvol */
dir_id = btrfs_super_root_dir(fs_info->super_copy);
di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
- dir_id, "default", 7, 0);
+ dir_id, &name, 0);
if (di && !IS_ERR(di)) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
if (key.objectid == root->root_key.objectid) {
@@ -4645,10 +4693,10 @@ again:
spin_unlock(&root->inode_lock);
}
-int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
+int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
- struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *root = dir->root;
struct inode *inode = d_inode(dentry);
struct btrfs_root *dest = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
@@ -4705,7 +4753,7 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
- btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
+ btrfs_record_snapshot_destroy(trans, dir);
ret = btrfs_unlink_subvol(trans, dir, dentry);
if (ret) {
@@ -4785,6 +4833,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
int err = 0;
struct btrfs_trans_handle *trans;
u64 last_unlink_trans;
+ struct fscrypt_name fname;
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
@@ -4794,15 +4843,23 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
"extent tree v2 doesn't support snapshot deletion yet");
return -EOPNOTSUPP;
}
- return btrfs_delete_subvolume(dir, dentry);
+ return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
}
- trans = __unlink_start_trans(dir);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
+ if (err)
+ return err;
+
+ /* This needs to handle no-key deletions later on */
+
+ trans = __unlink_start_trans(BTRFS_I(dir));
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_notrans;
+ }
if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
- err = btrfs_unlink_subvol(trans, dir, dentry);
+ err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
goto out;
}
@@ -4813,9 +4870,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
/* now the directory is empty */
- err = btrfs_unlink_inode(trans, BTRFS_I(dir),
- BTRFS_I(d_inode(dentry)), dentry->d_name.name,
- dentry->d_name.len);
+ err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
+ &fname.disk_name);
if (!err) {
btrfs_i_size_write(BTRFS_I(inode), 0);
/*
@@ -4834,7 +4890,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
}
out:
btrfs_end_transaction(trans);
+out_notrans:
btrfs_btree_balance_dirty(fs_info);
+ fscrypt_free_filename(&fname);
return err;
}
@@ -4878,9 +4936,9 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
block_end = block_start + blocksize - 1;
ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
- blocksize);
+ blocksize, false);
if (ret < 0) {
- if (btrfs_check_nocow_lock(inode, block_start, &write_bytes) > 0) {
+ if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
/* For nocow case, no need to reserve data space */
only_release_metadata = true;
} else {
@@ -4922,12 +4980,11 @@ again:
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, block_start, block_end, &cached_state);
+ lock_extent(io_tree, block_start, block_end, &cached_state);
ordered = btrfs_lookup_ordered_extent(inode, block_start);
if (ordered) {
- unlock_extent_cached(io_tree, block_start, block_end,
- &cached_state);
+ unlock_extent(io_tree, block_start, block_end, &cached_state);
unlock_page(page);
put_page(page);
btrfs_start_ordered_extent(ordered, 1);
@@ -4937,13 +4994,12 @@ again:
clear_extent_bit(&inode->io_tree, block_start, block_end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, &cached_state);
+ &cached_state);
ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
&cached_state);
if (ret) {
- unlock_extent_cached(io_tree, block_start, block_end,
- &cached_state);
+ unlock_extent(io_tree, block_start, block_end, &cached_state);
goto out_unlock;
}
@@ -4960,11 +5016,11 @@ again:
btrfs_page_clear_checked(fs_info, page, block_start,
block_end + 1 - block_start);
btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
- unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
+ unlock_extent(io_tree, block_start, block_end, &cached_state);
if (only_release_metadata)
set_extent_bit(&inode->io_tree, block_start, block_end,
- EXTENT_NORESERVE, 0, NULL, NULL, GFP_NOFS, NULL);
+ EXTENT_NORESERVE, NULL, GFP_NOFS);
out_unlock:
if (ret) {
@@ -5021,8 +5077,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
return ret;
}
- ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
- offset, 0, 0, len, 0, len, 0, 0, 0);
+ ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len);
if (ret) {
btrfs_abort_transaction(trans, ret);
} else {
@@ -5046,7 +5101,6 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
struct extent_io_tree *io_tree = &inode->io_tree;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
- struct extent_map_tree *em_tree = &inode->extent_tree;
u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
u64 block_end = ALIGN(size, fs_info->sectorsize);
u64 last_byte;
@@ -5094,10 +5148,11 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
if (err)
break;
- btrfs_drop_extent_cache(inode, cur_offset,
- cur_offset + hole_size - 1, 0);
hole_em = alloc_extent_map();
if (!hole_em) {
+ btrfs_drop_extent_map_range(inode, cur_offset,
+ cur_offset + hole_size - 1,
+ false);
btrfs_set_inode_full_sync(inode);
goto next;
}
@@ -5112,16 +5167,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = fs_info->generation;
- while (1) {
- write_lock(&em_tree->lock);
- err = add_extent_mapping(em_tree, hole_em, 1);
- write_unlock(&em_tree->lock);
- if (err != -EEXIST)
- break;
- btrfs_drop_extent_cache(inode, cur_offset,
- cur_offset +
- hole_size - 1, 0);
- }
+ err = btrfs_replace_extent_map_range(inode, hole_em, true);
free_extent_map(hole_em);
} else {
err = btrfs_inode_set_file_extent_range(inode,
@@ -5137,7 +5183,7 @@ next:
break;
}
free_extent_map(em);
- unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state);
+ unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
return err;
}
@@ -5215,7 +5261,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
inode_dio_wait(inode);
- ret = btrfs_truncate(inode, newsize == oldsize);
+ ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize);
if (ret && inode->i_nlink) {
int err;
@@ -5258,10 +5304,10 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
if (attr->ia_valid) {
setattr_copy(mnt_userns, inode, attr);
inode_inc_iversion(inode);
- err = btrfs_dirty_inode(inode);
+ err = btrfs_dirty_inode(BTRFS_I(inode));
if (!err && attr->ia_valid & ATTR_MODE)
- err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
+ err = posix_acl_chmod(mnt_userns, dentry, inode->i_mode);
}
return err;
@@ -5271,7 +5317,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
* While truncating the inode pages during eviction, we get the VFS
* calling btrfs_invalidate_folio() against each folio of the inode. This
* is slow because the calls to btrfs_invalidate_folio() result in a
- * huge amount of calls to lock_extent_bits() and clear_extent_bit(),
+ * huge amount of calls to lock_extent() and clear_extent_bit(),
* which keep merging and splitting extent_state structures over and over,
* wasting lots of time.
*
@@ -5283,29 +5329,12 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
static void evict_inode_truncate_pages(struct inode *inode)
{
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
struct rb_node *node;
ASSERT(inode->i_state & I_FREEING);
truncate_inode_pages_final(&inode->i_data);
- write_lock(&map_tree->lock);
- while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) {
- struct extent_map *em;
-
- node = rb_first_cached(&map_tree->map);
- em = rb_entry(node, struct extent_map, rb_node);
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
- clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
- remove_extent_mapping(map_tree, em);
- free_extent_map(em);
- if (need_resched()) {
- write_unlock(&map_tree->lock);
- cond_resched();
- write_lock(&map_tree->lock);
- }
- }
- write_unlock(&map_tree->lock);
+ btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
/*
* Keep looping until we have no more ranges in the io tree.
@@ -5338,7 +5367,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
state_flags = state->state;
spin_unlock(&io_tree->lock);
- lock_extent_bits(io_tree, start, end, &cached_state);
+ lock_extent(io_tree, start, end, &cached_state);
/*
* If still has DELALLOC flag, the extent didn't reach disk,
@@ -5353,8 +5382,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
end - start + 1);
clear_extent_bit(io_tree, start, end,
- EXTENT_LOCKED | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
+ EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
&cached_state);
cond_resched();
@@ -5534,22 +5562,27 @@ no_delete:
* If no dir entries were found, returns -ENOENT.
* If found a corrupted location in dir entry, returns -EUCLEAN.
*/
-static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
+static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
struct btrfs_key *location, u8 *type)
{
- const char *name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
struct btrfs_dir_item *di;
struct btrfs_path *path;
- struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *root = dir->root;
int ret = 0;
+ struct fscrypt_name fname;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
- name, namelen, 0);
+ ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
+ if (ret)
+ goto out;
+
+ /* This needs to handle no-key deletions later on */
+
+ di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir),
+ &fname.disk_name, 0);
if (IS_ERR_OR_NULL(di)) {
ret = di ? PTR_ERR(di) : -ENOENT;
goto out;
@@ -5561,12 +5594,13 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
ret = -EUCLEAN;
btrfs_warn(root->fs_info,
"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
- __func__, name, btrfs_ino(BTRFS_I(dir)),
+ __func__, fname.disk_name.name, btrfs_ino(dir),
location->objectid, location->type, location->offset);
}
if (!ret)
- *type = btrfs_dir_type(path->nodes[0], di);
+ *type = btrfs_dir_ftype(path->nodes[0], di);
out:
+ fscrypt_free_filename(&fname);
btrfs_free_path(path);
return ret;
}
@@ -5577,7 +5611,7 @@ out:
* is kind of like crossing a mount point.
*/
static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
- struct inode *dir,
+ struct btrfs_inode *dir,
struct dentry *dentry,
struct btrfs_key *location,
struct btrfs_root **sub_root)
@@ -5589,6 +5623,11 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
struct btrfs_key key;
int ret;
int err = 0;
+ struct fscrypt_name fname;
+
+ ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname);
+ if (ret)
+ return ret;
path = btrfs_alloc_path();
if (!path) {
@@ -5597,7 +5636,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
}
err = -ENOENT;
- key.objectid = BTRFS_I(dir)->root->root_key.objectid;
+ key.objectid = dir->root->root_key.objectid;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = location->objectid;
@@ -5610,13 +5649,12 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
- if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) ||
- btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
+ if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
+ btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len)
goto out;
- ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
- (unsigned long)(ref + 1),
- dentry->d_name.len);
+ ret = memcmp_extent_buffer(leaf, fname.disk_name.name,
+ (unsigned long)(ref + 1), fname.disk_name.len);
if (ret)
goto out;
@@ -5635,19 +5673,20 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
err = 0;
out:
btrfs_free_path(path);
+ fscrypt_free_filename(&fname);
return err;
}
-static void inode_tree_add(struct inode *inode)
+static void inode_tree_add(struct btrfs_inode *inode)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
struct btrfs_inode *entry;
struct rb_node **p;
struct rb_node *parent;
- struct rb_node *new = &BTRFS_I(inode)->rb_node;
- u64 ino = btrfs_ino(BTRFS_I(inode));
+ struct rb_node *new = &inode->rb_node;
+ u64 ino = btrfs_ino(inode);
- if (inode_unhashed(inode))
+ if (inode_unhashed(&inode->vfs_inode))
return;
parent = NULL;
spin_lock(&root->inode_lock);
@@ -5707,6 +5746,11 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
BTRFS_I(inode)->location.offset = 0;
BTRFS_I(inode)->root = btrfs_grab_root(args->root);
BUG_ON(args->root && !BTRFS_I(inode)->root);
+
+ if (args->root && args->root == args->root->fs_info->tree_root &&
+ args->ino != BTRFS_BTREE_INODE_OBJECTID)
+ set_bit(BTRFS_INODE_FREE_SPACE_INODE,
+ &BTRFS_I(inode)->runtime_flags);
return 0;
}
@@ -5754,7 +5798,7 @@ struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
ret = btrfs_read_locked_inode(inode, path);
if (!ret) {
- inode_tree_add(inode);
+ inode_tree_add(BTRFS_I(inode));
unlock_new_inode(inode);
} else {
iget_failed(inode);
@@ -5834,7 +5878,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (dentry->d_name.len > BTRFS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
- ret = btrfs_inode_by_name(dir, dentry, &location, &di_type);
+ ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type);
if (ret < 0)
return ERR_PTR(ret);
@@ -5855,7 +5899,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
return inode;
}
- ret = fixup_tree_root_location(fs_info, dir, dentry,
+ ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
&location, &sub_root);
if (ret < 0) {
if (ret != -ENOENT)
@@ -6003,6 +6047,7 @@ again:
btrfs_for_each_slot(root, &key, &found_key, path, ret) {
struct dir_entry *entry;
struct extent_buffer *leaf = path->nodes[0];
+ u8 ftype;
if (found_key.objectid != key.objectid)
break;
@@ -6026,13 +6071,13 @@ again:
goto again;
}
+ ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di));
entry = addr;
- put_unaligned(name_len, &entry->name_len);
name_ptr = (char *)(entry + 1);
- read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
- name_len);
- put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)),
- &entry->type);
+ read_extent_buffer(leaf, name_ptr,
+ (unsigned long)(di + 1), name_len);
+ put_unaligned(name_len, &entry->name_len);
+ put_unaligned(fs_ftype_to_dtype(ftype), &entry->type);
btrfs_dir_item_key_to_cpu(leaf, di, &location);
put_unaligned(location.objectid, &entry->ino);
put_unaligned(found_key.offset, &entry->offset);
@@ -6090,21 +6135,21 @@ err:
* FIXME, needs more benchmarking...there are no reasons other than performance
* to keep or drop this code.
*/
-static int btrfs_dirty_inode(struct inode *inode)
+static int btrfs_dirty_inode(struct btrfs_inode *inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
int ret;
- if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
+ if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags))
return 0;
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, root, inode);
if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
/* whoops, lets try again with the full transaction */
btrfs_end_transaction(trans);
@@ -6112,10 +6157,10 @@ static int btrfs_dirty_inode(struct inode *inode)
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, root, inode);
}
btrfs_end_transaction(trans);
- if (BTRFS_I(inode)->delayed_node)
+ if (inode->delayed_node)
btrfs_balance_delayed_items(fs_info);
return ret;
@@ -6142,7 +6187,7 @@ static int btrfs_update_time(struct inode *inode, struct timespec64 *now,
inode->i_mtime = *now;
if (flags & S_ATIME)
inode->i_atime = *now;
- return dirty ? btrfs_dirty_inode(inode) : 0;
+ return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0;
}
/*
@@ -6238,9 +6283,18 @@ int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
struct inode *inode = args->inode;
int ret;
+ if (!args->orphan) {
+ ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0,
+ &args->fname);
+ if (ret)
+ return ret;
+ }
+
ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
- if (ret)
+ if (ret) {
+ fscrypt_free_filename(&args->fname);
return ret;
+ }
/* 1 to add inode item */
*trans_num_items = 1;
@@ -6280,6 +6334,7 @@ void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
{
posix_acl_release(args->acl);
posix_acl_release(args->default_acl);
+ fscrypt_free_filename(&args->fname);
}
/*
@@ -6287,27 +6342,27 @@ void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
*
* Currently only the compression flags and the cow flags are inherited.
*/
-static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
+static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir)
{
unsigned int flags;
- flags = BTRFS_I(dir)->flags;
+ flags = dir->flags;
if (flags & BTRFS_INODE_NOCOMPRESS) {
- BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
- BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+ inode->flags &= ~BTRFS_INODE_COMPRESS;
+ inode->flags |= BTRFS_INODE_NOCOMPRESS;
} else if (flags & BTRFS_INODE_COMPRESS) {
- BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
- BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
+ inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ inode->flags |= BTRFS_INODE_COMPRESS;
}
if (flags & BTRFS_INODE_NODATACOW) {
- BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
- if (S_ISREG(inode->i_mode))
- BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+ inode->flags |= BTRFS_INODE_NODATACOW;
+ if (S_ISREG(inode->vfs_inode.i_mode))
+ inode->flags |= BTRFS_INODE_NODATASUM;
}
- btrfs_sync_inode_flags_to_i_flags(inode);
+ btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
}
int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
@@ -6315,8 +6370,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
{
struct inode *dir = args->dir;
struct inode *inode = args->inode;
- const char *name = args->orphan ? NULL : args->dentry->d_name.name;
- int name_len = args->orphan ? 0 : args->dentry->d_name.len;
+ const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct btrfs_root *root;
struct btrfs_inode_item *inode_item;
@@ -6367,7 +6421,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
* change it now without compatibility issues.
*/
if (!args->subvol)
- btrfs_inherit_iflags(inode, dir);
+ btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir));
if (S_ISREG(inode->i_mode)) {
if (btrfs_test_opt(fs_info, NODATASUM))
@@ -6417,7 +6471,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
sizes[1] = 2 + sizeof(*ref);
} else {
key[1].offset = btrfs_ino(BTRFS_I(dir));
- sizes[1] = name_len + sizeof(*ref);
+ sizes[1] = name->len + sizeof(*ref);
}
}
@@ -6456,10 +6510,12 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
write_extent_buffer(path->nodes[0], "..", ptr, 2);
} else {
- btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref,
+ name->len);
btrfs_set_inode_ref_index(path->nodes[0], ref,
BTRFS_I(inode)->dir_index);
- write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ write_extent_buffer(path->nodes[0], name->name, ptr,
+ name->len);
}
}
@@ -6509,7 +6565,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
}
}
- inode_tree_add(inode);
+ inode_tree_add(BTRFS_I(inode));
trace_btrfs_inode_new(inode);
btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
@@ -6520,7 +6576,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
} else {
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
- name_len, 0, BTRFS_I(inode)->dir_index);
+ 0, BTRFS_I(inode)->dir_index);
}
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -6549,7 +6605,7 @@ out:
*/
int btrfs_add_link(struct btrfs_trans_handle *trans,
struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
- const char *name, int name_len, int add_backref, u64 index)
+ const struct fscrypt_str *name, int add_backref, u64 index)
{
int ret = 0;
struct btrfs_key key;
@@ -6568,17 +6624,17 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_add_root_ref(trans, key.objectid,
root->root_key.objectid, parent_ino,
- index, name, name_len);
+ index, name);
} else if (add_backref) {
- ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
- parent_ino, index);
+ ret = btrfs_insert_inode_ref(trans, root, name,
+ ino, parent_ino, index);
}
/* Nothing to clean up yet */
if (ret)
return ret;
- ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key,
+ ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
btrfs_inode_type(&inode->vfs_inode), index);
if (ret == -EEXIST || ret == -EOVERFLOW)
goto fail_dir_item;
@@ -6588,7 +6644,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
}
btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
- name_len * 2);
+ name->len * 2);
inode_inc_iversion(&parent_inode->vfs_inode);
/*
* If we are replaying a log tree, we do not want to update the mtime
@@ -6613,15 +6669,15 @@ fail_dir_item:
int err;
err = btrfs_del_root_ref(trans, key.objectid,
root->root_key.objectid, parent_ino,
- &local_index, name, name_len);
+ &local_index, name);
if (err)
btrfs_abort_transaction(trans, err);
} else if (add_backref) {
u64 local_index;
int err;
- err = btrfs_del_inode_ref(trans, root, name, name_len,
- ino, parent_ino, &local_index);
+ err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino,
+ &local_index);
if (err)
btrfs_abort_transaction(trans, err);
}
@@ -6704,6 +6760,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = d_inode(old_dentry);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct fscrypt_name fname;
u64 index;
int err;
int drop_inode = 0;
@@ -6715,6 +6772,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
if (inode->i_nlink >= BTRFS_LINK_MAX)
return -EMLINK;
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
+ if (err)
+ goto fail;
+
err = btrfs_set_inode_index(BTRFS_I(dir), &index);
if (err)
goto fail;
@@ -6741,7 +6802,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
- dentry->d_name.name, dentry->d_name.len, 1, index);
+ &fname.disk_name, 1, index);
if (err) {
drop_inode = 1;
@@ -6765,6 +6826,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
}
fail:
+ fscrypt_free_filename(&fname);
if (trans)
btrfs_end_transaction(trans);
if (drop_inode) {
@@ -6791,7 +6853,6 @@ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
static noinline int uncompress_inline(struct btrfs_path *path,
struct page *page,
- size_t pg_offset, u64 extent_offset,
struct btrfs_file_extent_item *item)
{
int ret;
@@ -6802,7 +6863,6 @@ static noinline int uncompress_inline(struct btrfs_path *path,
unsigned long ptr;
int compress_type;
- WARN_ON(pg_offset != 0);
compress_type = btrfs_file_extent_compression(leaf, item);
max_size = btrfs_file_extent_ram_bytes(leaf, item);
inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
@@ -6814,8 +6874,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
read_extent_buffer(leaf, tmp, ptr, inline_size);
max_size = min_t(unsigned long, PAGE_SIZE, max_size);
- ret = btrfs_decompress(compress_type, tmp, page,
- extent_offset, inline_size, max_size);
+ ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size);
/*
* decompression code contains a memset to fill in any space between the end
@@ -6825,25 +6884,52 @@ static noinline int uncompress_inline(struct btrfs_path *path,
* cover that region here.
*/
- if (max_size + pg_offset < PAGE_SIZE)
- memzero_page(page, pg_offset + max_size,
- PAGE_SIZE - max_size - pg_offset);
+ if (max_size < PAGE_SIZE)
+ memzero_page(page, max_size, PAGE_SIZE - max_size);
kfree(tmp);
return ret;
}
-/**
- * btrfs_get_extent - Lookup the first extent overlapping a range in a file.
+static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path,
+ struct page *page)
+{
+ struct btrfs_file_extent_item *fi;
+ void *kaddr;
+ size_t copy_size;
+
+ if (!page || PageUptodate(page))
+ return 0;
+
+ ASSERT(page_offset(page) == 0);
+
+ fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
+ return uncompress_inline(path, page, fi);
+
+ copy_size = min_t(u64, PAGE_SIZE,
+ btrfs_file_extent_ram_bytes(path->nodes[0], fi));
+ kaddr = kmap_local_page(page);
+ read_extent_buffer(path->nodes[0], kaddr,
+ btrfs_file_extent_inline_start(fi), copy_size);
+ kunmap_local(kaddr);
+ if (copy_size < PAGE_SIZE)
+ memzero_page(page, copy_size, PAGE_SIZE - copy_size);
+ return 0;
+}
+
+/*
+ * Lookup the first extent overlapping a range in a file.
+ *
* @inode: file to search in
* @page: page to read extent data into if the extent is inline
* @pg_offset: offset into @page to copy to
* @start: file offset
* @len: length of range starting at @start
*
- * This returns the first &struct extent_map which overlaps with the given
- * range, reading it from the B-tree and caching it if necessary. Note that
- * there may be more extents which overlap the given range after the returned
- * extent_map.
+ * Return the first &struct extent_map which overlaps the given range, reading
+ * it from the B-tree and caching it if necessary. Note that there may be more
+ * extents which overlap the given range after the returned extent_map.
*
* If @page is not NULL and the extent is inline, this also reads the extent
* data directly into the page and marks the extent up to date in the io_tree.
@@ -6867,7 +6953,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct btrfs_key found_key;
struct extent_map *em = NULL;
struct extent_map_tree *em_tree = &inode->extent_tree;
- struct extent_io_tree *io_tree = &inode->io_tree;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
@@ -6985,53 +7070,33 @@ next:
goto insert;
}
- btrfs_extent_item_to_extent_map(inode, path, item, !page, em);
+ btrfs_extent_item_to_extent_map(inode, path, item, em);
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
goto insert;
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- unsigned long ptr;
- char *map;
- size_t size;
- size_t extent_offset;
- size_t copy_size;
-
- if (!page)
- goto out;
+ /*
+ * Inline extent can only exist at file offset 0. This is
+ * ensured by tree-checker and inline extent creation path.
+ * Thus all members representing file offsets should be zero.
+ */
+ ASSERT(pg_offset == 0);
+ ASSERT(extent_start == 0);
+ ASSERT(em->start == 0);
- size = btrfs_file_extent_ram_bytes(leaf, item);
- extent_offset = page_offset(page) + pg_offset - extent_start;
- copy_size = min_t(u64, PAGE_SIZE - pg_offset,
- size - extent_offset);
- em->start = extent_start + extent_offset;
- em->len = ALIGN(copy_size, fs_info->sectorsize);
- em->orig_block_len = em->len;
- em->orig_start = em->start;
- ptr = btrfs_file_extent_inline_start(item) + extent_offset;
+ /*
+ * btrfs_extent_item_to_extent_map() should have properly
+ * initialized em members already.
+ *
+ * Other members are not utilized for inline extents.
+ */
+ ASSERT(em->block_start == EXTENT_MAP_INLINE);
+ ASSERT(em->len = fs_info->sectorsize);
- if (!PageUptodate(page)) {
- if (btrfs_file_extent_compression(leaf, item) !=
- BTRFS_COMPRESS_NONE) {
- ret = uncompress_inline(path, page, pg_offset,
- extent_offset, item);
- if (ret)
- goto out;
- } else {
- map = kmap_local_page(page);
- read_extent_buffer(leaf, map + pg_offset, ptr,
- copy_size);
- if (pg_offset + copy_size < PAGE_SIZE) {
- memset(map + pg_offset + copy_size, 0,
- PAGE_SIZE - pg_offset -
- copy_size);
- }
- kunmap_local(map);
- }
- flush_dcache_page(page);
- }
- set_extent_uptodate(io_tree, em->start,
- extent_map_end(em) - 1, NULL, GFP_NOFS);
+ ret = read_inline_extent(inode, path, page);
+ if (ret < 0)
+ goto out;
goto insert;
}
not_found:
@@ -7065,133 +7130,6 @@ out:
return em;
}
-struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
- u64 start, u64 len)
-{
- struct extent_map *em;
- struct extent_map *hole_em = NULL;
- u64 delalloc_start = start;
- u64 end;
- u64 delalloc_len;
- u64 delalloc_end;
- int err = 0;
-
- em = btrfs_get_extent(inode, NULL, 0, start, len);
- if (IS_ERR(em))
- return em;
- /*
- * If our em maps to:
- * - a hole or
- * - a pre-alloc extent,
- * there might actually be delalloc bytes behind it.
- */
- if (em->block_start != EXTENT_MAP_HOLE &&
- !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- return em;
- else
- hole_em = em;
-
- /* check to see if we've wrapped (len == -1 or similar) */
- end = start + len;
- if (end < start)
- end = (u64)-1;
- else
- end -= 1;
-
- em = NULL;
-
- /* ok, we didn't find anything, lets look for delalloc */
- delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
- end, len, EXTENT_DELALLOC, 1);
- delalloc_end = delalloc_start + delalloc_len;
- if (delalloc_end < delalloc_start)
- delalloc_end = (u64)-1;
-
- /*
- * We didn't find anything useful, return the original results from
- * get_extent()
- */
- if (delalloc_start > end || delalloc_end <= start) {
- em = hole_em;
- hole_em = NULL;
- goto out;
- }
-
- /*
- * Adjust the delalloc_start to make sure it doesn't go backwards from
- * the start they passed in
- */
- delalloc_start = max(start, delalloc_start);
- delalloc_len = delalloc_end - delalloc_start;
-
- if (delalloc_len > 0) {
- u64 hole_start;
- u64 hole_len;
- const u64 hole_end = extent_map_end(hole_em);
-
- em = alloc_extent_map();
- if (!em) {
- err = -ENOMEM;
- goto out;
- }
-
- ASSERT(hole_em);
- /*
- * When btrfs_get_extent can't find anything it returns one
- * huge hole
- *
- * Make sure what it found really fits our range, and adjust to
- * make sure it is based on the start from the caller
- */
- if (hole_end <= start || hole_em->start > end) {
- free_extent_map(hole_em);
- hole_em = NULL;
- } else {
- hole_start = max(hole_em->start, start);
- hole_len = hole_end - hole_start;
- }
-
- if (hole_em && delalloc_start > hole_start) {
- /*
- * Our hole starts before our delalloc, so we have to
- * return just the parts of the hole that go until the
- * delalloc starts
- */
- em->len = min(hole_len, delalloc_start - hole_start);
- em->start = hole_start;
- em->orig_start = hole_start;
- /*
- * Don't adjust block start at all, it is fixed at
- * EXTENT_MAP_HOLE
- */
- em->block_start = hole_em->block_start;
- em->block_len = hole_len;
- if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
- set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
- } else {
- /*
- * Hole is out of passed range or it starts after
- * delalloc range
- */
- em->start = delalloc_start;
- em->len = delalloc_len;
- em->orig_start = delalloc_start;
- em->block_start = EXTENT_MAP_DELALLOC;
- em->block_len = delalloc_len;
- }
- } else {
- return hole_em;
- }
-out:
-
- free_extent_map(hole_em);
- if (err) {
- free_extent_map(em);
- return ERR_PTR(err);
- }
- return em;
-}
-
static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
const u64 start,
const u64 len,
@@ -7221,7 +7159,8 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
if (ret) {
if (em) {
free_extent_map(em);
- btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
+ btrfs_drop_extent_map_range(inode, start,
+ start + len - 1, false);
}
em = ERR_PTR(ret);
}
@@ -7292,7 +7231,7 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
*/
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes, bool strict)
+ u64 *ram_bytes, bool nowait, bool strict)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct can_nocow_file_extent_args nocow_args = { 0 };
@@ -7308,6 +7247,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ path->nowait = nowait;
ret = btrfs_lookup_file_extent(NULL, root, path,
btrfs_ino(BTRFS_I(inode)), offset, 0);
@@ -7401,10 +7341,11 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
while (1) {
if (nowait) {
- if (!try_lock_extent(io_tree, lockstart, lockend))
+ if (!try_lock_extent(io_tree, lockstart, lockend,
+ cached_state))
return -EAGAIN;
} else {
- lock_extent_bits(io_tree, lockstart, lockend, cached_state);
+ lock_extent(io_tree, lockstart, lockend, cached_state);
}
/*
* We're concerned with the entire range that we're going to be
@@ -7426,7 +7367,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
lockstart, lockend)))
break;
- unlock_extent_cached(io_tree, lockstart, lockend, cached_state);
+ unlock_extent(io_tree, lockstart, lockend, cached_state);
if (ordered) {
if (nowait) {
@@ -7488,7 +7429,6 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
u64 ram_bytes, int compress_type,
int type)
{
- struct extent_map_tree *em_tree;
struct extent_map *em;
int ret;
@@ -7497,7 +7437,6 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
type == BTRFS_ORDERED_NOCOW ||
type == BTRFS_ORDERED_REGULAR);
- em_tree = &inode->extent_tree;
em = alloc_extent_map();
if (!em)
return ERR_PTR(-ENOMEM);
@@ -7518,18 +7457,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
em->compress_type = compress_type;
}
- do {
- btrfs_drop_extent_cache(inode, em->start,
- em->start + em->len - 1, 0);
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 1);
- write_unlock(&em_tree->lock);
- /*
- * The caller has taken lock_extent(), who could race with us
- * to add em?
- */
- } while (ret == -EEXIST);
-
+ ret = btrfs_replace_extent_map_range(inode, em, true);
if (ret) {
free_extent_map(em);
return ERR_PTR(ret);
@@ -7577,7 +7505,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
block_start = em->block_start + (start - em->start);
if (can_nocow_extent(inode, start, &len, &orig_start,
- &orig_block_len, &ram_bytes, false) == 1) {
+ &orig_block_len, &ram_bytes, false, false) == 1) {
bg = btrfs_inc_nocow_writers(fs_info, block_start);
if (bg)
can_nocow = true;
@@ -7762,7 +7690,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
if (write && !(flags & IOMAP_NOWAIT)) {
ret = btrfs_check_data_free_space(BTRFS_I(inode),
&dio_data->data_reserved,
- start, data_alloc_len);
+ start, data_alloc_len, false);
if (!ret)
dio_data->data_space_reserved = true;
else if (ret && !(BTRFS_I(inode)->flags &
@@ -7884,8 +7812,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
}
if (unlock_extents)
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- lockstart, lockend, &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
else
free_extent_state(cached_state);
@@ -7914,8 +7842,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
return 0;
unlock_err:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
err:
if (dio_data->data_space_reserved) {
btrfs_free_reserved_data_space(BTRFS_I(inode),
@@ -7938,7 +7866,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
if (!write && (iomap->type == IOMAP_HOLE)) {
/* If reading from a hole, unlock and return */
- unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
+ NULL);
return 0;
}
@@ -7950,7 +7879,7 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
pos, length, false);
else
unlock_extent(&BTRFS_I(inode)->io_tree, pos,
- pos + length - 1);
+ pos + length - 1, NULL);
ret = -ENOTBLK;
}
@@ -7969,40 +7898,35 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
return;
if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) {
- btrfs_mark_ordered_io_finished(BTRFS_I(dip->inode), NULL,
+ btrfs_mark_ordered_io_finished(dip->inode, NULL,
dip->file_offset, dip->bytes,
!dip->bio.bi_status);
} else {
- unlock_extent(&BTRFS_I(dip->inode)->io_tree,
+ unlock_extent(&dip->inode->io_tree,
dip->file_offset,
- dip->file_offset + dip->bytes - 1);
+ dip->file_offset + dip->bytes - 1, NULL);
}
kfree(dip->csums);
bio_endio(&dip->bio);
}
-static void submit_dio_repair_bio(struct inode *inode, struct bio *bio,
- int mirror_num,
- enum btrfs_compression_type compress_type)
+void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num)
{
- struct btrfs_dio_private *dip = bio->bi_private;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_dio_private *dip = btrfs_bio(bio)->private;
BUG_ON(bio_op(bio) == REQ_OP_WRITE);
refcount_inc(&dip->refs);
- btrfs_submit_bio(fs_info, bio, mirror_num);
+ btrfs_submit_bio(inode->root->fs_info, bio, mirror_num);
}
static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
struct btrfs_bio *bbio,
const bool uptodate)
{
- struct inode *inode = dip->inode;
+ struct inode *inode = &dip->inode->vfs_inode;
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
- struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
blk_status_t err = BLK_STS_OK;
struct bvec_iter iter;
@@ -8013,17 +7937,15 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
u64 start = bbio->file_offset + offset;
if (uptodate &&
- (!csum || !btrfs_check_data_csum(inode, bbio, offset, bv.bv_page,
- bv.bv_offset))) {
- clean_io_failure(fs_info, failure_tree, io_tree, start,
- bv.bv_page, btrfs_ino(BTRFS_I(inode)),
- bv.bv_offset);
+ (!csum || !btrfs_check_data_csum(BTRFS_I(inode), bbio, offset,
+ bv.bv_page, bv.bv_offset))) {
+ btrfs_clean_io_failure(BTRFS_I(inode), start,
+ bv.bv_page, bv.bv_offset);
} else {
int ret;
- ret = btrfs_repair_one_sector(inode, bbio, offset,
- bv.bv_page, bv.bv_offset,
- submit_dio_repair_bio);
+ ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset,
+ bv.bv_page, bv.bv_offset, false);
if (ret)
err = errno_to_blk_status(ret);
}
@@ -8032,23 +7954,23 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
return err;
}
-static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
- struct bio *bio,
- u64 dio_file_offset)
+blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode,
+ struct bio *bio,
+ u64 dio_file_offset)
{
- return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false);
+ return btrfs_csum_one_bio(inode, bio, dio_file_offset, false);
}
-static void btrfs_end_dio_bio(struct bio *bio)
+static void btrfs_end_dio_bio(struct btrfs_bio *bbio)
{
- struct btrfs_dio_private *dip = bio->bi_private;
- struct btrfs_bio *bbio = btrfs_bio(bio);
+ struct btrfs_dio_private *dip = bbio->private;
+ struct bio *bio = &bbio->bio;
blk_status_t err = bio->bi_status;
if (err)
- btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
+ btrfs_warn(dip->inode->root->fs_info,
"direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d",
- btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio),
+ btrfs_ino(dip->inode), bio_op(bio),
bio->bi_opf, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, err);
@@ -8058,41 +7980,40 @@ static void btrfs_end_dio_bio(struct bio *bio)
if (err)
dip->bio.bi_status = err;
- btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio);
+ btrfs_record_physical_zoned(&dip->inode->vfs_inode, bbio->file_offset, bio);
bio_put(bio);
btrfs_dio_private_put(dip);
}
-static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
+static void btrfs_submit_dio_bio(struct bio *bio, struct btrfs_inode *inode,
u64 file_offset, int async_submit)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_dio_private *dip = bio->bi_private;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_dio_private *dip = btrfs_bio(bio)->private;
blk_status_t ret;
/* Save the original iter for read repair */
if (btrfs_op(bio) == BTRFS_MAP_READ)
btrfs_bio(bio)->iter = bio->bi_iter;
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+ if (inode->flags & BTRFS_INODE_NODATASUM)
goto map;
if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
/* Check btrfs_submit_data_write_bio() for async submit rules */
- if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) &&
+ if (async_submit && !atomic_read(&inode->sync_writers) &&
btrfs_wq_submit_bio(inode, bio, 0, file_offset,
- btrfs_submit_bio_start_direct_io))
+ WQ_SUBMIT_DATA_DIO))
return;
/*
* If we aren't doing async submit, calculate the csum of the
* bio now.
*/
- ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false);
+ ret = btrfs_csum_one_bio(inode, bio, file_offset, false);
if (ret) {
- bio->bi_status = ret;
- bio_endio(bio);
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
return;
}
} else {
@@ -8126,7 +8047,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
struct btrfs_dio_data *dio_data = iter->private;
struct extent_map *em = NULL;
- dip->inode = inode;
+ dip->inode = BTRFS_I(inode);
dip->file_offset = file_offset;
dip->bytes = dio_bio->bi_iter.bi_size;
refcount_set(&dip->refs, 1);
@@ -8142,7 +8063,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
*/
status = BLK_STS_RESOURCE;
dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS);
- if (!dip)
+ if (!dip->csums)
goto out_err;
status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums);
@@ -8175,9 +8096,8 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
* This will never fail as it's passing GPF_NOFS and
* the allocation is backed by btrfs_bioset.
*/
- bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len);
- bio->bi_private = dip;
- bio->bi_end_io = btrfs_end_dio_bio;
+ bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len,
+ btrfs_end_dio_bio, dip);
btrfs_bio(bio)->file_offset = file_offset;
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
@@ -8213,7 +8133,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
async_submit = 1;
}
- btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
+ btrfs_submit_dio_bio(bio, BTRFS_I(inode), file_offset, async_submit);
dio_data->submitted += clone_len;
clone_offset += clone_len;
@@ -8241,13 +8161,21 @@ static const struct iomap_dio_ops btrfs_dio_ops = {
.bio_set = &btrfs_dio_bioset,
};
-ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
+ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
{
struct btrfs_dio_data data;
return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
- IOMAP_DIO_PARTIAL | IOMAP_DIO_NOSYNC,
- &data, done_before);
+ IOMAP_DIO_PARTIAL, &data, done_before);
+}
+
+struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
+ size_t done_before)
+{
+ struct btrfs_dio_data data;
+
+ return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+ IOMAP_DIO_PARTIAL, &data, done_before);
}
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -8259,6 +8187,25 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ret)
return ret;
+ /*
+ * fiemap_prep() called filemap_write_and_wait() for the whole possible
+ * file range (0 to LLONG_MAX), but that is not enough if we have
+ * compression enabled. The first filemap_fdatawrite_range() only kicks
+ * in the compression of data (in an async thread) and will return
+ * before the compression is done and writeback is started. A second
+ * filemap_fdatawrite_range() is needed to wait for the compression to
+ * complete and writeback to start. We also need to wait for ordered
+ * extents to complete, because our fiemap implementation uses mainly
+ * file extent items to list the extents, searching for extent maps
+ * only for file ranges with holes or prealloc extents to figure out
+ * if we have delalloc in those ranges.
+ */
+ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
+ ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
+ if (ret)
+ return ret;
+ }
+
return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
}
@@ -8391,14 +8338,14 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
}
if (!inode_evicting)
- lock_extent_bits(tree, page_start, page_end, &cached_state);
+ lock_extent(tree, page_start, page_end, &cached_state);
cur = page_start;
while (cur < page_end) {
struct btrfs_ordered_extent *ordered;
- bool delete_states;
u64 range_end;
u32 range_len;
+ u32 extra_flags = 0;
ordered = btrfs_lookup_first_ordered_range(inode, cur,
page_end + 1 - cur);
@@ -8408,7 +8355,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
* No ordered extent covering this range, we are safe
* to delete all extent states in the range.
*/
- delete_states = true;
+ extra_flags = EXTENT_CLEAR_ALL_BITS;
goto next;
}
if (ordered->file_offset > cur) {
@@ -8419,7 +8366,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
* the ordered extent in the next iteration.
*/
range_end = ordered->file_offset - 1;
- delete_states = true;
+ extra_flags = EXTENT_CLEAR_ALL_BITS;
goto next;
}
@@ -8434,7 +8381,6 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
* We can't delete the extent states as
* btrfs_finish_ordered_io() may still use some of them.
*/
- delete_states = false;
goto next;
}
btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
@@ -8451,7 +8397,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
clear_extent_bit(tree, cur, range_end,
EXTENT_DELALLOC |
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 1, 0, &cached_state);
+ EXTENT_DEFRAG, &cached_state);
spin_lock_irq(&inode->ordered_tree.lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
@@ -8459,6 +8405,12 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
cur - ordered->file_offset);
spin_unlock_irq(&inode->ordered_tree.lock);
+ /*
+ * If the ordered extent has finished, we're safe to delete all
+ * the extent states of the range, otherwise
+ * btrfs_finish_ordered_io() will get executed by endio for
+ * other pages, so we can't delete extent states.
+ */
if (btrfs_dec_test_ordered_pending(inode, &ordered,
cur, range_end + 1 - cur)) {
btrfs_finish_ordered_io(ordered);
@@ -8466,14 +8418,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
* The ordered extent has finished, now we're again
* safe to delete all extent states of the range.
*/
- delete_states = true;
- } else {
- /*
- * btrfs_finish_ordered_io() will get executed by endio
- * of other pages, thus we can't delete extent states
- * anymore
- */
- delete_states = false;
+ extra_flags = EXTENT_CLEAR_ALL_BITS;
}
next:
if (ordered)
@@ -8497,8 +8442,8 @@ next:
if (!inode_evicting) {
clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_UPTODATE |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1,
- delete_states, &cached_state);
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG |
+ extra_flags, &cached_state);
}
cur = range_end + 1;
}
@@ -8589,11 +8534,11 @@ again:
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, page_start, page_end, &cached_state);
+ lock_extent(io_tree, page_start, page_end, &cached_state);
ret2 = set_page_extent_mapped(page);
if (ret2 < 0) {
ret = vmf_error(ret2);
- unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
goto out_unlock;
}
@@ -8604,8 +8549,7 @@ again:
ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
PAGE_SIZE);
if (ordered) {
- unlock_extent_cached(io_tree, page_start, page_end,
- &cached_state);
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
unlock_page(page);
up_read(&BTRFS_I(inode)->i_mmap_lock);
btrfs_start_ordered_extent(ordered, 1);
@@ -8633,13 +8577,12 @@ again:
*/
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 0, 0, &cached_state);
+ EXTENT_DEFRAG, &cached_state);
ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
&cached_state);
if (ret2) {
- unlock_extent_cached(io_tree, page_start, page_end,
- &cached_state);
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
ret = VM_FAULT_SIGBUS;
goto out_unlock;
}
@@ -8659,7 +8602,7 @@ again:
btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
- unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
up_read(&BTRFS_I(inode)->i_mmap_lock);
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
@@ -8680,16 +8623,16 @@ out_noreserve:
return ret;
}
-static int btrfs_truncate(struct inode *inode, bool skip_writeback)
+static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
{
struct btrfs_truncate_control control = {
- .inode = BTRFS_I(inode),
- .ino = btrfs_ino(BTRFS_I(inode)),
+ .inode = inode,
+ .ino = btrfs_ino(inode),
.min_type = BTRFS_EXTENT_DATA_KEY,
.clear_extent_range = true,
};
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *rsv;
int ret;
struct btrfs_trans_handle *trans;
@@ -8697,7 +8640,8 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
if (!skip_writeback) {
- ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
+ ret = btrfs_wait_ordered_range(&inode->vfs_inode,
+ inode->vfs_inode.i_size & (~mask),
(u64)-1);
if (ret)
return ret;
@@ -8756,34 +8700,32 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
while (1) {
struct extent_state *cached_state = NULL;
- const u64 new_size = inode->i_size;
+ const u64 new_size = inode->vfs_inode.i_size;
const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
control.new_size = new_size;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
- &cached_state);
+ lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
/*
* We want to drop from the next block forward in case this new
* size is not block aligned since we will be keeping the last
* block of the extent just the way it is.
*/
- btrfs_drop_extent_cache(BTRFS_I(inode),
- ALIGN(new_size, fs_info->sectorsize),
- (u64)-1, 0);
+ btrfs_drop_extent_map_range(inode,
+ ALIGN(new_size, fs_info->sectorsize),
+ (u64)-1, false);
ret = btrfs_truncate_inode_items(trans, root, &control);
- inode_sub_bytes(inode, control.sub_bytes);
- btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size);
+ inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
+ btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
- (u64)-1, &cached_state);
+ unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
trans->block_rsv = &fs_info->trans_block_rsv;
if (ret != -ENOSPC && ret != -EAGAIN)
break;
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, root, inode);
if (ret)
break;
@@ -8814,7 +8756,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
- ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
+ ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0);
if (ret)
goto out;
trans = btrfs_start_transaction(root, 1);
@@ -8822,14 +8764,14 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
ret = PTR_ERR(trans);
goto out;
}
- btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
}
if (trans) {
int ret2;
trans->block_rsv = &fs_info->trans_block_rsv;
- ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret2 = btrfs_update_inode(trans, root, inode);
if (ret2 && !ret)
ret = ret2;
@@ -8855,7 +8797,7 @@ out:
* extents beyond i_size to drop.
*/
if (control.extents_found > 0)
- btrfs_set_inode_full_sync(BTRFS_I(inode));
+ btrfs_set_inode_full_sync(inode);
return ret;
}
@@ -8908,6 +8850,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->last_log_commit = 0;
spin_lock_init(&ei->lock);
+ spin_lock_init(&ei->io_failure_lock);
ei->outstanding_extents = 0;
if (sb->s_magic != BTRFS_TEST_MAGIC)
btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
@@ -8923,13 +8866,11 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree);
- extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
- extent_io_tree_init(fs_info, &ei->io_failure_tree,
- IO_TREE_INODE_IO_FAILURE, inode);
+ extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
+ ei->io_tree.inode = ei;
extent_io_tree_init(fs_info, &ei->file_extent_tree,
- IO_TREE_INODE_FILE_EXTENT, inode);
- ei->io_tree.track_uptodate = true;
- ei->io_failure_tree.track_uptodate = true;
+ IO_TREE_INODE_FILE_EXTENT);
+ ei->io_failure_tree = RB_ROOT;
atomic_set(&ei->sync_writers, 0);
mutex_init(&ei->log_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
@@ -8944,7 +8885,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_destroy_inode(struct inode *inode)
{
- btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
+ btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
#endif
@@ -8959,6 +8900,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
struct btrfs_ordered_extent *ordered;
struct btrfs_inode *inode = BTRFS_I(vfs_inode);
struct btrfs_root *root = inode->root;
+ bool freespace_inode;
WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
WARN_ON(vfs_inode->i_data.nrpages);
@@ -8980,6 +8922,12 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
if (!root)
return;
+ /*
+ * If this is a free space inode do not take the ordered extents lockdep
+ * map.
+ */
+ freespace_inode = btrfs_is_free_space_inode(inode);
+
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
if (!ordered)
@@ -8988,6 +8936,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
btrfs_err(root->fs_info,
"found ordered extent %llu %llu on inode cleanup",
ordered->file_offset, ordered->num_bytes);
+
+ if (!freespace_inode)
+ btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);
+
btrfs_remove_ordered_extent(inode, ordered);
btrfs_put_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
@@ -8995,7 +8947,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
}
btrfs_qgroup_check_reserved_leak(inode);
inode_tree_del(inode);
- btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+ btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
btrfs_put_root(inode->root);
}
@@ -9030,10 +8982,6 @@ void __cold btrfs_destroy_cachep(void)
rcu_barrier();
bioset_exit(&btrfs_dio_bioset);
kmem_cache_destroy(btrfs_inode_cachep);
- kmem_cache_destroy(btrfs_trans_handle_cachep);
- kmem_cache_destroy(btrfs_path_cachep);
- kmem_cache_destroy(btrfs_free_space_cachep);
- kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
}
int __init btrfs_init_cachep(void)
@@ -9045,30 +8993,6 @@ int __init btrfs_init_cachep(void)
if (!btrfs_inode_cachep)
goto fail;
- btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
- sizeof(struct btrfs_trans_handle), 0,
- SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
- if (!btrfs_trans_handle_cachep)
- goto fail;
-
- btrfs_path_cachep = kmem_cache_create("btrfs_path",
- sizeof(struct btrfs_path), 0,
- SLAB_MEM_SPREAD, NULL);
- if (!btrfs_path_cachep)
- goto fail;
-
- btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
- sizeof(struct btrfs_free_space), 0,
- SLAB_MEM_SPREAD, NULL);
- if (!btrfs_free_space_cachep)
- goto fail;
-
- btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
- PAGE_SIZE, PAGE_SIZE,
- SLAB_MEM_SPREAD, NULL);
- if (!btrfs_free_space_bitmap_cachep)
- goto fail;
-
if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
offsetof(struct btrfs_dio_private, bio),
BIOSET_NEED_BVECS))
@@ -9144,6 +9068,8 @@ static int btrfs_rename_exchange(struct inode *old_dir,
int ret;
int ret2;
bool need_abort = false;
+ struct fscrypt_name old_fname, new_fname;
+ struct fscrypt_str *old_name, *new_name;
/*
* For non-subvolumes allow exchange only within one subvolume, in the
@@ -9155,6 +9081,19 @@ static int btrfs_rename_exchange(struct inode *old_dir,
new_ino != BTRFS_FIRST_FREE_OBJECTID))
return -EXDEV;
+ ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
+ if (ret)
+ return ret;
+
+ ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
+ if (ret) {
+ fscrypt_free_filename(&old_fname);
+ return ret;
+ }
+
+ old_name = &old_fname.disk_name;
+ new_name = &new_fname.disk_name;
+
/* close the race window with snapshot create/destroy ioctl */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
new_ino == BTRFS_FIRST_FREE_OBJECTID)
@@ -9222,10 +9161,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- ret = btrfs_insert_inode_ref(trans, dest,
- new_dentry->d_name.name,
- new_dentry->d_name.len,
- old_ino,
+ ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino,
btrfs_ino(BTRFS_I(new_dir)),
old_idx);
if (ret)
@@ -9238,10 +9174,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- ret = btrfs_insert_inode_ref(trans, root,
- old_dentry->d_name.name,
- old_dentry->d_name.len,
- new_ino,
+ ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino,
btrfs_ino(BTRFS_I(old_dir)),
new_idx);
if (ret) {
@@ -9272,13 +9205,11 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
- ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
+ ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
} else { /* src is an inode */
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(old_dentry->d_inode),
- old_dentry->d_name.name,
- old_dentry->d_name.len,
- &old_rename_ctx);
+ old_name, &old_rename_ctx);
if (!ret)
ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
}
@@ -9289,13 +9220,11 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* dest is a subvolume */
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
- ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
+ ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
} else { /* dest is an inode */
ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(new_dentry->d_inode),
- new_dentry->d_name.name,
- new_dentry->d_name.len,
- &new_rename_ctx);
+ new_name, &new_rename_ctx);
if (!ret)
ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
}
@@ -9305,16 +9234,14 @@ static int btrfs_rename_exchange(struct inode *old_dir,
}
ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
- new_dentry->d_name.name,
- new_dentry->d_name.len, 0, old_idx);
+ new_name, 0, old_idx);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
- old_dentry->d_name.name,
- old_dentry->d_name.len, 0, new_idx);
+ old_name, 0, new_idx);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
@@ -9357,6 +9284,8 @@ out_notrans:
old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
+ fscrypt_free_filename(&new_fname);
+ fscrypt_free_filename(&old_fname);
return ret;
}
@@ -9396,6 +9325,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
int ret;
int ret2;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
+ struct fscrypt_name old_fname, new_fname;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -9412,22 +9342,28 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
+ ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
+ if (ret)
+ return ret;
+
+ ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
+ if (ret) {
+ fscrypt_free_filename(&old_fname);
+ return ret;
+ }
/* check for collisions, even if the name isn't there */
- ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
- new_dentry->d_name.name,
- new_dentry->d_name.len);
-
+ ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name);
if (ret) {
if (ret == -EEXIST) {
/* we shouldn't get
* eexist without a new_inode */
if (WARN_ON(!new_inode)) {
- return ret;
+ goto out_fscrypt_names;
}
} else {
/* maybe -EOVERFLOW */
- return ret;
+ goto out_fscrypt_names;
}
}
ret = 0;
@@ -9510,11 +9446,9 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- ret = btrfs_insert_inode_ref(trans, dest,
- new_dentry->d_name.name,
- new_dentry->d_name.len,
- old_ino,
- btrfs_ino(BTRFS_I(new_dir)), index);
+ ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name,
+ old_ino, btrfs_ino(BTRFS_I(new_dir)),
+ index);
if (ret)
goto out_fail;
}
@@ -9533,13 +9467,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
BTRFS_I(old_inode), 1);
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
- ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
+ ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
} else {
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
- BTRFS_I(d_inode(old_dentry)),
- old_dentry->d_name.name,
- old_dentry->d_name.len,
- &rename_ctx);
+ BTRFS_I(d_inode(old_dentry)),
+ &old_fname.disk_name, &rename_ctx);
if (!ret)
ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
}
@@ -9553,13 +9485,12 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
new_inode->i_ctime = current_time(new_inode);
if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
- ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
+ ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
BUG_ON(new_inode->i_nlink == 0);
} else {
ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(d_inode(new_dentry)),
- new_dentry->d_name.name,
- new_dentry->d_name.len);
+ &new_fname.disk_name);
}
if (!ret && new_inode->i_nlink == 0)
ret = btrfs_orphan_add(trans,
@@ -9571,8 +9502,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
}
ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
- new_dentry->d_name.name,
- new_dentry->d_name.len, 0, index);
+ &new_fname.disk_name, 0, index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
@@ -9607,6 +9537,9 @@ out_notrans:
out_whiteout_inode:
if (flags & RENAME_WHITEOUT)
iput(whiteout_args.inode);
+out_fscrypt_names:
+ fscrypt_free_filename(&old_fname);
+ fscrypt_free_filename(&new_fname);
return ret;
}
@@ -9726,7 +9659,7 @@ static int start_delalloc_inodes(struct btrfs_root *root,
&work->work);
} else {
ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
- btrfs_add_delayed_iput(inode);
+ btrfs_add_delayed_iput(BTRFS_I(inode));
if (ret || wbc->nr_to_write <= 0)
goto out;
}
@@ -10008,7 +9941,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key ins;
@@ -10064,11 +9996,10 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
break;
}
- btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
- cur_offset + ins.offset -1, 0);
-
em = alloc_extent_map();
if (!em) {
+ btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
+ cur_offset + ins.offset - 1, false);
btrfs_set_inode_full_sync(BTRFS_I(inode));
goto next;
}
@@ -10083,16 +10014,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
em->generation = trans->transid;
- while (1) {
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 1);
- write_unlock(&em_tree->lock);
- if (ret != -EEXIST)
- break;
- btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
- cur_offset + ins.offset - 1,
- 0);
- }
+ ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
free_extent_map(em);
next:
num_bytes -= ins.offset;
@@ -10168,7 +10090,7 @@ static int btrfs_permission(struct user_namespace *mnt_userns,
}
static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+ struct file *file, umode_t mode)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct btrfs_trans_handle *trans;
@@ -10176,7 +10098,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
struct inode *inode;
struct btrfs_new_inode_args new_inode_args = {
.dir = dir,
- .dentry = dentry,
+ .dentry = file->f_path.dentry,
.orphan = true,
};
unsigned int trans_num_items;
@@ -10213,7 +10135,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
set_nlink(inode, 1);
if (!ret) {
- d_tmpfile(dentry, inode);
+ d_tmpfile(file, inode);
unlock_new_inode(inode);
mark_inode_dirty(inode);
}
@@ -10225,7 +10147,7 @@ out_new_inode_args:
out_inode:
if (ret)
iput(inode);
- return ret;
+ return finish_open_simple(file, ret);
}
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
@@ -10346,8 +10268,8 @@ static ssize_t btrfs_encoded_read_inline(
}
read_extent_buffer(leaf, tmp, ptr, count);
btrfs_release_path(path);
- unlock_extent_cached(io_tree, start, lockend, cached_state);
- btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
*unlocked = true;
ret = copy_to_iter(tmp, count, iter);
@@ -10371,7 +10293,7 @@ struct btrfs_encoded_read_private {
static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
struct bio *bio, int mirror_num)
{
- struct btrfs_encoded_read_private *priv = bio->bi_private;
+ struct btrfs_encoded_read_private *priv = btrfs_bio(bio)->private;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
blk_status_t ret;
@@ -10389,7 +10311,7 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
{
const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK);
- struct btrfs_encoded_read_private *priv = bbio->bio.bi_private;
+ struct btrfs_encoded_read_private *priv = bbio->private;
struct btrfs_inode *inode = priv->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
u32 sectorsize = fs_info->sectorsize;
@@ -10407,7 +10329,7 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
pgoff = bvec->bv_offset;
for (i = 0; i < nr_sectors; i++) {
ASSERT(pgoff < PAGE_SIZE);
- if (btrfs_check_data_csum(&inode->vfs_inode, bbio, bio_offset,
+ if (btrfs_check_data_csum(inode, bbio, bio_offset,
bvec->bv_page, pgoff))
return BLK_STS_IOERR;
bio_offset += sectorsize;
@@ -10417,10 +10339,9 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
return BLK_STS_OK;
}
-static void btrfs_encoded_read_endio(struct bio *bio)
+static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
{
- struct btrfs_encoded_read_private *priv = bio->bi_private;
- struct btrfs_bio *bbio = btrfs_bio(bio);
+ struct btrfs_encoded_read_private *priv = bbio->private;
blk_status_t status;
status = btrfs_encoded_read_verify_csum(bbio);
@@ -10438,7 +10359,7 @@ static void btrfs_encoded_read_endio(struct bio *bio)
if (!atomic_dec_return(&priv->pending))
wake_up(&priv->wait);
btrfs_bio_free_csum(bbio);
- bio_put(bio);
+ bio_put(&bbio->bio);
}
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
@@ -10485,12 +10406,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
size_t bytes = min_t(u64, remaining, PAGE_SIZE);
if (!bio) {
- bio = btrfs_bio_alloc(BIO_MAX_VECS);
+ bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ,
+ btrfs_encoded_read_endio,
+ &priv);
bio->bi_iter.bi_sector =
(disk_bytenr + cur) >> SECTOR_SHIFT;
- bio->bi_end_io = btrfs_encoded_read_endio;
- bio->bi_private = &priv;
- bio->bi_opf = REQ_OP_READ;
}
if (!bytes ||
@@ -10551,8 +10471,8 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
if (ret)
goto out;
- unlock_extent_cached(io_tree, start, lockend, cached_state);
- btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
*unlocked = true;
if (compressed) {
@@ -10601,10 +10521,10 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
file_accessed(iocb->ki_filp);
- btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
if (iocb->ki_pos >= inode->vfs_inode.i_size) {
- btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
return 0;
}
start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
@@ -10621,13 +10541,13 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
lockend - start + 1);
if (ret)
goto out_unlock_inode;
- lock_extent_bits(io_tree, start, lockend, &cached_state);
+ lock_extent(io_tree, start, lockend, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, start,
lockend - start + 1);
if (!ordered)
break;
btrfs_put_ordered_extent(ordered);
- unlock_extent_cached(io_tree, start, lockend, &cached_state);
+ unlock_extent(io_tree, start, lockend, &cached_state);
cond_resched();
}
@@ -10701,8 +10621,8 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
em = NULL;
if (disk_bytenr == EXTENT_MAP_HOLE) {
- unlock_extent_cached(io_tree, start, lockend, &cached_state);
- btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
unlocked = true;
ret = iov_iter_zero(count, iter);
if (ret != count)
@@ -10722,10 +10642,10 @@ out_em:
free_extent_map(em);
out_unlock_extent:
if (!unlocked)
- unlock_extent_cached(io_tree, start, lockend, &cached_state);
+ unlock_extent(io_tree, start, lockend, &cached_state);
out_unlock_inode:
if (!unlocked)
- btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
return ret;
}
@@ -10860,14 +10780,14 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
end >> PAGE_SHIFT);
if (ret)
goto out_pages;
- lock_extent_bits(io_tree, start, end, &cached_state);
+ lock_extent(io_tree, start, end, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
if (!ordered &&
!filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
break;
if (ordered)
btrfs_put_ordered_extent(ordered);
- unlock_extent_cached(io_tree, start, end, &cached_state);
+ unlock_extent(io_tree, start, end, &cached_state);
cond_resched();
}
@@ -10921,7 +10841,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
(1 << BTRFS_ORDERED_COMPRESSED),
compression);
if (ret) {
- btrfs_drop_extent_cache(inode, start, end, 0);
+ btrfs_drop_extent_map_range(inode, start, end, false);
goto out_free_reserved;
}
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
@@ -10929,7 +10849,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
if (start + encoded->len > inode->vfs_inode.i_size)
i_size_write(&inode->vfs_inode, start + encoded->len);
- unlock_extent_cached(io_tree, start, end, &cached_state);
+ unlock_extent(io_tree, start, end, &cached_state);
btrfs_delalloc_release_extents(inode, num_bytes);
@@ -10960,7 +10880,7 @@ out_free_data_space:
if (!extent_reserved)
btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
out_unlock:
- unlock_extent_cached(io_tree, start, end, &cached_state);
+ unlock_extent(io_tree, start, end, &cached_state);
out_pages:
for (i = 0; i < nr_pages; i++) {
if (pages[i])
@@ -11201,7 +11121,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
- lock_extent_bits(io_tree, 0, isize - 1, &cached_state);
+ lock_extent(io_tree, 0, isize - 1, &cached_state);
start = 0;
while (start < isize) {
u64 logical_block_start, physical_block_start;
@@ -11242,7 +11162,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
free_extent_map(em);
em = NULL;
- ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true);
+ ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true);
if (ret < 0) {
goto out;
} else if (ret) {
@@ -11338,7 +11258,7 @@ out:
if (!IS_ERR_OR_NULL(em))
free_extent_map(em);
- unlock_extent_cached(io_tree, 0, isize - 1, &cached_state);
+ unlock_extent(io_tree, 0, isize - 1, &cached_state);
if (ret)
btrfs_swap_deactivate(file);
@@ -11391,7 +11311,7 @@ void btrfs_update_inode_bytes(struct btrfs_inode *inode,
spin_unlock(&inode->lock);
}
-/**
+/*
* Verify that there are no ordered extents for a given file range.
*
* @inode: The target inode.
@@ -11440,7 +11360,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
.mknod = btrfs_mknod,
.listxattr = btrfs_listxattr,
.permission = btrfs_permission,
- .get_acl = btrfs_get_acl,
+ .get_inode_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
.tmpfile = btrfs_tmpfile,
@@ -11493,7 +11413,7 @@ static const struct inode_operations btrfs_file_inode_operations = {
.listxattr = btrfs_listxattr,
.permission = btrfs_permission,
.fiemap = btrfs_fiemap,
- .get_acl = btrfs_get_acl,
+ .get_inode_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
.fileattr_get = btrfs_fileattr_get,
@@ -11504,7 +11424,7 @@ static const struct inode_operations btrfs_special_inode_operations = {
.setattr = btrfs_setattr,
.permission = btrfs_permission,
.listxattr = btrfs_listxattr,
- .get_acl = btrfs_get_acl,
+ .get_inode_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index fe0cc816b4eb..7e348bd2ccde 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -50,6 +50,17 @@
#include "delalloc-space.h"
#include "block-group.h"
#include "subpage.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "root-tree.h"
+#include "defrag.h"
+#include "dir-item.h"
+#include "uuid-tree.h"
+#include "ioctl.h"
+#include "file.h"
+#include "scrub.h"
+#include "super.h"
#ifdef CONFIG_64BIT
/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
@@ -949,6 +960,7 @@ static noinline int btrfs_mksubvol(const struct path *parent,
struct inode *dir = d_inode(parent->dentry);
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct dentry *dentry;
+ struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen);
int error;
error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
@@ -969,8 +981,7 @@ static noinline int btrfs_mksubvol(const struct path *parent,
* check for them now when we can safely fail
*/
error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
- dir->i_ino, name,
- namelen);
+ dir->i_ino, &name_str);
if (error)
goto out_dput;
@@ -991,7 +1002,7 @@ out_up_read:
out_dput:
dput(dentry);
out_unlock:
- btrfs_inode_unlock(dir, 0);
+ btrfs_inode_unlock(BTRFS_I(dir), 0);
return error;
}
@@ -1036,908 +1047,6 @@ out:
}
/*
- * Defrag specific helper to get an extent map.
- *
- * Differences between this and btrfs_get_extent() are:
- *
- * - No extent_map will be added to inode->extent_tree
- * To reduce memory usage in the long run.
- *
- * - Extra optimization to skip file extents older than @newer_than
- * By using btrfs_search_forward() we can skip entire file ranges that
- * have extents created in past transactions, because btrfs_search_forward()
- * will not visit leaves and nodes with a generation smaller than given
- * minimal generation threshold (@newer_than).
- *
- * Return valid em if we find a file extent matching the requirement.
- * Return NULL if we can not find a file extent matching the requirement.
- *
- * Return ERR_PTR() for error.
- */
-static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
- u64 start, u64 newer_than)
-{
- struct btrfs_root *root = inode->root;
- struct btrfs_file_extent_item *fi;
- struct btrfs_path path = { 0 };
- struct extent_map *em;
- struct btrfs_key key;
- u64 ino = btrfs_ino(inode);
- int ret;
-
- em = alloc_extent_map();
- if (!em) {
- ret = -ENOMEM;
- goto err;
- }
-
- key.objectid = ino;
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = start;
-
- if (newer_than) {
- ret = btrfs_search_forward(root, &key, &path, newer_than);
- if (ret < 0)
- goto err;
- /* Can't find anything newer */
- if (ret > 0)
- goto not_found;
- } else {
- ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
- if (ret < 0)
- goto err;
- }
- if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
- /*
- * If btrfs_search_slot() makes path to point beyond nritems,
- * we should not have an empty leaf, as this inode must at
- * least have its INODE_ITEM.
- */
- ASSERT(btrfs_header_nritems(path.nodes[0]));
- path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
- }
- btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
- /* Perfect match, no need to go one slot back */
- if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
- key.offset == start)
- goto iterate;
-
- /* We didn't find a perfect match, needs to go one slot back */
- if (path.slots[0] > 0) {
- btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
- if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
- path.slots[0]--;
- }
-
-iterate:
- /* Iterate through the path to find a file extent covering @start */
- while (true) {
- u64 extent_end;
-
- if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
- goto next;
-
- btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
-
- /*
- * We may go one slot back to INODE_REF/XATTR item, then
- * need to go forward until we reach an EXTENT_DATA.
- * But we should still has the correct ino as key.objectid.
- */
- if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
- goto next;
-
- /* It's beyond our target range, definitely not extent found */
- if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
- goto not_found;
-
- /*
- * | |<- File extent ->|
- * \- start
- *
- * This means there is a hole between start and key.offset.
- */
- if (key.offset > start) {
- em->start = start;
- em->orig_start = start;
- em->block_start = EXTENT_MAP_HOLE;
- em->len = key.offset - start;
- break;
- }
-
- fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
- struct btrfs_file_extent_item);
- extent_end = btrfs_file_extent_end(&path);
-
- /*
- * |<- file extent ->| |
- * \- start
- *
- * We haven't reached start, search next slot.
- */
- if (extent_end <= start)
- goto next;
-
- /* Now this extent covers @start, convert it to em */
- btrfs_extent_item_to_extent_map(inode, &path, fi, false, em);
- break;
-next:
- ret = btrfs_next_item(root, &path);
- if (ret < 0)
- goto err;
- if (ret > 0)
- goto not_found;
- }
- btrfs_release_path(&path);
- return em;
-
-not_found:
- btrfs_release_path(&path);
- free_extent_map(em);
- return NULL;
-
-err:
- btrfs_release_path(&path);
- free_extent_map(em);
- return ERR_PTR(ret);
-}
-
-static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
- u64 newer_than, bool locked)
-{
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct extent_map *em;
- const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
-
- /*
- * hopefully we have this extent in the tree already, try without
- * the full extent lock
- */
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, sectorsize);
- read_unlock(&em_tree->lock);
-
- /*
- * We can get a merged extent, in that case, we need to re-search
- * tree to get the original em for defrag.
- *
- * If @newer_than is 0 or em::generation < newer_than, we can trust
- * this em, as either we don't care about the generation, or the
- * merged extent map will be rejected anyway.
- */
- if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
- newer_than && em->generation >= newer_than) {
- free_extent_map(em);
- em = NULL;
- }
-
- if (!em) {
- struct extent_state *cached = NULL;
- u64 end = start + sectorsize - 1;
-
- /* get the big lock and read metadata off disk */
- if (!locked)
- lock_extent_bits(io_tree, start, end, &cached);
- em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
- if (!locked)
- unlock_extent_cached(io_tree, start, end, &cached);
-
- if (IS_ERR(em))
- return NULL;
- }
-
- return em;
-}
-
-static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
- const struct extent_map *em)
-{
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
- return BTRFS_MAX_COMPRESSED;
- return fs_info->max_extent_size;
-}
-
-static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
- u32 extent_thresh, u64 newer_than, bool locked)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_map *next;
- bool ret = false;
-
- /* this is the last extent */
- if (em->start + em->len >= i_size_read(inode))
- return false;
-
- /*
- * Here we need to pass @newer_then when checking the next extent, or
- * we will hit a case we mark current extent for defrag, but the next
- * one will not be a target.
- * This will just cause extra IO without really reducing the fragments.
- */
- next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
- /* No more em or hole */
- if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
- goto out;
- if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
- goto out;
- /*
- * If the next extent is at its max capacity, defragging current extent
- * makes no sense, as the total number of extents won't change.
- */
- if (next->len >= get_extent_max_capacity(fs_info, em))
- goto out;
- /* Skip older extent */
- if (next->generation < newer_than)
- goto out;
- /* Also check extent size */
- if (next->len >= extent_thresh)
- goto out;
-
- ret = true;
-out:
- free_extent_map(next);
- return ret;
-}
-
-/*
- * Prepare one page to be defragged.
- *
- * This will ensure:
- *
- * - Returned page is locked and has been set up properly.
- * - No ordered extent exists in the page.
- * - The page is uptodate.
- *
- * NOTE: Caller should also wait for page writeback after the cluster is
- * prepared, here we don't do writeback wait for each page.
- */
-static struct page *defrag_prepare_one_page(struct btrfs_inode *inode,
- pgoff_t index)
-{
- struct address_space *mapping = inode->vfs_inode.i_mapping;
- gfp_t mask = btrfs_alloc_write_mask(mapping);
- u64 page_start = (u64)index << PAGE_SHIFT;
- u64 page_end = page_start + PAGE_SIZE - 1;
- struct extent_state *cached_state = NULL;
- struct page *page;
- int ret;
-
-again:
- page = find_or_create_page(mapping, index, mask);
- if (!page)
- return ERR_PTR(-ENOMEM);
-
- /*
- * Since we can defragment files opened read-only, we can encounter
- * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
- * can't do I/O using huge pages yet, so return an error for now.
- * Filesystem transparent huge pages are typically only used for
- * executables that explicitly enable them, so this isn't very
- * restrictive.
- */
- if (PageCompound(page)) {
- unlock_page(page);
- put_page(page);
- return ERR_PTR(-ETXTBSY);
- }
-
- ret = set_page_extent_mapped(page);
- if (ret < 0) {
- unlock_page(page);
- put_page(page);
- return ERR_PTR(ret);
- }
-
- /* Wait for any existing ordered extent in the range */
- while (1) {
- struct btrfs_ordered_extent *ordered;
-
- lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
- ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
- unlock_extent_cached(&inode->io_tree, page_start, page_end,
- &cached_state);
- if (!ordered)
- break;
-
- unlock_page(page);
- btrfs_start_ordered_extent(ordered, 1);
- btrfs_put_ordered_extent(ordered);
- lock_page(page);
- /*
- * We unlocked the page above, so we need check if it was
- * released or not.
- */
- if (page->mapping != mapping || !PagePrivate(page)) {
- unlock_page(page);
- put_page(page);
- goto again;
- }
- }
-
- /*
- * Now the page range has no ordered extent any more. Read the page to
- * make it uptodate.
- */
- if (!PageUptodate(page)) {
- btrfs_read_folio(NULL, page_folio(page));
- lock_page(page);
- if (page->mapping != mapping || !PagePrivate(page)) {
- unlock_page(page);
- put_page(page);
- goto again;
- }
- if (!PageUptodate(page)) {
- unlock_page(page);
- put_page(page);
- return ERR_PTR(-EIO);
- }
- }
- return page;
-}
-
-struct defrag_target_range {
- struct list_head list;
- u64 start;
- u64 len;
-};
-
-/*
- * Collect all valid target extents.
- *
- * @start: file offset to lookup
- * @len: length to lookup
- * @extent_thresh: file extent size threshold, any extent size >= this value
- * will be ignored
- * @newer_than: only defrag extents newer than this value
- * @do_compress: whether the defrag is doing compression
- * if true, @extent_thresh will be ignored and all regular
- * file extents meeting @newer_than will be targets.
- * @locked: if the range has already held extent lock
- * @target_list: list of targets file extents
- */
-static int defrag_collect_targets(struct btrfs_inode *inode,
- u64 start, u64 len, u32 extent_thresh,
- u64 newer_than, bool do_compress,
- bool locked, struct list_head *target_list,
- u64 *last_scanned_ret)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- bool last_is_target = false;
- u64 cur = start;
- int ret = 0;
-
- while (cur < start + len) {
- struct extent_map *em;
- struct defrag_target_range *new;
- bool next_mergeable = true;
- u64 range_len;
-
- last_is_target = false;
- em = defrag_lookup_extent(&inode->vfs_inode, cur,
- newer_than, locked);
- if (!em)
- break;
-
- /*
- * If the file extent is an inlined one, we may still want to
- * defrag it (fallthrough) if it will cause a regular extent.
- * This is for users who want to convert inline extents to
- * regular ones through max_inline= mount option.
- */
- if (em->block_start == EXTENT_MAP_INLINE &&
- em->len <= inode->root->fs_info->max_inline)
- goto next;
-
- /* Skip hole/delalloc/preallocated extents */
- if (em->block_start == EXTENT_MAP_HOLE ||
- em->block_start == EXTENT_MAP_DELALLOC ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- goto next;
-
- /* Skip older extent */
- if (em->generation < newer_than)
- goto next;
-
- /* This em is under writeback, no need to defrag */
- if (em->generation == (u64)-1)
- goto next;
-
- /*
- * Our start offset might be in the middle of an existing extent
- * map, so take that into account.
- */
- range_len = em->len - (cur - em->start);
- /*
- * If this range of the extent map is already flagged for delalloc,
- * skip it, because:
- *
- * 1) We could deadlock later, when trying to reserve space for
- * delalloc, because in case we can't immediately reserve space
- * the flusher can start delalloc and wait for the respective
- * ordered extents to complete. The deadlock would happen
- * because we do the space reservation while holding the range
- * locked, and starting writeback, or finishing an ordered
- * extent, requires locking the range;
- *
- * 2) If there's delalloc there, it means there's dirty pages for
- * which writeback has not started yet (we clean the delalloc
- * flag when starting writeback and after creating an ordered
- * extent). If we mark pages in an adjacent range for defrag,
- * then we will have a larger contiguous range for delalloc,
- * very likely resulting in a larger extent after writeback is
- * triggered (except in a case of free space fragmentation).
- */
- if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
- EXTENT_DELALLOC, 0, NULL))
- goto next;
-
- /*
- * For do_compress case, we want to compress all valid file
- * extents, thus no @extent_thresh or mergeable check.
- */
- if (do_compress)
- goto add;
-
- /* Skip too large extent */
- if (range_len >= extent_thresh)
- goto next;
-
- /*
- * Skip extents already at its max capacity, this is mostly for
- * compressed extents, which max cap is only 128K.
- */
- if (em->len >= get_extent_max_capacity(fs_info, em))
- goto next;
-
- /*
- * Normally there are no more extents after an inline one, thus
- * @next_mergeable will normally be false and not defragged.
- * So if an inline extent passed all above checks, just add it
- * for defrag, and be converted to regular extents.
- */
- if (em->block_start == EXTENT_MAP_INLINE)
- goto add;
-
- next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
- extent_thresh, newer_than, locked);
- if (!next_mergeable) {
- struct defrag_target_range *last;
-
- /* Empty target list, no way to merge with last entry */
- if (list_empty(target_list))
- goto next;
- last = list_entry(target_list->prev,
- struct defrag_target_range, list);
- /* Not mergeable with last entry */
- if (last->start + last->len != cur)
- goto next;
-
- /* Mergeable, fall through to add it to @target_list. */
- }
-
-add:
- last_is_target = true;
- range_len = min(extent_map_end(em), start + len) - cur;
- /*
- * This one is a good target, check if it can be merged into
- * last range of the target list.
- */
- if (!list_empty(target_list)) {
- struct defrag_target_range *last;
-
- last = list_entry(target_list->prev,
- struct defrag_target_range, list);
- ASSERT(last->start + last->len <= cur);
- if (last->start + last->len == cur) {
- /* Mergeable, enlarge the last entry */
- last->len += range_len;
- goto next;
- }
- /* Fall through to allocate a new entry */
- }
-
- /* Allocate new defrag_target_range */
- new = kmalloc(sizeof(*new), GFP_NOFS);
- if (!new) {
- free_extent_map(em);
- ret = -ENOMEM;
- break;
- }
- new->start = cur;
- new->len = range_len;
- list_add_tail(&new->list, target_list);
-
-next:
- cur = extent_map_end(em);
- free_extent_map(em);
- }
- if (ret < 0) {
- struct defrag_target_range *entry;
- struct defrag_target_range *tmp;
-
- list_for_each_entry_safe(entry, tmp, target_list, list) {
- list_del_init(&entry->list);
- kfree(entry);
- }
- }
- if (!ret && last_scanned_ret) {
- /*
- * If the last extent is not a target, the caller can skip to
- * the end of that extent.
- * Otherwise, we can only go the end of the specified range.
- */
- if (!last_is_target)
- *last_scanned_ret = max(cur, *last_scanned_ret);
- else
- *last_scanned_ret = max(start + len, *last_scanned_ret);
- }
- return ret;
-}
-
-#define CLUSTER_SIZE (SZ_256K)
-static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
-
-/*
- * Defrag one contiguous target range.
- *
- * @inode: target inode
- * @target: target range to defrag
- * @pages: locked pages covering the defrag range
- * @nr_pages: number of locked pages
- *
- * Caller should ensure:
- *
- * - Pages are prepared
- * Pages should be locked, no ordered extent in the pages range,
- * no writeback.
- *
- * - Extent bits are locked
- */
-static int defrag_one_locked_target(struct btrfs_inode *inode,
- struct defrag_target_range *target,
- struct page **pages, int nr_pages,
- struct extent_state **cached_state)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct extent_changeset *data_reserved = NULL;
- const u64 start = target->start;
- const u64 len = target->len;
- unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
- unsigned long start_index = start >> PAGE_SHIFT;
- unsigned long first_index = page_index(pages[0]);
- int ret = 0;
- int i;
-
- ASSERT(last_index - first_index + 1 <= nr_pages);
-
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
- if (ret < 0)
- return ret;
- clear_extent_bit(&inode->io_tree, start, start + len - 1,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 0, 0, cached_state);
- set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);
-
- /* Update the page status */
- for (i = start_index - first_index; i <= last_index - first_index; i++) {
- ClearPageChecked(pages[i]);
- btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
- }
- btrfs_delalloc_release_extents(inode, len);
- extent_changeset_free(data_reserved);
-
- return ret;
-}
-
-static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
- u32 extent_thresh, u64 newer_than, bool do_compress,
- u64 *last_scanned_ret)
-{
- struct extent_state *cached_state = NULL;
- struct defrag_target_range *entry;
- struct defrag_target_range *tmp;
- LIST_HEAD(target_list);
- struct page **pages;
- const u32 sectorsize = inode->root->fs_info->sectorsize;
- u64 last_index = (start + len - 1) >> PAGE_SHIFT;
- u64 start_index = start >> PAGE_SHIFT;
- unsigned int nr_pages = last_index - start_index + 1;
- int ret = 0;
- int i;
-
- ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
- ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
-
- pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
- if (!pages)
- return -ENOMEM;
-
- /* Prepare all pages */
- for (i = 0; i < nr_pages; i++) {
- pages[i] = defrag_prepare_one_page(inode, start_index + i);
- if (IS_ERR(pages[i])) {
- ret = PTR_ERR(pages[i]);
- pages[i] = NULL;
- goto free_pages;
- }
- }
- for (i = 0; i < nr_pages; i++)
- wait_on_page_writeback(pages[i]);
-
- /* Lock the pages range */
- lock_extent_bits(&inode->io_tree, start_index << PAGE_SHIFT,
- (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
- &cached_state);
- /*
- * Now we have a consistent view about the extent map, re-check
- * which range really needs to be defragged.
- *
- * And this time we have extent locked already, pass @locked = true
- * so that we won't relock the extent range and cause deadlock.
- */
- ret = defrag_collect_targets(inode, start, len, extent_thresh,
- newer_than, do_compress, true,
- &target_list, last_scanned_ret);
- if (ret < 0)
- goto unlock_extent;
-
- list_for_each_entry(entry, &target_list, list) {
- ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
- &cached_state);
- if (ret < 0)
- break;
- }
-
- list_for_each_entry_safe(entry, tmp, &target_list, list) {
- list_del_init(&entry->list);
- kfree(entry);
- }
-unlock_extent:
- unlock_extent_cached(&inode->io_tree, start_index << PAGE_SHIFT,
- (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
- &cached_state);
-free_pages:
- for (i = 0; i < nr_pages; i++) {
- if (pages[i]) {
- unlock_page(pages[i]);
- put_page(pages[i]);
- }
- }
- kfree(pages);
- return ret;
-}
-
-static int defrag_one_cluster(struct btrfs_inode *inode,
- struct file_ra_state *ra,
- u64 start, u32 len, u32 extent_thresh,
- u64 newer_than, bool do_compress,
- unsigned long *sectors_defragged,
- unsigned long max_sectors,
- u64 *last_scanned_ret)
-{
- const u32 sectorsize = inode->root->fs_info->sectorsize;
- struct defrag_target_range *entry;
- struct defrag_target_range *tmp;
- LIST_HEAD(target_list);
- int ret;
-
- ret = defrag_collect_targets(inode, start, len, extent_thresh,
- newer_than, do_compress, false,
- &target_list, NULL);
- if (ret < 0)
- goto out;
-
- list_for_each_entry(entry, &target_list, list) {
- u32 range_len = entry->len;
-
- /* Reached or beyond the limit */
- if (max_sectors && *sectors_defragged >= max_sectors) {
- ret = 1;
- break;
- }
-
- if (max_sectors)
- range_len = min_t(u32, range_len,
- (max_sectors - *sectors_defragged) * sectorsize);
-
- /*
- * If defrag_one_range() has updated last_scanned_ret,
- * our range may already be invalid (e.g. hole punched).
- * Skip if our range is before last_scanned_ret, as there is
- * no need to defrag the range anymore.
- */
- if (entry->start + range_len <= *last_scanned_ret)
- continue;
-
- if (ra)
- page_cache_sync_readahead(inode->vfs_inode.i_mapping,
- ra, NULL, entry->start >> PAGE_SHIFT,
- ((entry->start + range_len - 1) >> PAGE_SHIFT) -
- (entry->start >> PAGE_SHIFT) + 1);
- /*
- * Here we may not defrag any range if holes are punched before
- * we locked the pages.
- * But that's fine, it only affects the @sectors_defragged
- * accounting.
- */
- ret = defrag_one_range(inode, entry->start, range_len,
- extent_thresh, newer_than, do_compress,
- last_scanned_ret);
- if (ret < 0)
- break;
- *sectors_defragged += range_len >>
- inode->root->fs_info->sectorsize_bits;
- }
-out:
- list_for_each_entry_safe(entry, tmp, &target_list, list) {
- list_del_init(&entry->list);
- kfree(entry);
- }
- if (ret >= 0)
- *last_scanned_ret = max(*last_scanned_ret, start + len);
- return ret;
-}
-
-/*
- * Entry point to file defragmentation.
- *
- * @inode: inode to be defragged
- * @ra: readahead state (can be NUL)
- * @range: defrag options including range and flags
- * @newer_than: minimum transid to defrag
- * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
- * will be defragged.
- *
- * Return <0 for error.
- * Return >=0 for the number of sectors defragged, and range->start will be updated
- * to indicate the file offset where next defrag should be started at.
- * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without
- * defragging all the range).
- */
-int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
- struct btrfs_ioctl_defrag_range_args *range,
- u64 newer_than, unsigned long max_to_defrag)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- unsigned long sectors_defragged = 0;
- u64 isize = i_size_read(inode);
- u64 cur;
- u64 last_byte;
- bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
- bool ra_allocated = false;
- int compress_type = BTRFS_COMPRESS_ZLIB;
- int ret = 0;
- u32 extent_thresh = range->extent_thresh;
- pgoff_t start_index;
-
- if (isize == 0)
- return 0;
-
- if (range->start >= isize)
- return -EINVAL;
-
- if (do_compress) {
- if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
- return -EINVAL;
- if (range->compress_type)
- compress_type = range->compress_type;
- }
-
- if (extent_thresh == 0)
- extent_thresh = SZ_256K;
-
- if (range->start + range->len > range->start) {
- /* Got a specific range */
- last_byte = min(isize, range->start + range->len);
- } else {
- /* Defrag until file end */
- last_byte = isize;
- }
-
- /* Align the range */
- cur = round_down(range->start, fs_info->sectorsize);
- last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
-
- /*
- * If we were not given a ra, allocate a readahead context. As
- * readahead is just an optimization, defrag will work without it so
- * we don't error out.
- */
- if (!ra) {
- ra_allocated = true;
- ra = kzalloc(sizeof(*ra), GFP_KERNEL);
- if (ra)
- file_ra_state_init(ra, inode->i_mapping);
- }
-
- /*
- * Make writeback start from the beginning of the range, so that the
- * defrag range can be written sequentially.
- */
- start_index = cur >> PAGE_SHIFT;
- if (start_index < inode->i_mapping->writeback_index)
- inode->i_mapping->writeback_index = start_index;
-
- while (cur < last_byte) {
- const unsigned long prev_sectors_defragged = sectors_defragged;
- u64 last_scanned = cur;
- u64 cluster_end;
-
- if (btrfs_defrag_cancelled(fs_info)) {
- ret = -EAGAIN;
- break;
- }
-
- /* We want the cluster end at page boundary when possible */
- cluster_end = (((cur >> PAGE_SHIFT) +
- (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
- cluster_end = min(cluster_end, last_byte);
-
- btrfs_inode_lock(inode, 0);
- if (IS_SWAPFILE(inode)) {
- ret = -ETXTBSY;
- btrfs_inode_unlock(inode, 0);
- break;
- }
- if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
- btrfs_inode_unlock(inode, 0);
- break;
- }
- if (do_compress)
- BTRFS_I(inode)->defrag_compress = compress_type;
- ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
- cluster_end + 1 - cur, extent_thresh,
- newer_than, do_compress, &sectors_defragged,
- max_to_defrag, &last_scanned);
-
- if (sectors_defragged > prev_sectors_defragged)
- balance_dirty_pages_ratelimited(inode->i_mapping);
-
- btrfs_inode_unlock(inode, 0);
- if (ret < 0)
- break;
- cur = max(cluster_end + 1, last_scanned);
- if (ret > 0) {
- ret = 0;
- break;
- }
- cond_resched();
- }
-
- if (ra_allocated)
- kfree(ra);
- /*
- * Update range.start for autodefrag, this will indicate where to start
- * in next run.
- */
- range->start = cur;
- if (sectors_defragged) {
- /*
- * We have defragged some sectors, for compression case they
- * need to be written back immediately.
- */
- if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
- filemap_flush(inode->i_mapping);
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- filemap_flush(inode->i_mapping);
- }
- if (range->compress_type == BTRFS_COMPRESS_LZO)
- btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
- else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
- btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
- ret = sectors_defragged;
- }
- if (do_compress) {
- btrfs_inode_lock(inode, 0);
- BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
- btrfs_inode_unlock(inode, 0);
- }
- return ret;
-}
-
-/*
* Try to start exclusive operation @type or cancel it if it's running.
*
* Return:
@@ -2119,7 +1228,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (ret == 0 && new_size != old_size)
btrfs_info_in_rcu(fs_info,
"resize device %s (devid %llu) from %llu to %llu",
- rcu_str_deref(device->name), device->devid,
+ btrfs_dev_name(device), device->devid,
old_size, new_size);
out_finish:
btrfs_exclop_finish(fs_info);
@@ -3105,6 +2214,8 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
}
}
+ btrfs_free_path(path);
+ path = NULL;
if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
ret = -EFAULT;
@@ -3194,6 +2305,8 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
}
out:
+ btrfs_free_path(path);
+
if (!ret || ret == -EOVERFLOW) {
rootrefs->num_items = found;
/* update min_treeid for next search */
@@ -3205,7 +2318,6 @@ out:
}
kfree(rootrefs);
- btrfs_free_path(path);
return ret;
}
@@ -3271,7 +2383,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
dentry = btrfs_get_dentry(fs_info->sb,
BTRFS_FIRST_FREE_OBJECTID,
- vol_args2->subvolid, 0, 0);
+ vol_args2->subvolid, 0);
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
goto out_drop_write;
@@ -3416,16 +2528,16 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out_dput;
}
- btrfs_inode_lock(inode, 0);
- err = btrfs_delete_subvolume(dir, dentry);
- btrfs_inode_unlock(inode, 0);
+ btrfs_inode_lock(BTRFS_I(inode), 0);
+ err = btrfs_delete_subvolume(BTRFS_I(dir), dentry);
+ btrfs_inode_unlock(BTRFS_I(inode), 0);
if (!err)
d_delete_notify(dir, dentry);
out_dput:
dput(dentry);
out_unlock_dir:
- btrfs_inode_unlock(dir, 0);
+ btrfs_inode_unlock(BTRFS_I(dir), 0);
free_subvol_name:
kfree(subvol_name_ptr);
free_parent:
@@ -3747,13 +2859,10 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
di_args->bytes_used = btrfs_device_get_bytes_used(dev);
di_args->total_bytes = btrfs_device_get_total_bytes(dev);
memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
- if (dev->name) {
- strncpy(di_args->path, rcu_str_deref(dev->name),
- sizeof(di_args->path) - 1);
- di_args->path[sizeof(di_args->path) - 1] = 0;
- } else {
+ if (dev->name)
+ strscpy(di_args->path, btrfs_dev_name(dev), sizeof(di_args->path));
+ else
di_args->path[0] = '\0';
- }
out:
rcu_read_unlock();
@@ -3774,6 +2883,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
struct btrfs_trans_handle *trans;
struct btrfs_path *path = NULL;
struct btrfs_disk_key disk_key;
+ struct fscrypt_str name = FSTR_INIT("default", 7);
u64 objectid = 0;
u64 dir_id;
int ret;
@@ -3817,7 +2927,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
dir_id = btrfs_super_root_dir(fs_info->super_copy);
di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path,
- dir_id, "default", 7, 1);
+ dir_id, &name, 1);
if (IS_ERR_OR_NULL(di)) {
btrfs_release_path(path);
btrfs_end_transaction(trans);
@@ -4231,6 +3341,8 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
ipath->fspath->val[i] = rel_ptr;
}
+ btrfs_free_path(path);
+ path = NULL;
ret = copy_to_user((void __user *)(unsigned long)ipa->fspath,
ipath->fspath, size);
if (ret) {
@@ -4281,21 +3393,20 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
size = min_t(u32, loi->size, SZ_16M);
}
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
-
inodes = init_data_container(size);
if (IS_ERR(inodes)) {
ret = PTR_ERR(inodes);
- inodes = NULL;
- goto out;
+ goto out_loi;
}
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
inodes, ignore_offset);
+ btrfs_free_path(path);
if (ret == -EINVAL)
ret = -ENOENT;
if (ret < 0)
@@ -4307,7 +3418,6 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
ret = -EFAULT;
out:
- btrfs_free_path(path);
kvfree(inodes);
out_loi:
kfree(loi);
@@ -4338,7 +3448,7 @@ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
spin_unlock(&fs_info->balance_lock);
}
-/**
+/*
* Try to acquire fs_info::balance_mutex as well as set BTRFS_EXLCOP_BALANCE as
* required.
*
@@ -5283,7 +4393,7 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
goto out_acct;
}
- ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+ ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
&iov, &iter);
if (ret < 0)
goto out_acct;
@@ -5382,7 +4492,7 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
if (args.len > args.unencoded_len - args.unencoded_offset)
goto out_acct;
- ret = import_iovec(WRITE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+ ret = import_iovec(ITER_SOURCE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
&iov, &iter);
if (ret < 0)
goto out_acct;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
new file mode 100644
index 000000000000..8a855d5ac2fa
--- /dev/null
+++ b/fs/btrfs/ioctl.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_IOCTL_H
+#define BTRFS_IOCTL_H
+
+long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int btrfs_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa);
+int btrfs_ioctl_get_supported_features(void __user *arg);
+void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
+int __pure btrfs_is_empty_uuid(u8 *uuid);
+void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_balance_args *bargs);
+
+#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 9063072b399b..870528d87526 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -12,6 +12,7 @@
#include "ctree.h"
#include "extent_io.h"
#include "locking.h"
+#include "accessors.h"
/*
* Lockdep class keys for extent_buffer->lock's in this root. For a given
@@ -286,6 +287,31 @@ struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
}
/*
+ * Loop around taking references on and locking the root node of the tree in
+ * nowait mode until we end up with a lock on the root node or returning to
+ * avoid blocking.
+ *
+ * Return: root extent buffer with read lock held or -EAGAIN.
+ */
+struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root)
+{
+ struct extent_buffer *eb;
+
+ while (1) {
+ eb = btrfs_root_node(root);
+ if (!btrfs_try_tree_read_lock(eb)) {
+ free_extent_buffer(eb);
+ return ERR_PTR(-EAGAIN);
+ }
+ if (eb == root->node)
+ break;
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ return eb;
+}
+
+/*
* DREW locks
* ==========
*
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index ab268be09bb5..11c2269b4b6f 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -78,6 +78,82 @@ enum btrfs_lock_nesting {
BTRFS_NESTING_MAX,
};
+enum btrfs_lockdep_trans_states {
+ BTRFS_LOCKDEP_TRANS_COMMIT_START,
+ BTRFS_LOCKDEP_TRANS_UNBLOCKED,
+ BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED,
+ BTRFS_LOCKDEP_TRANS_COMPLETED,
+};
+
+/*
+ * Lockdep annotation for wait events.
+ *
+ * @owner: The struct where the lockdep map is defined
+ * @lock: The lockdep map corresponding to a wait event
+ *
+ * This macro is used to annotate a wait event. In this case a thread acquires
+ * the lockdep map as writer (exclusive lock) because it has to block until all
+ * the threads that hold the lock as readers signal the condition for the wait
+ * event and release their locks.
+ */
+#define btrfs_might_wait_for_event(owner, lock) \
+ do { \
+ rwsem_acquire(&owner->lock##_map, 0, 0, _THIS_IP_); \
+ rwsem_release(&owner->lock##_map, _THIS_IP_); \
+ } while (0)
+
+/*
+ * Protection for the resource/condition of a wait event.
+ *
+ * @owner: The struct where the lockdep map is defined
+ * @lock: The lockdep map corresponding to a wait event
+ *
+ * Many threads can modify the condition for the wait event at the same time
+ * and signal the threads that block on the wait event. The threads that modify
+ * the condition and do the signaling acquire the lock as readers (shared
+ * lock).
+ */
+#define btrfs_lockdep_acquire(owner, lock) \
+ rwsem_acquire_read(&owner->lock##_map, 0, 0, _THIS_IP_)
+
+/*
+ * Used after signaling the condition for a wait event to release the lockdep
+ * map held by a reader thread.
+ */
+#define btrfs_lockdep_release(owner, lock) \
+ rwsem_release(&owner->lock##_map, _THIS_IP_)
+
+/*
+ * Macros for the transaction states wait events, similar to the generic wait
+ * event macros.
+ */
+#define btrfs_might_wait_for_state(owner, i) \
+ do { \
+ rwsem_acquire(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_); \
+ rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_); \
+ } while (0)
+
+#define btrfs_trans_state_lockdep_acquire(owner, i) \
+ rwsem_acquire_read(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_)
+
+#define btrfs_trans_state_lockdep_release(owner, i) \
+ rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_)
+
+/* Initialization of the lockdep map */
+#define btrfs_lockdep_init_map(owner, lock) \
+ do { \
+ static struct lock_class_key lock##_key; \
+ lockdep_init_map(&owner->lock##_map, #lock, &lock##_key, 0); \
+ } while (0)
+
+/* Initialization of the transaction states lockdep maps. */
+#define btrfs_state_lockdep_init_map(owner, lock, state) \
+ do { \
+ static struct lock_class_key lock##_key; \
+ lockdep_init_map(&owner->btrfs_state_change_map[state], #lock, \
+ &lock##_key, 0); \
+ } while (0)
+
static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES,
"too many lock subclasses defined");
@@ -94,6 +170,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
+struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root);
#ifdef CONFIG_BTRFS_DEBUG
static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb)
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 89bc5f825e0a..d5e78cbc8fbc 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -13,8 +13,10 @@
#include <linux/bio.h>
#include <linux/lzo.h>
#include <linux/refcount.h>
+#include "messages.h"
#include "compression.h"
#include "ctree.h"
+#include "super.h"
#define LZO_LEN 4
@@ -425,7 +427,7 @@ out:
return ret;
}
-int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+int lzo_decompress(struct list_head *ws, const u8 *data_in,
struct page *dest_page, unsigned long start_byte, size_t srclen,
size_t destlen)
{
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
new file mode 100644
index 000000000000..625bbbbb2608
--- /dev/null
+++ b/fs/btrfs/messages.c
@@ -0,0 +1,353 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "fs.h"
+#include "messages.h"
+#include "discard.h"
+#include "transaction.h"
+#include "space-info.h"
+#include "super.h"
+
+#ifdef CONFIG_PRINTK
+
+#define STATE_STRING_PREFACE ": state "
+#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT)
+
+/*
+ * Characters to print to indicate error conditions or uncommon filesystem state.
+ * RO is not an error.
+ */
+static const char fs_state_chars[] = {
+ [BTRFS_FS_STATE_ERROR] = 'E',
+ [BTRFS_FS_STATE_REMOUNTING] = 'M',
+ [BTRFS_FS_STATE_RO] = 0,
+ [BTRFS_FS_STATE_TRANS_ABORTED] = 'A',
+ [BTRFS_FS_STATE_DEV_REPLACING] = 'R',
+ [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0,
+ [BTRFS_FS_STATE_NO_CSUMS] = 'C',
+ [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L',
+};
+
+static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf)
+{
+ unsigned int bit;
+ bool states_printed = false;
+ unsigned long fs_state = READ_ONCE(info->fs_state);
+ char *curr = buf;
+
+ memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE));
+ curr += sizeof(STATE_STRING_PREFACE) - 1;
+
+ for_each_set_bit(bit, &fs_state, sizeof(fs_state)) {
+ WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT);
+ if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) {
+ *curr++ = fs_state_chars[bit];
+ states_printed = true;
+ }
+ }
+
+ /* If no states were printed, reset the buffer */
+ if (!states_printed)
+ curr = buf;
+
+ *curr++ = 0;
+}
+#endif
+
+/*
+ * Generally the error codes correspond to their respective errors, but there
+ * are a few special cases.
+ *
+ * EUCLEAN: Any sort of corruption that we encounter. The tree-checker for
+ * instance will return EUCLEAN if any of the blocks are corrupted in
+ * a way that is problematic. We want to reserve EUCLEAN for these
+ * sort of corruptions.
+ *
+ * EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we
+ * need to use EROFS for this case. We will have no idea of the
+ * original failure, that will have been reported at the time we tripped
+ * over the error. Each subsequent error that doesn't have any context
+ * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR.
+ */
+const char * __attribute_const__ btrfs_decode_error(int errno)
+{
+ char *errstr = "unknown";
+
+ switch (errno) {
+ case -ENOENT: /* -2 */
+ errstr = "No such entry";
+ break;
+ case -EIO: /* -5 */
+ errstr = "IO failure";
+ break;
+ case -ENOMEM: /* -12*/
+ errstr = "Out of memory";
+ break;
+ case -EEXIST: /* -17 */
+ errstr = "Object already exists";
+ break;
+ case -ENOSPC: /* -28 */
+ errstr = "No space left";
+ break;
+ case -EROFS: /* -30 */
+ errstr = "Readonly filesystem";
+ break;
+ case -EOPNOTSUPP: /* -95 */
+ errstr = "Operation not supported";
+ break;
+ case -EUCLEAN: /* -117 */
+ errstr = "Filesystem corrupted";
+ break;
+ case -EDQUOT: /* -122 */
+ errstr = "Quota exceeded";
+ break;
+ }
+
+ return errstr;
+}
+
+/*
+ * __btrfs_handle_fs_error decodes expected errors from the caller and
+ * invokes the appropriate error response.
+ */
+__cold
+void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...)
+{
+ struct super_block *sb = fs_info->sb;
+#ifdef CONFIG_PRINTK
+ char statestr[STATE_STRING_BUF_LEN];
+ const char *errstr;
+#endif
+
+#ifdef CONFIG_PRINTK_INDEX
+ printk_index_subsys_emit(
+ "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", KERN_CRIT, fmt);
+#endif
+
+ /*
+ * Special case: if the error is EROFS, and we're already under
+ * SB_RDONLY, then it is safe here.
+ */
+ if (errno == -EROFS && sb_rdonly(sb))
+ return;
+
+#ifdef CONFIG_PRINTK
+ errstr = btrfs_decode_error(errno);
+ btrfs_state_to_string(fs_info, statestr);
+ if (fmt) {
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n",
+ sb->s_id, statestr, function, line, errno, errstr, &vaf);
+ va_end(args);
+ } else {
+ pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n",
+ sb->s_id, statestr, function, line, errno, errstr);
+ }
+#endif
+
+ /*
+ * Today we only save the error info to memory. Long term we'll also
+ * send it down to the disk.
+ */
+ set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
+
+ /* Don't go through full error handling during mount. */
+ if (!(sb->s_flags & SB_BORN))
+ return;
+
+ if (sb_rdonly(sb))
+ return;
+
+ btrfs_discard_stop(fs_info);
+
+ /* Handle error by forcing the filesystem readonly. */
+ btrfs_set_sb_rdonly(sb);
+ btrfs_info(fs_info, "forced readonly");
+ /*
+ * Note that a running device replace operation is not canceled here
+ * although there is no way to update the progress. It would add the
+ * risk of a deadlock, therefore the canceling is omitted. The only
+ * penalty is that some I/O remains active until the procedure
+ * completes. The next time when the filesystem is mounted writable
+ * again, the device replace operation continues.
+ */
+}
+
+#ifdef CONFIG_PRINTK
+static const char * const logtypes[] = {
+ "emergency",
+ "alert",
+ "critical",
+ "error",
+ "warning",
+ "notice",
+ "info",
+ "debug",
+};
+
+/*
+ * Use one ratelimit state per log level so that a flood of less important
+ * messages doesn't cause more important ones to be dropped.
+ */
+static struct ratelimit_state printk_limits[] = {
+ RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100),
+};
+
+void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
+{
+ char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
+ struct va_format vaf;
+ va_list args;
+ int kern_level;
+ const char *type = logtypes[4];
+ struct ratelimit_state *ratelimit = &printk_limits[4];
+
+#ifdef CONFIG_PRINTK_INDEX
+ printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt);
+#endif
+
+ va_start(args, fmt);
+
+ while ((kern_level = printk_get_level(fmt)) != 0) {
+ size_t size = printk_skip_level(fmt) - fmt;
+
+ if (kern_level >= '0' && kern_level <= '7') {
+ memcpy(lvl, fmt, size);
+ lvl[size] = '\0';
+ type = logtypes[kern_level - '0'];
+ ratelimit = &printk_limits[kern_level - '0'];
+ }
+ fmt += size;
+ }
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ if (__ratelimit(ratelimit)) {
+ if (fs_info) {
+ char statestr[STATE_STRING_BUF_LEN];
+
+ btrfs_state_to_string(fs_info, statestr);
+ _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type,
+ fs_info->sb->s_id, statestr, &vaf);
+ } else {
+ _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf);
+ }
+ }
+
+ va_end(args);
+}
+#endif
+
+#ifdef CONFIG_BTRFS_ASSERT
+void __cold btrfs_assertfail(const char *expr, const char *file, int line)
+{
+ pr_err("assertion failed: %s, in %s:%d\n", expr, file, line);
+ BUG();
+}
+#endif
+
+void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info)
+{
+ btrfs_err(fs_info,
+"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel");
+}
+
+#if BITS_PER_LONG == 32
+void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info)
+{
+ if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) {
+ btrfs_warn(fs_info, "reaching 32bit limit for logical addresses");
+ btrfs_warn(fs_info,
+"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT",
+ BTRFS_32BIT_MAX_FILE_SIZE >> 40);
+ btrfs_warn(fs_info,
+ "please consider upgrading to 64bit kernel/hardware");
+ }
+}
+
+void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
+{
+ if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) {
+ btrfs_err(fs_info, "reached 32bit limit for logical addresses");
+ btrfs_err(fs_info,
+"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed",
+ BTRFS_32BIT_MAX_FILE_SIZE >> 40);
+ btrfs_err(fs_info,
+ "please consider upgrading to 64bit kernel/hardware");
+ }
+}
+#endif
+
+/*
+ * We only mark the transaction aborted and then set the file system read-only.
+ * This will prevent new transactions from starting or trying to join this
+ * one.
+ *
+ * This means that error recovery at the call site is limited to freeing
+ * any local memory allocations and passing the error code up without
+ * further cleanup. The transaction should complete as it normally would
+ * in the call path but will return -EIO.
+ *
+ * We'll complete the cleanup in btrfs_end_transaction and
+ * btrfs_commit_transaction.
+ */
+__cold
+void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
+ const char *function,
+ unsigned int line, int errno, bool first_hit)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+
+ WRITE_ONCE(trans->aborted, errno);
+ WRITE_ONCE(trans->transaction->aborted, errno);
+ if (first_hit && errno == -ENOSPC)
+ btrfs_dump_space_info_for_trans_abort(fs_info);
+ /* Wake up anybody who may be waiting on this transaction */
+ wake_up(&fs_info->transaction_wait);
+ wake_up(&fs_info->transaction_blocked_wait);
+ __btrfs_handle_fs_error(fs_info, function, line, errno, NULL);
+}
+
+/*
+ * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an
+ * alert, and either panics or BUGs, depending on mount options.
+ */
+__cold
+void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...)
+{
+ char *s_id = "<unknown>";
+ const char *errstr;
+ struct va_format vaf = { .fmt = fmt };
+ va_list args;
+
+ if (fs_info)
+ s_id = fs_info->sb->s_id;
+
+ va_start(args, fmt);
+ vaf.va = &args;
+
+ errstr = btrfs_decode_error(errno);
+ if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR)))
+ panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
+ s_id, function, line, &vaf, errno, errstr);
+
+ btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)",
+ function, line, &vaf, errno, errstr);
+ va_end(args);
+ /* Caller calls BUG() */
+}
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
new file mode 100644
index 000000000000..190af1f698d9
--- /dev/null
+++ b/fs/btrfs/messages.h
@@ -0,0 +1,245 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_MESSAGES_H
+#define BTRFS_MESSAGES_H
+
+#include <linux/types.h>
+
+struct btrfs_fs_info;
+struct btrfs_trans_handle;
+
+static inline __printf(2, 3) __cold
+void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
+{
+}
+
+#ifdef CONFIG_PRINTK
+
+#define btrfs_printk(fs_info, fmt, args...) \
+ _btrfs_printk(fs_info, fmt, ##args)
+
+__printf(2, 3)
+__cold
+void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
+
+#else
+
+#define btrfs_printk(fs_info, fmt, args...) \
+ btrfs_no_printk(fs_info, fmt, ##args)
+#endif
+
+#define btrfs_emerg(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_INFO fmt, ##args)
+
+/*
+ * Wrappers that use printk_in_rcu
+ */
+#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args)
+
+/*
+ * Wrappers that use a ratelimited printk_in_rcu
+ */
+#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
+
+/*
+ * Wrappers that use a ratelimited printk
+ */
+#define btrfs_emerg_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args)
+
+#if defined(CONFIG_DYNAMIC_DEBUG)
+#define btrfs_debug(fs_info, fmt, args...) \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk, \
+ fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \
+ fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \
+ fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \
+ fs_info, KERN_DEBUG fmt, ##args)
+#elif defined(DEBUG)
+#define btrfs_debug(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args)
+#else
+#define btrfs_debug(fs_info, fmt, args...) \
+ btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+ btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+ btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
+#endif
+
+#define btrfs_printk_in_rcu(fs_info, fmt, args...) \
+do { \
+ rcu_read_lock(); \
+ btrfs_printk(fs_info, fmt, ##args); \
+ rcu_read_unlock(); \
+} while (0)
+
+#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \
+do { \
+ rcu_read_lock(); \
+ btrfs_no_printk(fs_info, fmt, ##args); \
+ rcu_read_unlock(); \
+} while (0)
+
+#define btrfs_printk_ratelimited(fs_info, fmt, args...) \
+do { \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ if (__ratelimit(&_rs)) \
+ btrfs_printk(fs_info, fmt, ##args); \
+} while (0)
+
+#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \
+do { \
+ rcu_read_lock(); \
+ btrfs_printk_ratelimited(fs_info, fmt, ##args); \
+ rcu_read_unlock(); \
+} while (0)
+
+#ifdef CONFIG_BTRFS_ASSERT
+void __cold btrfs_assertfail(const char *expr, const char *file, int line);
+
+#define ASSERT(expr) \
+ (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__))
+#else
+#define ASSERT(expr) (void)(expr)
+#endif
+
+void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info);
+
+__printf(5, 6)
+__cold
+void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...);
+
+const char * __attribute_const__ btrfs_decode_error(int errno);
+
+__cold
+void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
+ const char *function,
+ unsigned int line, int errno, bool first_hit);
+
+bool __cold abort_should_print_stack(int errno);
+
+/*
+ * Call btrfs_abort_transaction as early as possible when an error condition is
+ * detected, that way the exact stack trace is reported for some errors.
+ */
+#define btrfs_abort_transaction(trans, errno) \
+do { \
+ bool first = false; \
+ /* Report first abort since mount */ \
+ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
+ &((trans)->fs_info->fs_state))) { \
+ first = true; \
+ if (WARN(abort_should_print_stack(errno), \
+ KERN_ERR \
+ "BTRFS: Transaction aborted (error %d)\n", \
+ (errno))) { \
+ /* Stack trace printed. */ \
+ } else { \
+ btrfs_err((trans)->fs_info, \
+ "Transaction aborted (error %d)", \
+ (errno)); \
+ } \
+ } \
+ __btrfs_abort_transaction((trans), __func__, \
+ __LINE__, (errno), first); \
+} while (0)
+
+#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
+ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
+ (errno), fmt, ##args)
+
+__printf(5, 6)
+__cold
+void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...);
+/*
+ * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
+ * will panic(). Otherwise we BUG() here.
+ */
+#define btrfs_panic(fs_info, errno, fmt, args...) \
+do { \
+ __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
+ BUG(); \
+} while (0)
+
+#if BITS_PER_LONG == 32
+#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT)
+/*
+ * The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical
+ * addresses of extents.
+ *
+ * For 4K page size it's about 10T, for 64K it's 160T.
+ */
+#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8)
+void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info);
+void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info);
+#endif
+
+#endif
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
index 340f995652f2..768583a440e1 100644
--- a/fs/btrfs/misc.h
+++ b/fs/btrfs/misc.h
@@ -10,6 +10,14 @@
#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
+/*
+ * Enumerate bits using enum autoincrement. Define the @name as the n-th bit.
+ */
+#define ENUM_BIT(name) \
+ __ ## name ## _BIT, \
+ name = (1U << __ ## name ## _BIT), \
+ __ ## name ## _SEQ = __ ## name ## _BIT
+
static inline void cond_wake_up(struct wait_queue_head *wq)
{
/*
@@ -32,22 +40,10 @@ static inline void cond_wake_up_nomb(struct wait_queue_head *wq)
wake_up(wq);
}
-static inline u64 div_factor(u64 num, int factor)
-{
- if (factor == 10)
- return num;
- num *= factor;
- return div_u64(num, 10);
-}
-
-static inline u64 div_factor_fine(u64 num, int factor)
+static inline u64 mult_perc(u64 num, u32 percent)
{
- if (factor == 100)
- return num;
- num *= factor;
- return div_u64(num, 100);
+ return div_u64(num * percent, 100);
}
-
/* Copy of is_power_of_two that is 64bit safe */
static inline bool is_power_of_two_u64(u64 n)
{
@@ -88,6 +84,41 @@ static inline struct rb_node *rb_simple_search(struct rb_root *root, u64 bytenr)
return NULL;
}
+/*
+ * Search @root from an entry that starts or comes after @bytenr.
+ *
+ * @root: the root to search.
+ * @bytenr: bytenr to search from.
+ *
+ * Return the rb_node that start at or after @bytenr. If there is no entry at
+ * or after @bytner return NULL.
+ */
+static inline struct rb_node *rb_simple_search_first(struct rb_root *root,
+ u64 bytenr)
+{
+ struct rb_node *node = root->rb_node, *ret = NULL;
+ struct rb_simple_node *entry, *ret_entry = NULL;
+
+ while (node) {
+ entry = rb_entry(node, struct rb_simple_node, rb_node);
+
+ if (bytenr < entry->bytenr) {
+ if (!ret || entry->bytenr < ret_entry->bytenr) {
+ ret = node;
+ ret_entry = entry;
+ }
+
+ node = node->rb_left;
+ } else if (bytenr > entry->bytenr) {
+ node = node->rb_right;
+ } else {
+ return node;
+ }
+ }
+
+ return ret;
+}
+
static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr,
struct rb_node *node)
{
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 1952ac85222c..57d8c72737e1 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -7,6 +7,7 @@
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/sched/mm.h>
+#include "messages.h"
#include "misc.h"
#include "ctree.h"
#include "transaction.h"
@@ -17,6 +18,8 @@
#include "delalloc-space.h"
#include "qgroup.h"
#include "subpage.h"
+#include "file.h"
+#include "super.h"
static struct kmem_cache *btrfs_ordered_extent_cache;
@@ -143,7 +146,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
return ret;
}
-/**
+/*
* Add an ordered extent to the per-inode tree.
*
* @inode: Inode that this extent is for.
@@ -501,7 +504,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
ASSERT(list_empty(&entry->log_list));
ASSERT(RB_EMPTY_NODE(&entry->rb_node));
if (entry->inode)
- btrfs_add_delayed_iput(entry->inode);
+ btrfs_add_delayed_iput(BTRFS_I(entry->inode));
while (!list_empty(&entry->list)) {
cur = entry->list.next;
sum = list_entry(cur, struct btrfs_ordered_sum, list);
@@ -524,7 +527,15 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *node;
bool pending;
+ bool freespace_inode;
+ /*
+ * If this is a free space inode the thread has not acquired the ordered
+ * extents lockdep map.
+ */
+ freespace_inode = btrfs_is_free_space_inode(btrfs_inode);
+
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered);
/* This is paired with btrfs_add_ordered_extent. */
spin_lock(&btrfs_inode->lock);
btrfs_mod_outstanding_extents(btrfs_inode, -1);
@@ -580,6 +591,8 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
}
}
+ btrfs_lockdep_release(fs_info, btrfs_trans_pending_ordered);
+
spin_lock(&root->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
root->nr_ordered_extents--;
@@ -594,6 +607,8 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
}
spin_unlock(&root->ordered_extent_lock);
wake_up(&entry->wait);
+ if (!freespace_inode)
+ btrfs_lockdep_release(fs_info, btrfs_ordered_extent);
}
static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
@@ -712,10 +727,17 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
u64 start = entry->file_offset;
u64 end = start + entry->num_bytes - 1;
struct btrfs_inode *inode = BTRFS_I(entry->inode);
+ bool freespace_inode;
trace_btrfs_ordered_extent_start(inode, entry);
/*
+ * If this is a free space inode do not take the ordered extents lockdep
+ * map.
+ */
+ freespace_inode = btrfs_is_free_space_inode(inode);
+
+ /*
* pages in the range can be dirty, clean or writeback. We
* start IO on any dirty ones so the wait doesn't stall waiting
* for the flusher thread to find them
@@ -723,6 +745,8 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
if (wait) {
+ if (!freespace_inode)
+ btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
&entry->flags));
}
@@ -740,11 +764,11 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
struct btrfs_ordered_extent *ordered;
if (start + len < start) {
- orig_end = INT_LIMIT(loff_t);
+ orig_end = OFFSET_MAX;
} else {
orig_end = start + len - 1;
- if (orig_end > INT_LIMIT(loff_t))
- orig_end = INT_LIMIT(loff_t);
+ if (orig_end > OFFSET_MAX)
+ orig_end = OFFSET_MAX;
}
/* start IO across the range first to instantiate any delalloc
@@ -998,17 +1022,18 @@ out:
}
/*
- * btrfs_flush_ordered_range - Lock the passed range and ensures all pending
- * ordered extents in it are run to completion.
+ * Lock the passed range and ensures all pending ordered extents in it are run
+ * to completion.
*
* @inode: Inode whose ordered tree is to be searched
* @start: Beginning of range to flush
* @end: Last byte of range to lock
* @cached_state: If passed, will return the extent state responsible for the
- * locked range. It's the caller's responsibility to free the cached state.
+ * locked range. It's the caller's responsibility to free the
+ * cached state.
*
- * This function always returns with the given range locked, ensuring after it's
- * called no order extent can be pending.
+ * Always return with the given range locked, ensuring after it's called no
+ * order extent can be pending.
*/
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
u64 end,
@@ -1022,7 +1047,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
cachedp = cached_state;
while (1) {
- lock_extent_bits(&inode->io_tree, start, end, cachedp);
+ lock_extent(&inode->io_tree, start, end, cachedp);
ordered = btrfs_lookup_ordered_range(inode, start,
end - start + 1);
if (!ordered) {
@@ -1035,12 +1060,38 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
refcount_dec(&cache->refs);
break;
}
- unlock_extent_cached(&inode->io_tree, start, end, cachedp);
+ unlock_extent(&inode->io_tree, start, end, cachedp);
btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
}
}
+/*
+ * Lock the passed range and ensure all pending ordered extents in it are run
+ * to completion in nowait mode.
+ *
+ * Return true if btrfs_lock_ordered_range does not return any extents,
+ * otherwise false.
+ */
+bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
+ struct extent_state **cached_state)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ if (!try_lock_extent(&inode->io_tree, start, end, cached_state))
+ return false;
+
+ ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1);
+ if (!ordered)
+ return true;
+
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent(&inode->io_tree, start, end, cached_state);
+
+ return false;
+}
+
+
static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos,
u64 len)
{
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 87792f85e2c4..89f82b78f590 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -160,18 +160,6 @@ struct btrfs_ordered_extent {
struct block_device *bdev;
};
-/*
- * calculates the total size you need to allocate for an ordered sum
- * structure spanning 'bytes' in the file
- */
-static inline int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info,
- unsigned long bytes)
-{
- int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize);
-
- return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size;
-}
-
static inline void
btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
{
@@ -218,6 +206,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
u64 end,
struct extent_state **cached_state);
+bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
+ struct extent_state **cached_state);
int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
u64 post);
int __init ordered_data_init(void);
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index aa534108c1e2..7a1b021b5669 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -5,6 +5,7 @@
#include "ctree.h"
#include "disk-io.h"
+#include "orphan.h"
int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset)
diff --git a/fs/btrfs/orphan.h b/fs/btrfs/orphan.h
new file mode 100644
index 000000000000..3faab5cbb59a
--- /dev/null
+++ b/fs/btrfs/orphan.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_ORPHAN_H
+#define BTRFS_ORPHAN_H
+
+int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 offset);
+int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 offset);
+
+#endif
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index dd8777872143..b93c96213304 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -3,9 +3,12 @@
* Copyright (C) 2007 Oracle. All rights reserved.
*/
+#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
+#include "accessors.h"
+#include "tree-checker.h"
struct root_name_map {
u64 id;
@@ -240,9 +243,9 @@ void btrfs_print_leaf(struct extent_buffer *l)
case BTRFS_DIR_ITEM_KEY:
di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
btrfs_dir_item_key_to_cpu(l, di, &found_key);
- pr_info("\t\tdir oid %llu type %u\n",
+ pr_info("\t\tdir oid %llu flags %u\n",
found_key.objectid,
- btrfs_dir_type(l, di));
+ btrfs_dir_flags(l, di));
break;
case BTRFS_ROOT_ITEM_KEY:
ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
@@ -384,14 +387,16 @@ void btrfs_print_tree(struct extent_buffer *c, bool follow)
if (!follow)
return;
for (i = 0; i < nr; i++) {
- struct btrfs_key first_key;
+ struct btrfs_tree_parent_check check = {
+ .level = level - 1,
+ .transid = btrfs_node_ptr_generation(c, i),
+ .owner_root = btrfs_header_owner(c),
+ .has_first_key = true
+ };
struct extent_buffer *next;
- btrfs_node_key_to_cpu(c, &first_key, i);
- next = read_tree_block(fs_info, btrfs_node_blockptr(c, i),
- btrfs_header_owner(c),
- btrfs_node_ptr_generation(c, i),
- level - 1, &first_key);
+ btrfs_node_key_to_cpu(c, &check.first_key, i);
+ next = read_tree_block(fs_info, btrfs_node_blockptr(c, i), &check);
if (IS_ERR(next))
continue;
if (!extent_buffer_uptodate(next)) {
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index a2ec8ecae8de..0755af0e53e3 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -4,12 +4,17 @@
*/
#include <linux/hashtable.h>
+#include "messages.h"
#include "props.h"
#include "btrfs_inode.h"
#include "transaction.h"
#include "ctree.h"
#include "xattr.h"
#include "compression.h"
+#include "space-info.h"
+#include "fs.h"
+#include "accessors.h"
+#include "super.h"
#define BTRFS_PROP_HANDLERS_HT_BITS 8
static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
@@ -270,11 +275,8 @@ int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 ino = btrfs_ino(BTRFS_I(inode));
- int ret;
-
- ret = iterate_object_props(root, path, ino, inode_prop_iterator, inode);
- return ret;
+ return iterate_object_props(root, path, ino, inode_prop_iterator, inode);
}
static int prop_compression_validate(const struct btrfs_inode *inode,
@@ -456,7 +458,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
return 0;
}
-void __init btrfs_props_init(void)
+int __init btrfs_props_init(void)
{
int i;
@@ -466,5 +468,6 @@ void __init btrfs_props_init(void)
hash_add(prop_handlers_ht, &p->node, h);
}
+ return 0;
}
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
index ca9dd3df129b..6e283196e38a 100644
--- a/fs/btrfs/props.h
+++ b/fs/btrfs/props.h
@@ -8,7 +8,7 @@
#include "ctree.h"
-void __init btrfs_props_init(void);
+int __init btrfs_props_init(void);
int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
const char *name, const char *value, size_t value_len,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index db723c0026bd..5c636e00d77d 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -24,6 +24,11 @@
#include "block-group.h"
#include "sysfs.h"
#include "tree-mod-log.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "root-tree.h"
+#include "tree-checker.h"
/*
* Helpers to access qgroup reservation
@@ -275,7 +280,7 @@ static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *p
}
/*
- * Add relation specified by two qgoup ids.
+ * Add relation specified by two qgroup ids.
*
* Must be called with qgroup_lock held.
*
@@ -333,6 +338,13 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
}
#endif
+static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info)
+{
+ fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT |
+ BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
+ BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
+}
+
/*
* The full config is read in one go, only called from open_ctree()
* It doesn't use any locking, as at this point we're still single-threaded
@@ -401,7 +413,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
}
if (btrfs_qgroup_status_generation(l, ptr) !=
fs_info->generation) {
- flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
btrfs_err(fs_info,
"qgroup generation mismatch, marked as inconsistent");
}
@@ -419,7 +431,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
(!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
btrfs_err(fs_info, "inconsistent qgroup config");
- flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
}
if (!qgroup) {
qgroup = add_qgroup_rb(fs_info, found_key.offset);
@@ -878,7 +890,8 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
l = path->nodes[0];
slot = path->slots[0];
ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
- btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
+ btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags &
+ BTRFS_QGROUP_STATUS_FLAGS_MASK);
btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
btrfs_set_qgroup_status_rescan(l, ptr,
fs_info->qgroup_rescan_progress.objectid);
@@ -1052,7 +1065,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
+ btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags &
+ BTRFS_QGROUP_STATUS_FLAGS_MASK);
btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
btrfs_mark_buffer_dirty(leaf);
@@ -1174,6 +1188,21 @@ out_add_root:
fs_info->qgroup_rescan_running = true;
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
+ } else {
+ /*
+ * We have set both BTRFS_FS_QUOTA_ENABLED and
+ * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with
+ * -EINPROGRESS. That can happen because someone started the
+ * rescan worker by calling quota rescan ioctl before we
+ * attempted to initialize the rescan worker. Failure due to
+ * quotas disabled in the meanwhile is not possible, because
+ * we are holding a write lock on fs_info->subvol_sem, which
+ * is also acquired when disabling quotas.
+ * Ignore such error, and any other error would need to undo
+ * everything we did in the transaction we just committed.
+ */
+ ASSERT(ret == -EINPROGRESS);
+ ret = 0;
}
out_free_path:
@@ -1255,6 +1284,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
quota_root = fs_info->quota_root;
fs_info->quota_root = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
+ fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
spin_unlock(&fs_info->qgroup_lock);
btrfs_free_qgroup_config(fs_info);
@@ -1717,7 +1747,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
ret = update_qgroup_limit_item(trans, qgroup);
if (ret) {
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
btrfs_info(fs_info, "unable to update quota limit for %llu",
qgroupid);
}
@@ -1765,8 +1795,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord)
{
- struct ulist *old_root;
- u64 bytenr = qrecord->bytenr;
+ struct btrfs_backref_walk_ctx ctx = { 0 };
int ret;
/*
@@ -1790,10 +1819,15 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
*/
ASSERT(trans != NULL);
- ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
- true);
+ if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
+ return 0;
+
+ ctx.bytenr = qrecord->bytenr;
+ ctx.fs_info = trans->fs_info;
+
+ ret = btrfs_find_all_roots(&ctx, true);
if (ret < 0) {
- trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(trans->fs_info);
btrfs_warn(trans->fs_info,
"error accounting new delayed refs extent (err code: %d), quota inconsistent",
ret);
@@ -1807,12 +1841,12 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
*
* So modifying qrecord->old_roots is safe here
*/
- qrecord->old_roots = old_root;
+ qrecord->old_roots = ctx.roots;
return 0;
}
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
- u64 num_bytes, gfp_t gfp_flag)
+ u64 num_bytes)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_qgroup_extent_record *record;
@@ -1822,7 +1856,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
|| bytenr == 0 || num_bytes == 0)
return 0;
- record = kzalloc(sizeof(*record), gfp_flag);
+ record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record)
return -ENOMEM;
@@ -1874,8 +1908,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
- ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes,
- GFP_NOFS);
+ ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes);
if (ret)
return ret;
}
@@ -2074,12 +2107,11 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
* blocks for qgroup accounting.
*/
ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
- nodesize, GFP_NOFS);
+ nodesize);
if (ret < 0)
goto out;
- ret = btrfs_qgroup_trace_extent(trans,
- dst_path->nodes[dst_level]->start,
- nodesize, GFP_NOFS);
+ ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start,
+ nodesize);
if (ret < 0)
goto out;
@@ -2269,7 +2301,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
out:
btrfs_free_path(dst_path);
if (ret < 0)
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
return ret;
}
@@ -2280,6 +2312,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret = 0;
int level;
+ u8 drop_subptree_thres;
struct extent_buffer *eb = root_eb;
struct btrfs_path *path = NULL;
@@ -2289,8 +2322,31 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
return 0;
+ spin_lock(&fs_info->qgroup_lock);
+ drop_subptree_thres = fs_info->qgroup_drop_subtree_thres;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ /*
+ * This function only gets called for snapshot drop, if we hit a high
+ * node here, it means we are going to change ownership for quite a lot
+ * of extents, which will greatly slow down btrfs_commit_transaction().
+ *
+ * So here if we find a high tree here, we just skip the accounting and
+ * mark qgroup inconsistent.
+ */
+ if (root_level >= drop_subptree_thres) {
+ qgroup_mark_inconsistent(fs_info);
+ return 0;
+ }
+
if (!extent_buffer_uptodate(root_eb)) {
- ret = btrfs_read_extent_buffer(root_eb, root_gen, root_level, NULL);
+ struct btrfs_tree_parent_check check = {
+ .has_first_key = false,
+ .transid = root_gen,
+ .level = root_level
+ };
+
+ ret = btrfs_read_extent_buffer(root_eb, &check);
if (ret)
goto out;
}
@@ -2345,8 +2401,7 @@ walk_down:
path->locks[level] = BTRFS_READ_LOCK;
ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
- fs_info->nodesize,
- GFP_NOFS);
+ fs_info->nodesize);
if (ret)
goto out;
}
@@ -2604,7 +2659,8 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
* If quotas get disabled meanwhile, the resources need to be freed and
* we can't just exit here.
*/
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
goto out_free;
if (new_roots) {
@@ -2700,18 +2756,24 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
num_dirty_extents++;
trace_btrfs_qgroup_account_extents(fs_info, record);
- if (!ret) {
+ if (!ret && !(fs_info->qgroup_flags &
+ BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) {
+ struct btrfs_backref_walk_ctx ctx = { 0 };
+
+ ctx.bytenr = record->bytenr;
+ ctx.fs_info = fs_info;
+
/*
* Old roots should be searched when inserting qgroup
* extent record
*/
if (WARN_ON(!record->old_roots)) {
/* Search commit root to find old_roots */
- ret = btrfs_find_all_roots(NULL, fs_info,
- record->bytenr, 0,
- &record->old_roots, false);
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret < 0)
goto cleanup;
+ record->old_roots = ctx.roots;
+ ctx.roots = NULL;
}
/* Free the reserved data space */
@@ -2724,10 +2786,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
* which doesn't lock tree or delayed_refs and search
* current root. It's safe inside commit_transaction().
*/
- ret = btrfs_find_all_roots(trans, fs_info,
- record->bytenr, BTRFS_SEQ_LAST, &new_roots, false);
+ ctx.trans = trans;
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret < 0)
goto cleanup;
+ new_roots = ctx.roots;
if (qgroup_to_skip) {
ulist_del(new_roots, qgroup_to_skip, 0);
ulist_del(record->old_roots, qgroup_to_skip,
@@ -2773,12 +2836,10 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
spin_unlock(&fs_info->qgroup_lock);
ret = update_qgroup_info_item(trans, qgroup);
if (ret)
- fs_info->qgroup_flags |=
- BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
ret = update_qgroup_limit_item(trans, qgroup);
if (ret)
- fs_info->qgroup_flags |=
- BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
spin_lock(&fs_info->qgroup_lock);
}
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
@@ -2789,7 +2850,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
ret = update_qgroup_status_item(trans);
if (ret)
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
return ret;
}
@@ -2905,14 +2966,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
dstgroup->rsv_excl = inherit->lim.rsv_excl;
- ret = update_qgroup_limit_item(trans, dstgroup);
- if (ret) {
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- btrfs_info(fs_info,
- "unable to update quota limit for %llu",
- dstgroup->qgroupid);
- goto unlock;
- }
+ qgroup_dirty(fs_info, dstgroup);
}
if (srcid) {
@@ -3013,7 +3067,7 @@ out:
if (!committing)
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (need_rescan)
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
return ret;
}
@@ -3202,7 +3256,6 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root;
struct btrfs_key found;
struct extent_buffer *scratch_leaf = NULL;
- struct ulist *roots = NULL;
u64 num_bytes;
bool done;
int slot;
@@ -3252,6 +3305,8 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
mutex_unlock(&fs_info->qgroup_rescan_lock);
for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
+ struct btrfs_backref_walk_ctx ctx = { 0 };
+
btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
if (found.type != BTRFS_EXTENT_ITEM_KEY &&
found.type != BTRFS_METADATA_ITEM_KEY)
@@ -3261,13 +3316,15 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
else
num_bytes = found.offset;
- ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
- &roots, false);
+ ctx.bytenr = found.objectid;
+ ctx.fs_info = fs_info;
+
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret < 0)
goto out;
/* For rescan, just pass old_roots as NULL */
ret = btrfs_qgroup_account_extent(trans, found.objectid,
- num_bytes, NULL, roots);
+ num_bytes, NULL, ctx.roots);
if (ret < 0)
goto out;
}
@@ -3286,7 +3343,8 @@ static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
{
return btrfs_fs_closing(fs_info) ||
test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
- !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
}
static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
@@ -3351,7 +3409,8 @@ out:
}
mutex_lock(&fs_info->qgroup_rescan_lock);
- if (!stopped)
+ if (!stopped ||
+ fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN)
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
if (trans) {
ret = update_qgroup_status_item(trans);
@@ -3362,6 +3421,7 @@ out:
}
}
fs_info->qgroup_rescan_running = false;
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
complete_all(&fs_info->qgroup_rescan_completion);
mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -3372,6 +3432,8 @@ out:
if (stopped) {
btrfs_info(fs_info, "qgroup scan paused");
+ } else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) {
+ btrfs_info(fs_info, "qgroup scan cancelled");
} else if (err >= 0) {
btrfs_info(fs_info, "qgroup scan completed%s",
err > 0 ? " (inconsistency flag cleared)" : "");
@@ -3434,6 +3496,8 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
memset(&fs_info->qgroup_rescan_progress, 0,
sizeof(fs_info->qgroup_rescan_progress));
+ fs_info->qgroup_flags &= ~(BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
+ BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
fs_info->qgroup_rescan_progress.objectid = progress_objectid;
init_completion(&fs_info->qgroup_rescan_completion);
mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -4231,8 +4295,7 @@ out_unlock:
spin_unlock(&blocks->lock);
out:
if (ret < 0)
- fs_info->qgroup_flags |=
- BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
return ret;
}
@@ -4247,6 +4310,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct extent_buffer *subvol_eb)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_tree_parent_check check = { 0 };
struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
struct btrfs_qgroup_swapped_block *block;
struct extent_buffer *reloc_eb = NULL;
@@ -4295,10 +4359,13 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
blocks->swapped = swapped;
spin_unlock(&blocks->lock);
+ check.level = block->level;
+ check.transid = block->reloc_generation;
+ check.has_first_key = true;
+ memcpy(&check.first_key, &block->first_key, sizeof(check.first_key));
+
/* Read out reloc subtree root */
- reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, 0,
- block->reloc_generation, block->level,
- &block->first_key);
+ reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, &check);
if (IS_ERR(reloc_eb)) {
ret = PTR_ERR(reloc_eb);
reloc_eb = NULL;
@@ -4319,7 +4386,7 @@ out:
btrfs_err_rl(fs_info,
"failed to account subtree at bytenr %llu: %d",
subvol_eb->start, ret);
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
}
return ret;
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 0c4dd2a9af96..7bffa10589d6 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -11,6 +11,7 @@
#include <linux/kobject.h>
#include "ulist.h"
#include "delayed-ref.h"
+#include "misc.h"
/*
* Btrfs qgroup overview
@@ -100,6 +101,9 @@
* subtree rescan for them.
*/
+#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1UL << 3)
+#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1UL << 4)
+
/*
* Record a dirty extent, and info qgroup to update quota on it
* TODO: Use kmem cache to alloc it.
@@ -239,9 +243,11 @@ static inline u64 btrfs_qgroup_subvolid(u64 qgroupid)
/*
* For qgroup event trace points only
*/
-#define QGROUP_RESERVE (1<<0)
-#define QGROUP_RELEASE (1<<1)
-#define QGROUP_FREE (1<<2)
+enum {
+ ENUM_BIT(QGROUP_RESERVE),
+ ENUM_BIT(QGROUP_RELEASE),
+ ENUM_BIT(QGROUP_FREE),
+};
int btrfs_quota_enable(struct btrfs_fs_info *fs_info);
int btrfs_quota_disable(struct btrfs_fs_info *fs_info);
@@ -315,7 +321,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
* (NULL trans)
*/
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
- u64 num_bytes, gfp_t gfp_flag);
+ u64 num_bytes);
/*
* Inform qgroup to trace all leaf items of data
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 2feb5c20641a..2d90a6b5eb00 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -13,12 +13,15 @@
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
+#include "messages.h"
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
+#include "file-item.h"
+#include "btrfs_inode.h"
/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT 1
@@ -63,19 +66,45 @@ struct sector_ptr {
unsigned int uptodate:8;
};
-static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
-static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
-static void rmw_work(struct work_struct *work);
-static void read_rebuild_work(struct work_struct *work);
-static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
-static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
-static void __free_raid_bio(struct btrfs_raid_bio *rbio);
+static void rmw_rbio_work(struct work_struct *work);
+static void rmw_rbio_work_locked(struct work_struct *work);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
-static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
- int need_check);
-static void scrub_parity_work(struct work_struct *work);
+static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check);
+static void scrub_rbio_work_locked(struct work_struct *work);
+
+static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
+{
+ bitmap_free(rbio->error_bitmap);
+ kfree(rbio->stripe_pages);
+ kfree(rbio->bio_sectors);
+ kfree(rbio->stripe_sectors);
+ kfree(rbio->finish_pointers);
+}
+
+static void free_raid_bio(struct btrfs_raid_bio *rbio)
+{
+ int i;
+
+ if (!refcount_dec_and_test(&rbio->refs))
+ return;
+
+ WARN_ON(!list_empty(&rbio->stripe_cache));
+ WARN_ON(!list_empty(&rbio->hash_list));
+ WARN_ON(!bio_list_empty(&rbio->bio_list));
+
+ for (i = 0; i < rbio->nr_pages; i++) {
+ if (rbio->stripe_pages[i]) {
+ __free_page(rbio->stripe_pages[i]);
+ rbio->stripe_pages[i] = NULL;
+ }
+ }
+
+ btrfs_put_bioc(rbio->bioc);
+ free_raid_bio_pointers(rbio);
+ kfree(rbio);
+}
static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
{
@@ -146,8 +175,16 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
for (i = 0; i < rbio->nr_sectors; i++) {
/* Some range not covered by bio (partial write), skip it */
- if (!rbio->bio_sectors[i].page)
+ if (!rbio->bio_sectors[i].page) {
+ /*
+ * Even if the sector is not covered by bio, if it is
+ * a data sector it should still be uptodate as it is
+ * read from disk.
+ */
+ if (i < rbio->nr_data * rbio->stripe_nsectors)
+ ASSERT(rbio->stripe_sectors[i].uptodate);
continue;
+ }
ASSERT(rbio->stripe_sectors[i].page);
memcpy_page(rbio->stripe_sectors[i].page,
@@ -234,6 +271,21 @@ static void steal_rbio_page(struct btrfs_raid_bio *src,
dest->stripe_sectors[i].uptodate = true;
}
+static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
+{
+ const int sector_nr = (page_nr << PAGE_SHIFT) >>
+ rbio->bioc->fs_info->sectorsize_bits;
+
+ /*
+ * We have ensured PAGE_SIZE is aligned with sectorsize, thus
+ * we won't have a page which is half data half parity.
+ *
+ * Thus if the first sector of the page belongs to data stripes, then
+ * the full page belongs to data stripes.
+ */
+ return (sector_nr < rbio->nr_data * rbio->stripe_nsectors);
+}
+
/*
* Stealing an rbio means taking all the uptodate pages from the stripe array
* in the source rbio and putting them into the destination rbio.
@@ -244,16 +296,26 @@ static void steal_rbio_page(struct btrfs_raid_bio *src,
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
int i;
- struct page *s;
if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
return;
for (i = 0; i < dest->nr_pages; i++) {
- s = src->stripe_pages[i];
- if (!s || !full_page_sectors_uptodate(src, i))
+ struct page *p = src->stripe_pages[i];
+
+ /*
+ * We don't need to steal P/Q pages as they will always be
+ * regenerated for RMW or full write anyway.
+ */
+ if (!is_data_stripe_page(src, i))
continue;
+ /*
+ * If @src already has RBIO_CACHE_READY_BIT, it should have
+ * all data stripe pages present and uptodate.
+ */
+ ASSERT(p);
+ ASSERT(full_page_sectors_uptodate(src, i));
steal_rbio_page(src, dest, i);
}
index_stripe_sectors(dest);
@@ -275,7 +337,6 @@ static void merge_rbio(struct btrfs_raid_bio *dest,
/* Also inherit the bitmaps from @victim. */
bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
dest->stripe_nsectors);
- dest->generic_bio_cnt += victim->generic_bio_cnt;
bio_list_init(&victim->bio_list);
}
@@ -337,7 +398,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
spin_unlock(&h->lock);
if (freeit)
- __free_raid_bio(rbio);
+ free_raid_bio(rbio);
}
/*
@@ -527,28 +588,10 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
return 0;
- if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
+ if (last->operation == BTRFS_RBIO_REBUILD_MISSING ||
+ last->operation == BTRFS_RBIO_READ_REBUILD)
return 0;
- if (last->operation == BTRFS_RBIO_READ_REBUILD) {
- int fa = last->faila;
- int fb = last->failb;
- int cur_fa = cur->faila;
- int cur_fb = cur->failb;
-
- if (last->faila >= last->failb) {
- fa = last->failb;
- fb = last->faila;
- }
-
- if (cur->faila >= cur->failb) {
- cur_fa = cur->failb;
- cur_fb = cur->faila;
- }
-
- if (fa != cur_fa || fb != cur_fb)
- return 0;
- }
return 1;
}
@@ -685,10 +728,12 @@ out:
if (cache_drop)
remove_rbio_from_cache(cache_drop);
if (freeit)
- __free_raid_bio(freeit);
+ free_raid_bio(freeit);
return ret;
}
+static void recover_rbio_work_locked(struct work_struct *work);
+
/*
* called as rmw or parity rebuild is completed. If the plug list has more
* rbios waiting for this stripe, the next one on the list will be started
@@ -746,16 +791,16 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
spin_unlock_irqrestore(&h->lock, flags);
if (next->operation == BTRFS_RBIO_READ_REBUILD)
- start_async_work(next, read_rebuild_work);
+ start_async_work(next, recover_rbio_work_locked);
else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
steal_rbio(rbio, next);
- start_async_work(next, read_rebuild_work);
+ start_async_work(next, recover_rbio_work_locked);
} else if (next->operation == BTRFS_RBIO_WRITE) {
steal_rbio(rbio, next);
- start_async_work(next, rmw_work);
+ start_async_work(next, rmw_rbio_work_locked);
} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
steal_rbio(rbio, next);
- start_async_work(next, scrub_parity_work);
+ start_async_work(next, scrub_rbio_work_locked);
}
goto done_nolock;
@@ -770,28 +815,6 @@ done_nolock:
remove_rbio_from_cache(rbio);
}
-static void __free_raid_bio(struct btrfs_raid_bio *rbio)
-{
- int i;
-
- if (!refcount_dec_and_test(&rbio->refs))
- return;
-
- WARN_ON(!list_empty(&rbio->stripe_cache));
- WARN_ON(!list_empty(&rbio->hash_list));
- WARN_ON(!bio_list_empty(&rbio->bio_list));
-
- for (i = 0; i < rbio->nr_pages; i++) {
- if (rbio->stripe_pages[i]) {
- __free_page(rbio->stripe_pages[i]);
- rbio->stripe_pages[i] = NULL;
- }
- }
-
- btrfs_put_bioc(rbio->bioc);
- kfree(rbio);
-}
-
static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
{
struct bio *next;
@@ -814,8 +837,11 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
struct bio *cur = bio_list_get(&rbio->bio_list);
struct bio *extra;
- if (rbio->generic_bio_cnt)
- btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
+ kfree(rbio->csum_buf);
+ bitmap_free(rbio->csum_bitmap);
+ rbio->csum_buf = NULL;
+ rbio->csum_bitmap = NULL;
+
/*
* Clear the data bitmap, as the rbio may be cached for later usage.
* do this before before unlock_stripe() so there will be no new bio
@@ -833,7 +859,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
*/
unlock_stripe(rbio);
extra = bio_list_get(&rbio->bio_list);
- __free_raid_bio(rbio);
+ free_raid_bio(rbio);
rbio_endio_bio_list(cur, err);
if (extra)
@@ -841,36 +867,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
}
/*
- * end io function used by finish_rmw. When we finally
- * get here, we've written a full stripe
- */
-static void raid_write_end_io(struct bio *bio)
-{
- struct btrfs_raid_bio *rbio = bio->bi_private;
- blk_status_t err = bio->bi_status;
- int max_errors;
-
- if (err)
- fail_bio_stripe(rbio, bio);
-
- bio_put(bio);
-
- if (!atomic_dec_and_test(&rbio->stripes_pending))
- return;
-
- err = BLK_STS_OK;
-
- /* OK, we have read all the stripes we need to. */
- max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
- 0 : rbio->bioc->max_errors;
- if (atomic_read(&rbio->error) > max_errors)
- err = BLK_STS_IOERR;
-
- rbio_orig_end_io(rbio, err);
-}
-
-/**
- * Get a sector pointer specified by its @stripe_nr and @sector_nr
+ * Get a sector pointer specified by its @stripe_nr and @sector_nr.
*
* @rbio: The raid bio
* @stripe_nr: Stripe number, valid range [0, real_stripe)
@@ -922,7 +919,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
const unsigned int num_sectors = stripe_nsectors * real_stripes;
struct btrfs_raid_bio *rbio;
- void *p;
/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
@@ -932,47 +928,41 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
*/
ASSERT(stripe_nsectors <= BITS_PER_LONG);
- rbio = kzalloc(sizeof(*rbio) +
- sizeof(*rbio->stripe_pages) * num_pages +
- sizeof(*rbio->bio_sectors) * num_sectors +
- sizeof(*rbio->stripe_sectors) * num_sectors +
- sizeof(*rbio->finish_pointers) * real_stripes,
- GFP_NOFS);
+ rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
if (!rbio)
return ERR_PTR(-ENOMEM);
+ rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
+ GFP_NOFS);
+ rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
+ GFP_NOFS);
+ rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
+ GFP_NOFS);
+ rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
+ rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
+
+ if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors ||
+ !rbio->finish_pointers || !rbio->error_bitmap) {
+ free_raid_bio_pointers(rbio);
+ kfree(rbio);
+ return ERR_PTR(-ENOMEM);
+ }
bio_list_init(&rbio->bio_list);
+ init_waitqueue_head(&rbio->io_wait);
INIT_LIST_HEAD(&rbio->plug_list);
spin_lock_init(&rbio->bio_list_lock);
INIT_LIST_HEAD(&rbio->stripe_cache);
INIT_LIST_HEAD(&rbio->hash_list);
+ btrfs_get_bioc(bioc);
rbio->bioc = bioc;
rbio->nr_pages = num_pages;
rbio->nr_sectors = num_sectors;
rbio->real_stripes = real_stripes;
rbio->stripe_npages = stripe_npages;
rbio->stripe_nsectors = stripe_nsectors;
- rbio->faila = -1;
- rbio->failb = -1;
refcount_set(&rbio->refs, 1);
- atomic_set(&rbio->error, 0);
atomic_set(&rbio->stripes_pending, 0);
- /*
- * The stripe_pages, bio_sectors, etc arrays point to the extra memory
- * we allocated past the end of the rbio.
- */
- p = rbio + 1;
-#define CONSUME_ALLOC(ptr, count) do { \
- ptr = p; \
- p = (unsigned char *)p + sizeof(*(ptr)) * (count); \
- } while (0)
- CONSUME_ALLOC(rbio->stripe_pages, num_pages);
- CONSUME_ALLOC(rbio->bio_sectors, num_sectors);
- CONSUME_ALLOC(rbio->stripe_sectors, num_sectors);
- CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
-#undef CONSUME_ALLOC
-
ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
@@ -1008,6 +998,45 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
}
/*
+ * Return the total numer of errors found in the vertical stripe of @sector_nr.
+ *
+ * @faila and @failb will also be updated to the first and second stripe
+ * number of the errors.
+ */
+static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
+ int *faila, int *failb)
+{
+ int stripe_nr;
+ int found_errors = 0;
+
+ if (faila || failb) {
+ /*
+ * Both @faila and @failb should be valid pointers if any of
+ * them is specified.
+ */
+ ASSERT(faila && failb);
+ *faila = -1;
+ *failb = -1;
+ }
+
+ for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
+ int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;
+
+ if (test_bit(total_sector_nr, rbio->error_bitmap)) {
+ found_errors++;
+ if (faila) {
+ /* Update faila and failb. */
+ if (*faila < 0)
+ *faila = stripe_nr;
+ else if (*failb < 0)
+ *failb = stripe_nr;
+ }
+ }
+ }
+ return found_errors;
+}
+
+/*
* Add a single sector @sector into our list of bios for IO.
*
* Return 0 if everything went well.
@@ -1040,8 +1069,19 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
disk_start = stripe->physical + sector_nr * sectorsize;
/* if the device is missing, just fail this stripe */
- if (!stripe->dev->bdev)
- return fail_rbio_index(rbio, stripe_nr);
+ if (!stripe->dev->bdev) {
+ int found_errors;
+
+ set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
+ rbio->error_bitmap);
+
+ /* Check if we have reached tolerance early. */
+ found_errors = get_rbio_veritical_errors(rbio, sector_nr,
+ NULL, NULL);
+ if (found_errors > rbio->bioc->max_errors)
+ return -EIO;
+ return 0;
+ }
/* see if we can add this page onto our existing bio */
if (last) {
@@ -1073,23 +1113,6 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
return 0;
}
-/*
- * while we're doing the read/modify/write cycle, we could
- * have errors in reading pages off the disk. This checks
- * for errors and if we're not able to read the page it'll
- * trigger parity reconstruction. The rmw will be finished
- * after we've reconstructed the failed stripes
- */
-static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
-{
- if (rbio->faila >= 0 || rbio->failb >= 0) {
- BUG_ON(rbio->faila == rbio->real_stripes - 1);
- __raid56_parity_recover(rbio);
- } else {
- finish_rmw(rbio);
- }
-}
-
static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
{
const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
@@ -1160,109 +1183,71 @@ not_found:
trace_info->stripe_nr = -1;
}
-/*
- * this is called from one of two situations. We either
- * have a full stripe from the higher layers, or we've read all
- * the missing bits off disk.
- *
- * This will calculate the parity and then send down any
- * changed blocks.
- */
-static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
+/* Generate PQ for one veritical stripe. */
+static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
{
- struct btrfs_io_context *bioc = rbio->bioc;
- const u32 sectorsize = bioc->fs_info->sectorsize;
void **pointers = rbio->finish_pointers;
- int nr_data = rbio->nr_data;
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ struct sector_ptr *sector;
+ int stripe;
+ const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;
+
+ /* First collect one sector from each data stripe */
+ for (stripe = 0; stripe < rbio->nr_data; stripe++) {
+ sector = sector_in_rbio(rbio, stripe, sectornr, 0);
+ pointers[stripe] = kmap_local_page(sector->page) +
+ sector->pgoff;
+ }
+
+ /* Then add the parity stripe */
+ sector = rbio_pstripe_sector(rbio, sectornr);
+ sector->uptodate = 1;
+ pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
+
+ if (has_qstripe) {
+ /*
+ * RAID6, add the qstripe and call the library function
+ * to fill in our p/q
+ */
+ sector = rbio_qstripe_sector(rbio, sectornr);
+ sector->uptodate = 1;
+ pointers[stripe++] = kmap_local_page(sector->page) +
+ sector->pgoff;
+
+ raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
+ pointers);
+ } else {
+ /* raid5 */
+ memcpy(pointers[rbio->nr_data], pointers[0], sectorsize);
+ run_xor(pointers + 1, rbio->nr_data - 1, sectorsize);
+ }
+ for (stripe = stripe - 1; stripe >= 0; stripe--)
+ kunmap_local(pointers[stripe]);
+}
+
+static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list)
+{
+ struct bio *bio;
/* The total sector number inside the full stripe. */
int total_sector_nr;
- int stripe;
- /* Sector number inside a stripe. */
int sectornr;
- bool has_qstripe;
- struct bio_list bio_list;
- struct bio *bio;
+ int stripe;
int ret;
- bio_list_init(&bio_list);
-
- if (rbio->real_stripes - rbio->nr_data == 1)
- has_qstripe = false;
- else if (rbio->real_stripes - rbio->nr_data == 2)
- has_qstripe = true;
- else
- BUG();
+ ASSERT(bio_list_size(bio_list) == 0);
/* We should have at least one data sector. */
ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
- /* at this point we either have a full stripe,
- * or we've read the full stripe from the drive.
- * recalculate the parity and write the new results.
- *
- * We're not allowed to add any new bios to the
- * bio list here, anyone else that wants to
- * change this stripe needs to do their own rmw.
- */
- spin_lock_irq(&rbio->bio_list_lock);
- set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
- spin_unlock_irq(&rbio->bio_list_lock);
-
- atomic_set(&rbio->error, 0);
-
/*
- * now that we've set rmw_locked, run through the
- * bio list one last time and map the page pointers
- *
- * We don't cache full rbios because we're assuming
- * the higher layers are unlikely to use this area of
- * the disk again soon. If they do use it again,
- * hopefully they will send another full bio.
+ * Reset errors, as we may have errors inherited from from degraded
+ * write.
*/
- index_rbio_pages(rbio);
- if (!rbio_is_full(rbio))
- cache_rbio_pages(rbio);
- else
- clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
-
- for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
- struct sector_ptr *sector;
-
- /* First collect one sector from each data stripe */
- for (stripe = 0; stripe < nr_data; stripe++) {
- sector = sector_in_rbio(rbio, stripe, sectornr, 0);
- pointers[stripe] = kmap_local_page(sector->page) +
- sector->pgoff;
- }
-
- /* Then add the parity stripe */
- sector = rbio_pstripe_sector(rbio, sectornr);
- sector->uptodate = 1;
- pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
-
- if (has_qstripe) {
- /*
- * RAID6, add the qstripe and call the library function
- * to fill in our p/q
- */
- sector = rbio_qstripe_sector(rbio, sectornr);
- sector->uptodate = 1;
- pointers[stripe++] = kmap_local_page(sector->page) +
- sector->pgoff;
-
- raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
- pointers);
- } else {
- /* raid5 */
- memcpy(pointers[nr_data], pointers[0], sectorsize);
- run_xor(pointers + 1, nr_data - 1, sectorsize);
- }
- for (stripe = stripe - 1; stripe >= 0; stripe--)
- kunmap_local(pointers[stripe]);
- }
+ bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
/*
- * Start writing. Make bios for everything from the higher layers (the
+ * Start assembly. Make bios for everything from the higher layers (the
* bio_list in our rbio) and our P/Q. Ignore everything else.
*/
for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
@@ -1284,15 +1269,16 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
sector = rbio_stripe_sector(rbio, stripe, sectornr);
}
- ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
sectornr, REQ_OP_WRITE);
if (ret)
- goto cleanup;
+ goto error;
}
- if (likely(!bioc->num_tgtdevs))
- goto write_data;
+ if (likely(!rbio->bioc->num_tgtdevs))
+ return 0;
+ /* Make a copy for the replace target device. */
for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
total_sector_nr++) {
struct sector_ptr *sector;
@@ -1300,7 +1286,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
stripe = total_sector_nr / rbio->stripe_nsectors;
sectornr = total_sector_nr % rbio->stripe_nsectors;
- if (!bioc->tgtdev_map[stripe]) {
+ if (!rbio->bioc->tgtdev_map[stripe]) {
/*
* We can skip the whole stripe completely, note
* total_sector_nr will be increased by one anyway.
@@ -1322,125 +1308,52 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
sector = rbio_stripe_sector(rbio, stripe, sectornr);
}
- ret = rbio_add_io_sector(rbio, &bio_list, sector,
+ ret = rbio_add_io_sector(rbio, bio_list, sector,
rbio->bioc->tgtdev_map[stripe],
sectornr, REQ_OP_WRITE);
if (ret)
- goto cleanup;
- }
-
-write_data:
- atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
- BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
-
- while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid_write_end_io;
-
- if (trace_raid56_write_stripe_enabled()) {
- struct raid56_bio_trace_info trace_info = { 0 };
-
- bio_get_trace_info(rbio, bio, &trace_info);
- trace_raid56_write_stripe(rbio, bio, &trace_info);
- }
- submit_bio(bio);
+ goto error;
}
- return;
-cleanup:
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
-
- while ((bio = bio_list_pop(&bio_list)))
+ return 0;
+error:
+ while ((bio = bio_list_pop(bio_list)))
bio_put(bio);
+ return -EIO;
}
-/*
- * helper to find the stripe number for a given bio. Used to figure out which
- * stripe has failed. This expects the bio to correspond to a physical disk,
- * so it looks up based on physical sector numbers.
- */
-static int find_bio_stripe(struct btrfs_raid_bio *rbio,
- struct bio *bio)
-{
- u64 physical = bio->bi_iter.bi_sector;
- int i;
- struct btrfs_io_stripe *stripe;
-
- physical <<= 9;
-
- for (i = 0; i < rbio->bioc->num_stripes; i++) {
- stripe = &rbio->bioc->stripes[i];
- if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) &&
- stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
- return i;
- }
- }
- return -1;
-}
-
-/*
- * helper to find the stripe number for a given
- * bio (before mapping). Used to figure out which stripe has
- * failed. This looks up based on logical block numbers.
- */
-static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
- struct bio *bio)
-{
- u64 logical = bio->bi_iter.bi_sector << 9;
- int i;
-
- for (i = 0; i < rbio->nr_data; i++) {
- u64 stripe_start = rbio->bioc->raid_map[i];
-
- if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN))
- return i;
- }
- return -1;
-}
-
-/*
- * returns -EIO if we had too many failures
- */
-static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
+static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
{
- unsigned long flags;
- int ret = 0;
+ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
+ rbio->bioc->raid_map[0];
+ int total_nr_sector = offset >> fs_info->sectorsize_bits;
- spin_lock_irqsave(&rbio->bio_list_lock, flags);
+ ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors);
- /* we already know this stripe is bad, move on */
- if (rbio->faila == failed || rbio->failb == failed)
- goto out;
+ bitmap_set(rbio->error_bitmap, total_nr_sector,
+ bio->bi_iter.bi_size >> fs_info->sectorsize_bits);
- if (rbio->faila == -1) {
- /* first failure on this rbio */
- rbio->faila = failed;
- atomic_inc(&rbio->error);
- } else if (rbio->failb == -1) {
- /* second failure on this rbio */
- rbio->failb = failed;
- atomic_inc(&rbio->error);
- } else {
- ret = -EIO;
+ /*
+ * Special handling for raid56_alloc_missing_rbio() used by
+ * scrub/replace. Unlike call path in raid56_parity_recover(), they
+ * pass an empty bio here. Thus we have to find out the missing device
+ * and mark the stripe error instead.
+ */
+ if (bio->bi_iter.bi_size == 0) {
+ bool found_missing = false;
+ int stripe_nr;
+
+ for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
+ if (!rbio->bioc->stripes[stripe_nr].dev->bdev) {
+ found_missing = true;
+ bitmap_set(rbio->error_bitmap,
+ stripe_nr * rbio->stripe_nsectors,
+ rbio->stripe_nsectors);
+ }
+ }
+ ASSERT(found_missing);
}
-out:
- spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
-
- return ret;
-}
-
-/*
- * helper to fail a stripe based on a physical disk
- * bio.
- */
-static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
- struct bio *bio)
-{
- int failed = find_bio_stripe(rbio, bio);
-
- if (failed < 0)
- return -EIO;
-
- return fail_rbio_index(rbio, failed);
}
/*
@@ -1488,193 +1401,163 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
}
}
-static void raid56_bio_end_io(struct bio *bio)
+static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
{
- struct btrfs_raid_bio *rbio = bio->bi_private;
-
- if (bio->bi_status)
- fail_bio_stripe(rbio, bio);
- else
- set_bio_pages_uptodate(rbio, bio);
+ struct bio_vec *bv = bio_first_bvec_all(bio);
+ int i;
- bio_put(bio);
+ for (i = 0; i < rbio->nr_sectors; i++) {
+ struct sector_ptr *sector;
- if (atomic_dec_and_test(&rbio->stripes_pending))
- queue_work(rbio->bioc->fs_info->endio_raid56_workers,
- &rbio->end_io_work);
+ sector = &rbio->stripe_sectors[i];
+ if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
+ break;
+ sector = &rbio->bio_sectors[i];
+ if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
+ break;
+ }
+ ASSERT(i < rbio->nr_sectors);
+ return i;
}
-/*
- * End io handler for the read phase of the RMW cycle. All the bios here are
- * physical stripe bios we've read from the disk so we can recalculate the
- * parity of the stripe.
- *
- * This will usually kick off finish_rmw once all the bios are read in, but it
- * may trigger parity reconstruction if we had any errors along the way
- */
-static void raid56_rmw_end_io_work(struct work_struct *work)
+static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio)
{
- struct btrfs_raid_bio *rbio =
- container_of(work, struct btrfs_raid_bio, end_io_work);
+ int total_sector_nr = get_bio_sector_nr(rbio, bio);
+ u32 bio_size = 0;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
- if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
- return;
- }
+ bio_for_each_segment_all(bvec, bio, iter_all)
+ bio_size += bvec->bv_len;
- /*
- * This will normally call finish_rmw to start our write but if there
- * are any failed stripes we'll reconstruct from parity first.
- */
- validate_rbio_for_rmw(rbio);
+ bitmap_set(rbio->error_bitmap, total_sector_nr,
+ bio_size >> rbio->bioc->fs_info->sectorsize_bits);
}
-/*
- * the stripe must be locked by the caller. It will
- * unlock after all the writes are done
- */
-static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
+/* Verify the data sectors at read time. */
+static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
+ struct bio *bio)
{
- int bios_to_read = 0;
- struct bio_list bio_list;
- const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data;
- int ret;
- int total_sector_nr;
- struct bio *bio;
+ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ int total_sector_nr = get_bio_sector_nr(rbio, bio);
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
- bio_list_init(&bio_list);
+ /* No data csum for the whole stripe, no need to verify. */
+ if (!rbio->csum_bitmap || !rbio->csum_buf)
+ return;
- ret = alloc_rbio_pages(rbio);
- if (ret)
- goto cleanup;
+ /* P/Q stripes, they have no data csum to verify against. */
+ if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
+ return;
- index_rbio_pages(rbio);
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ int bv_offset;
+
+ for (bv_offset = bvec->bv_offset;
+ bv_offset < bvec->bv_offset + bvec->bv_len;
+ bv_offset += fs_info->sectorsize, total_sector_nr++) {
+ u8 csum_buf[BTRFS_CSUM_SIZE];
+ u8 *expected_csum = rbio->csum_buf +
+ total_sector_nr * fs_info->csum_size;
+ int ret;
- atomic_set(&rbio->error, 0);
- /* Build a list of bios to read all the missing data sectors. */
- for (total_sector_nr = 0; total_sector_nr < nr_data_sectors;
- total_sector_nr++) {
- struct sector_ptr *sector;
- int stripe = total_sector_nr / rbio->stripe_nsectors;
- int sectornr = total_sector_nr % rbio->stripe_nsectors;
+ /* No csum for this sector, skip to the next sector. */
+ if (!test_bit(total_sector_nr, rbio->csum_bitmap))
+ continue;
- /*
- * We want to find all the sectors missing from the rbio and
- * read them from the disk. If sector_in_rbio() finds a page
- * in the bio list we don't need to read it off the stripe.
- */
- sector = sector_in_rbio(rbio, stripe, sectornr, 1);
- if (sector)
- continue;
+ ret = btrfs_check_sector_csum(fs_info, bvec->bv_page,
+ bv_offset, csum_buf, expected_csum);
+ if (ret < 0)
+ set_bit(total_sector_nr, rbio->error_bitmap);
+ }
+ }
+}
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
- /*
- * The bio cache may have handed us an uptodate page. If so,
- * use it.
- */
- if (sector->uptodate)
- continue;
+static void raid_wait_read_end_io(struct bio *bio)
+{
+ struct btrfs_raid_bio *rbio = bio->bi_private;
- ret = rbio_add_io_sector(rbio, &bio_list, sector,
- stripe, sectornr, REQ_OP_READ);
- if (ret)
- goto cleanup;
+ if (bio->bi_status) {
+ rbio_update_error_bitmap(rbio, bio);
+ } else {
+ set_bio_pages_uptodate(rbio, bio);
+ verify_bio_data_sectors(rbio, bio);
}
- bios_to_read = bio_list_size(&bio_list);
- if (!bios_to_read) {
- /*
- * this can happen if others have merged with
- * us, it means there is nothing left to read.
- * But if there are missing devices it may not be
- * safe to do the full stripe write yet.
- */
- goto finish;
- }
+ bio_put(bio);
+ if (atomic_dec_and_test(&rbio->stripes_pending))
+ wake_up(&rbio->io_wait);
+}
- /*
- * The bioc may be freed once we submit the last bio. Make sure not to
- * touch it after that.
- */
- atomic_set(&rbio->stripes_pending, bios_to_read);
- INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
- while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid56_bio_end_io;
+static void submit_read_bios(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list)
+{
+ struct bio *bio;
+
+ atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
+ while ((bio = bio_list_pop(bio_list))) {
+ bio->bi_end_io = raid_wait_read_end_io;
- if (trace_raid56_read_partial_enabled()) {
+ if (trace_raid56_scrub_read_recover_enabled()) {
struct raid56_bio_trace_info trace_info = { 0 };
bio_get_trace_info(rbio, bio, &trace_info);
- trace_raid56_read_partial(rbio, bio, &trace_info);
+ trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
}
submit_bio(bio);
}
- /* the actual write will happen once the reads are done */
- return 0;
+}
-cleanup:
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
+static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list)
+{
+ struct bio *bio;
+ int total_sector_nr;
+ int ret = 0;
- while ((bio = bio_list_pop(&bio_list)))
- bio_put(bio);
+ ASSERT(bio_list_size(bio_list) == 0);
- return -EIO;
+ /*
+ * Build a list of bios to read all sectors (including data and P/Q).
+ *
+ * This behaviro is to compensate the later csum verification and
+ * recovery.
+ */
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ struct sector_ptr *sector;
+ int stripe = total_sector_nr / rbio->stripe_nsectors;
+ int sectornr = total_sector_nr % rbio->stripe_nsectors;
-finish:
- validate_rbio_for_rmw(rbio);
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ ret = rbio_add_io_sector(rbio, bio_list, sector,
+ stripe, sectornr, REQ_OP_READ);
+ if (ret)
+ goto cleanup;
+ }
return 0;
+
+cleanup:
+ while ((bio = bio_list_pop(bio_list)))
+ bio_put(bio);
+ return ret;
}
-/*
- * if the upper layers pass in a full stripe, we thank them by only allocating
- * enough pages to hold the parity, and sending it all down quickly.
- */
-static int full_stripe_write(struct btrfs_raid_bio *rbio)
+static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
{
+ const int data_pages = rbio->nr_data * rbio->stripe_npages;
int ret;
- ret = alloc_rbio_parity_pages(rbio);
- if (ret) {
- __free_raid_bio(rbio);
+ ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages);
+ if (ret < 0)
return ret;
- }
-
- ret = lock_stripe_add(rbio);
- if (ret == 0)
- finish_rmw(rbio);
- return 0;
-}
-/*
- * partial stripe writes get handed over to async helpers.
- * We're really hoping to merge a few more writes into this
- * rbio before calculating new parity
- */
-static int partial_stripe_write(struct btrfs_raid_bio *rbio)
-{
- int ret;
-
- ret = lock_stripe_add(rbio);
- if (ret == 0)
- start_async_work(rbio, rmw_work);
+ index_stripe_sectors(rbio);
return 0;
}
/*
- * sometimes while we were reading from the drive to
- * recalculate parity, enough new bios come into create
- * a full stripe. So we do a check here to see if we can
- * go directly to finish_rmw
- */
-static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
-{
- /* head off into rmw land if we don't have a full stripe */
- if (!rbio_is_full(rbio))
- return partial_stripe_write(rbio);
- return full_stripe_write(rbio);
-}
-
-/*
* We use plugging call backs to collect full stripes.
* Any time we get a partial stripe write while plugged
* we collect it into a list. When the unplug comes down,
@@ -1708,71 +1591,39 @@ static int plug_cmp(void *priv, const struct list_head *a,
return 0;
}
-static void run_plug(struct btrfs_plug_cb *plug)
+static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
+ struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
struct btrfs_raid_bio *cur;
struct btrfs_raid_bio *last = NULL;
- /*
- * sort our plug list then try to merge
- * everything we can in hopes of creating full
- * stripes.
- */
list_sort(NULL, &plug->rbio_list, plug_cmp);
+
while (!list_empty(&plug->rbio_list)) {
cur = list_entry(plug->rbio_list.next,
struct btrfs_raid_bio, plug_list);
list_del_init(&cur->plug_list);
if (rbio_is_full(cur)) {
- int ret;
-
- /* we have a full stripe, send it down */
- ret = full_stripe_write(cur);
- BUG_ON(ret);
+ /* We have a full stripe, queue it down. */
+ start_async_work(cur, rmw_rbio_work);
continue;
}
if (last) {
if (rbio_can_merge(last, cur)) {
merge_rbio(last, cur);
- __free_raid_bio(cur);
+ free_raid_bio(cur);
continue;
-
}
- __raid56_parity_write(last);
+ start_async_work(last, rmw_rbio_work);
}
last = cur;
}
- if (last) {
- __raid56_parity_write(last);
- }
+ if (last)
+ start_async_work(last, rmw_rbio_work);
kfree(plug);
}
-/*
- * if the unplug comes from schedule, we have to push the
- * work off to a helper thread
- */
-static void unplug_work(struct work_struct *work)
-{
- struct btrfs_plug_cb *plug;
- plug = container_of(work, struct btrfs_plug_cb, work);
- run_plug(plug);
-}
-
-static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
-{
- struct btrfs_plug_cb *plug;
- plug = container_of(cb, struct btrfs_plug_cb, cb);
-
- if (from_schedule) {
- INIT_WORK(&plug->work, unplug_work);
- queue_work(plug->info->rmw_workers, &plug->work);
- return;
- }
- run_plug(plug);
-}
-
/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
{
@@ -1813,27 +1664,20 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
rbio = alloc_rbio(fs_info, bioc);
if (IS_ERR(rbio)) {
- btrfs_put_bioc(bioc);
ret = PTR_ERR(rbio);
- goto out_dec_counter;
+ goto fail;
}
rbio->operation = BTRFS_RBIO_WRITE;
rbio_add_bio(rbio, bio);
- rbio->generic_bio_cnt = 1;
-
/*
- * don't plug on full rbios, just get them out the door
+ * Don't plug on full rbios, just get them out the door
* as quickly as we can
*/
- if (rbio_is_full(rbio)) {
- ret = full_stripe_write(rbio);
- if (ret)
- goto out_dec_counter;
- return;
- }
+ if (rbio_is_full(rbio))
+ goto queue_rbio;
- cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
+ cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
if (cb) {
plug = container_of(cb, struct btrfs_plug_cb, cb);
if (!plug->info) {
@@ -1841,282 +1685,270 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
INIT_LIST_HEAD(&plug->rbio_list);
}
list_add_tail(&rbio->plug_list, &plug->rbio_list);
- } else {
- ret = __raid56_parity_write(rbio);
- if (ret)
- goto out_dec_counter;
+ return;
}
+queue_rbio:
+ /*
+ * Either we don't have any existing plug, or we're doing a full stripe,
+ * can queue the rmw work now.
+ */
+ start_async_work(rbio, rmw_rbio_work);
return;
-out_dec_counter:
- btrfs_bio_counter_dec(fs_info);
+fail:
bio->bi_status = errno_to_blk_status(ret);
bio_endio(bio);
}
-/*
- * all parity reconstruction happens here. We've read in everything
- * we can find from the drives and this does the heavy lifting of
- * sorting the good from the bad.
- */
-static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
+static int verify_one_sector(struct btrfs_raid_bio *rbio,
+ int stripe_nr, int sector_nr)
{
- const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
- int sectornr, stripe;
- void **pointers;
- void **unmap_array;
- int faila = -1, failb = -1;
- blk_status_t err;
- int i;
+ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ struct sector_ptr *sector;
+ u8 csum_buf[BTRFS_CSUM_SIZE];
+ u8 *csum_expected;
+ int ret;
- /*
- * This array stores the pointer for each sector, thus it has the extra
- * pgoff value added from each sector
- */
- pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
- if (!pointers) {
- err = BLK_STS_RESOURCE;
- goto cleanup_io;
- }
+ if (!rbio->csum_bitmap || !rbio->csum_buf)
+ return 0;
+ /* No way to verify P/Q as they are not covered by data csum. */
+ if (stripe_nr >= rbio->nr_data)
+ return 0;
/*
- * Store copy of pointers that does not get reordered during
- * reconstruction so that kunmap_local works.
+ * If we're rebuilding a read, we have to use pages from the
+ * bio list if possible.
*/
- unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
- if (!unmap_array) {
- err = BLK_STS_RESOURCE;
- goto cleanup_pointers;
+ if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) {
+ sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
+ } else {
+ sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
}
- faila = rbio->faila;
- failb = rbio->failb;
+ ASSERT(sector->page);
- if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
- rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
- spin_lock_irq(&rbio->bio_list_lock);
- set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
- spin_unlock_irq(&rbio->bio_list_lock);
- }
+ csum_expected = rbio->csum_buf +
+ (stripe_nr * rbio->stripe_nsectors + sector_nr) *
+ fs_info->csum_size;
+ ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff,
+ csum_buf, csum_expected);
+ return ret;
+}
- index_rbio_pages(rbio);
+/*
+ * Recover a vertical stripe specified by @sector_nr.
+ * @*pointers are the pre-allocated pointers by the caller, so we don't
+ * need to allocate/free the pointers again and again.
+ */
+static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
+ void **pointers, void **unmap_array)
+{
+ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ struct sector_ptr *sector;
+ const u32 sectorsize = fs_info->sectorsize;
+ int found_errors;
+ int faila;
+ int failb;
+ int stripe_nr;
+ int ret = 0;
- for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
- struct sector_ptr *sector;
+ /*
+ * Now we just use bitmap to mark the horizontal stripes in
+ * which we have data when doing parity scrub.
+ */
+ if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
+ !test_bit(sector_nr, &rbio->dbitmap))
+ return 0;
- /*
- * Now we just use bitmap to mark the horizontal stripes in
- * which we have data when doing parity scrub.
- */
- if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
- !test_bit(sectornr, &rbio->dbitmap))
- continue;
+ found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
+ &failb);
+ /*
+ * No errors in the veritical stripe, skip it. Can happen for recovery
+ * which only part of a stripe failed csum check.
+ */
+ if (!found_errors)
+ return 0;
+
+ if (found_errors > rbio->bioc->max_errors)
+ return -EIO;
+ /*
+ * Setup our array of pointers with sectors from each stripe
+ *
+ * NOTE: store a duplicate array of pointers to preserve the
+ * pointer order.
+ */
+ for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
/*
- * Setup our array of pointers with sectors from each stripe
- *
- * NOTE: store a duplicate array of pointers to preserve the
- * pointer order
+ * If we're rebuilding a read, we have to use pages from the
+ * bio list if possible.
*/
- for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- /*
- * If we're rebuilding a read, we have to use
- * pages from the bio list
- */
- if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
- rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
- (stripe == faila || stripe == failb)) {
- sector = sector_in_rbio(rbio, stripe, sectornr, 0);
- } else {
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
- }
- ASSERT(sector->page);
- pointers[stripe] = kmap_local_page(sector->page) +
- sector->pgoff;
- unmap_array[stripe] = pointers[stripe];
+ if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) {
+ sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
+ } else {
+ sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
}
+ ASSERT(sector->page);
+ pointers[stripe_nr] = kmap_local_page(sector->page) +
+ sector->pgoff;
+ unmap_array[stripe_nr] = pointers[stripe_nr];
+ }
- /* All raid6 handling here */
- if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
- /* Single failure, rebuild from parity raid5 style */
- if (failb < 0) {
- if (faila == rbio->nr_data) {
- /*
- * Just the P stripe has failed, without
- * a bad data or Q stripe.
- * TODO, we should redo the xor here.
- */
- err = BLK_STS_IOERR;
- goto cleanup;
- }
+ /* All raid6 handling here */
+ if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
+ /* Single failure, rebuild from parity raid5 style */
+ if (failb < 0) {
+ if (faila == rbio->nr_data)
/*
- * a single failure in raid6 is rebuilt
- * in the pstripe code below
+ * Just the P stripe has failed, without
+ * a bad data or Q stripe.
+ * We have nothing to do, just skip the
+ * recovery for this stripe.
*/
- goto pstripe;
- }
-
- /* make sure our ps and qs are in order */
- if (faila > failb)
- swap(faila, failb);
-
- /* if the q stripe is failed, do a pstripe reconstruction
- * from the xors.
- * If both the q stripe and the P stripe are failed, we're
- * here due to a crc mismatch and we can't give them the
- * data they want
+ goto cleanup;
+ /*
+ * a single failure in raid6 is rebuilt
+ * in the pstripe code below
*/
- if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
- if (rbio->bioc->raid_map[faila] ==
- RAID5_P_STRIPE) {
- err = BLK_STS_IOERR;
- goto cleanup;
- }
+ goto pstripe;
+ }
+
+ /*
+ * If the q stripe is failed, do a pstripe reconstruction from
+ * the xors.
+ * If both the q stripe and the P stripe are failed, we're
+ * here due to a crc mismatch and we can't give them the
+ * data they want.
+ */
+ if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
+ if (rbio->bioc->raid_map[faila] ==
+ RAID5_P_STRIPE)
/*
- * otherwise we have one bad data stripe and
- * a good P stripe. raid5!
+ * Only P and Q are corrupted.
+ * We only care about data stripes recovery,
+ * can skip this vertical stripe.
*/
- goto pstripe;
- }
+ goto cleanup;
+ /*
+ * Otherwise we have one bad data stripe and
+ * a good P stripe. raid5!
+ */
+ goto pstripe;
+ }
- if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
- raid6_datap_recov(rbio->real_stripes,
- sectorsize, faila, pointers);
- } else {
- raid6_2data_recov(rbio->real_stripes,
- sectorsize, faila, failb,
- pointers);
- }
+ if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
+ raid6_datap_recov(rbio->real_stripes, sectorsize,
+ faila, pointers);
} else {
- void *p;
+ raid6_2data_recov(rbio->real_stripes, sectorsize,
+ faila, failb, pointers);
+ }
+ } else {
+ void *p;
- /* rebuild from P stripe here (raid5 or raid6) */
- BUG_ON(failb != -1);
+ /* Rebuild from P stripe here (raid5 or raid6). */
+ ASSERT(failb == -1);
pstripe:
- /* Copy parity block into failed block to start with */
- memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
+ /* Copy parity block into failed block to start with */
+ memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
- /* rearrange the pointer array */
- p = pointers[faila];
- for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
- pointers[stripe] = pointers[stripe + 1];
- pointers[rbio->nr_data - 1] = p;
+ /* Rearrange the pointer array */
+ p = pointers[faila];
+ for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1;
+ stripe_nr++)
+ pointers[stripe_nr] = pointers[stripe_nr + 1];
+ pointers[rbio->nr_data - 1] = p;
- /* xor in the rest */
- run_xor(pointers, rbio->nr_data - 1, sectorsize);
- }
- /* if we're doing this rebuild as part of an rmw, go through
- * and set all of our private rbio pages in the
- * failed stripes as uptodate. This way finish_rmw will
- * know they can be trusted. If this was a read reconstruction,
- * other endio functions will fiddle the uptodate bits
- */
- if (rbio->operation == BTRFS_RBIO_WRITE) {
- for (i = 0; i < rbio->stripe_nsectors; i++) {
- if (faila != -1) {
- sector = rbio_stripe_sector(rbio, faila, i);
- sector->uptodate = 1;
- }
- if (failb != -1) {
- sector = rbio_stripe_sector(rbio, failb, i);
- sector->uptodate = 1;
- }
- }
- }
- for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
- kunmap_local(unmap_array[stripe]);
+ /* Xor in the rest */
+ run_xor(pointers, rbio->nr_data - 1, sectorsize);
+
+ }
+
+ /*
+ * No matter if this is a RMW or recovery, we should have all
+ * failed sectors repaired in the vertical stripe, thus they are now
+ * uptodate.
+ * Especially if we determine to cache the rbio, we need to
+ * have at least all data sectors uptodate.
+ *
+ * If possible, also check if the repaired sector matches its data
+ * checksum.
+ */
+ if (faila >= 0) {
+ ret = verify_one_sector(rbio, faila, sector_nr);
+ if (ret < 0)
+ goto cleanup;
+
+ sector = rbio_stripe_sector(rbio, faila, sector_nr);
+ sector->uptodate = 1;
+ }
+ if (failb >= 0) {
+ ret = verify_one_sector(rbio, faila, sector_nr);
+ if (ret < 0)
+ goto cleanup;
+
+ sector = rbio_stripe_sector(rbio, failb, sector_nr);
+ sector->uptodate = 1;
}
- err = BLK_STS_OK;
cleanup:
- kfree(unmap_array);
-cleanup_pointers:
- kfree(pointers);
+ for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
+ kunmap_local(unmap_array[stripe_nr]);
+ return ret;
+}
+
+static int recover_sectors(struct btrfs_raid_bio *rbio)
+{
+ void **pointers = NULL;
+ void **unmap_array = NULL;
+ int sectornr;
+ int ret = 0;
-cleanup_io:
/*
- * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
- * valid rbio which is consistent with ondisk content, thus such a
- * valid rbio can be cached to avoid further disk reads.
+ * @pointers array stores the pointer for each sector.
+ *
+ * @unmap_array stores copy of pointers that does not get reordered
+ * during reconstruction so that kunmap_local works.
*/
+ pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+ unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+ if (!pointers || !unmap_array) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
- /*
- * - In case of two failures, where rbio->failb != -1:
- *
- * Do not cache this rbio since the above read reconstruction
- * (raid6_datap_recov() or raid6_2data_recov()) may have
- * changed some content of stripes which are not identical to
- * on-disk content any more, otherwise, a later write/recover
- * may steal stripe_pages from this rbio and end up with
- * corruptions or rebuild failures.
- *
- * - In case of single failure, where rbio->failb == -1:
- *
- * Cache this rbio iff the above read reconstruction is
- * executed without problems.
- */
- if (err == BLK_STS_OK && rbio->failb < 0)
- cache_rbio_pages(rbio);
- else
- clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+ spin_lock_irq(&rbio->bio_list_lock);
+ set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+ spin_unlock_irq(&rbio->bio_list_lock);
+ }
- rbio_orig_end_io(rbio, err);
- } else if (err == BLK_STS_OK) {
- rbio->faila = -1;
- rbio->failb = -1;
+ index_rbio_pages(rbio);
- if (rbio->operation == BTRFS_RBIO_WRITE)
- finish_rmw(rbio);
- else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
- finish_parity_scrub(rbio, 0);
- else
- BUG();
- } else {
- rbio_orig_end_io(rbio, err);
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
+ if (ret < 0)
+ break;
}
-}
-/*
- * This is called only for stripes we've read from disk to reconstruct the
- * parity.
- */
-static void raid_recover_end_io_work(struct work_struct *work)
-{
- struct btrfs_raid_bio *rbio =
- container_of(work, struct btrfs_raid_bio, end_io_work);
-
- if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
- else
- __raid_recover_end_io(rbio);
+out:
+ kfree(pointers);
+ kfree(unmap_array);
+ return ret;
}
-/*
- * reads everything we need off the disk to reconstruct
- * the parity. endio handlers trigger final reconstruction
- * when the IO is done.
- *
- * This is used both for reads from the higher layers and for
- * parity construction required to finish a rmw cycle.
- */
-static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
+static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list)
{
- int bios_to_read = 0;
- struct bio_list bio_list;
- int ret;
- int total_sector_nr;
struct bio *bio;
+ int total_sector_nr;
+ int ret = 0;
- bio_list_init(&bio_list);
-
- ret = alloc_rbio_pages(rbio);
- if (ret)
- goto cleanup;
-
- atomic_set(&rbio->error, 0);
-
+ ASSERT(bio_list_size(bio_list) == 0);
/*
* Read everything that hasn't failed. However this time we will
* not trust any cached sector.
@@ -2131,64 +1963,139 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
int sectornr = total_sector_nr % rbio->stripe_nsectors;
struct sector_ptr *sector;
- if (rbio->faila == stripe || rbio->failb == stripe) {
- atomic_inc(&rbio->error);
- /* Skip the current stripe. */
- ASSERT(sectornr == 0);
- total_sector_nr += rbio->stripe_nsectors - 1;
+ /*
+ * Skip the range which has error. It can be a range which is
+ * marked error (for csum mismatch), or it can be a missing
+ * device.
+ */
+ if (!rbio->bioc->stripes[stripe].dev->bdev ||
+ test_bit(total_sector_nr, rbio->error_bitmap)) {
+ /*
+ * Also set the error bit for missing device, which
+ * may not yet have its error bit set.
+ */
+ set_bit(total_sector_nr, rbio->error_bitmap);
continue;
}
+
sector = rbio_stripe_sector(rbio, stripe, sectornr);
- ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
sectornr, REQ_OP_READ);
if (ret < 0)
- goto cleanup;
+ goto error;
}
+ return 0;
+error:
+ while ((bio = bio_list_pop(bio_list)))
+ bio_put(bio);
- bios_to_read = bio_list_size(&bio_list);
- if (!bios_to_read) {
- /*
- * we might have no bios to read just because the pages
- * were up to date, or we might have no bios to read because
- * the devices were gone.
- */
- if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
- __raid_recover_end_io(rbio);
- return 0;
- } else {
- goto cleanup;
- }
- }
+ return -EIO;
+}
+
+static int recover_rbio(struct btrfs_raid_bio *rbio)
+{
+ struct bio_list bio_list;
+ struct bio *bio;
+ int ret;
/*
- * The bioc may be freed once we submit the last bio. Make sure not to
- * touch it after that.
+ * Either we're doing recover for a read failure or degraded write,
+ * caller should have set error bitmap correctly.
*/
- atomic_set(&rbio->stripes_pending, bios_to_read);
- INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
- while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid56_bio_end_io;
+ ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));
+ bio_list_init(&bio_list);
- if (trace_raid56_scrub_read_recover_enabled()) {
- struct raid56_bio_trace_info trace_info = { 0 };
+ /* For recovery, we need to read all sectors including P/Q. */
+ ret = alloc_rbio_pages(rbio);
+ if (ret < 0)
+ goto out;
- bio_get_trace_info(rbio, bio, &trace_info);
- trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
- }
- submit_bio(bio);
- }
+ index_rbio_pages(rbio);
- return 0;
+ ret = recover_assemble_read_bios(rbio, &bio_list);
+ if (ret < 0)
+ goto out;
-cleanup:
- if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
- rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
+ submit_read_bios(rbio, &bio_list);
+ wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+ ret = recover_sectors(rbio);
+
+out:
while ((bio = bio_list_pop(&bio_list)))
bio_put(bio);
- return -EIO;
+ return ret;
+}
+
+static void recover_rbio_work(struct work_struct *work)
+{
+ struct btrfs_raid_bio *rbio;
+ int ret;
+
+ rbio = container_of(work, struct btrfs_raid_bio, work);
+
+ ret = lock_stripe_add(rbio);
+ if (ret == 0) {
+ ret = recover_rbio(rbio);
+ rbio_orig_end_io(rbio, errno_to_blk_status(ret));
+ }
+}
+
+static void recover_rbio_work_locked(struct work_struct *work)
+{
+ struct btrfs_raid_bio *rbio;
+ int ret;
+
+ rbio = container_of(work, struct btrfs_raid_bio, work);
+
+ ret = recover_rbio(rbio);
+ rbio_orig_end_io(rbio, errno_to_blk_status(ret));
+}
+
+static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
+{
+ bool found = false;
+ int sector_nr;
+
+ /*
+ * This is for RAID6 extra recovery tries, thus mirror number should
+ * be large than 2.
+ * Mirror 1 means read from data stripes. Mirror 2 means rebuild using
+ * RAID5 methods.
+ */
+ ASSERT(mirror_num > 2);
+ for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
+ int found_errors;
+ int faila;
+ int failb;
+
+ found_errors = get_rbio_veritical_errors(rbio, sector_nr,
+ &faila, &failb);
+ /* This vertical stripe doesn't have errors. */
+ if (!found_errors)
+ continue;
+
+ /*
+ * If we found errors, there should be only one error marked
+ * by previous set_rbio_range_error().
+ */
+ ASSERT(found_errors == 1);
+ found = true;
+
+ /* Now select another stripe to mark as error. */
+ failb = rbio->real_stripes - (mirror_num - 1);
+ if (failb <= faila)
+ failb--;
+
+ /* Set the extra bit in error bitmap. */
+ if (failb >= 0)
+ set_bit(failb * rbio->stripe_nsectors + sector_nr,
+ rbio->error_bitmap);
+ }
+
+ /* We should found at least one vertical stripe with error.*/
+ ASSERT(found);
}
/*
@@ -2198,88 +2105,292 @@ cleanup:
* of the drive.
*/
void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
- int mirror_num, bool generic_io)
+ int mirror_num)
{
struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
- if (generic_io) {
- ASSERT(bioc->mirror_num == mirror_num);
- btrfs_bio(bio)->mirror_num = mirror_num;
- } else {
- btrfs_get_bioc(bioc);
- }
-
rbio = alloc_rbio(fs_info, bioc);
if (IS_ERR(rbio)) {
bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
- goto out_end_bio;
+ bio_endio(bio);
+ return;
}
rbio->operation = BTRFS_RBIO_READ_REBUILD;
rbio_add_bio(rbio, bio);
- rbio->faila = find_logical_bio_stripe(rbio, bio);
- if (rbio->faila == -1) {
- btrfs_warn(fs_info,
-"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
- __func__, bio->bi_iter.bi_sector << 9,
- (u64)bio->bi_iter.bi_size, bioc->map_type);
- kfree(rbio);
- bio->bi_status = BLK_STS_IOERR;
- goto out_end_bio;
- }
-
- if (generic_io)
- rbio->generic_bio_cnt = 1;
+ set_rbio_range_error(rbio, bio);
/*
* Loop retry:
* for 'mirror == 2', reconstruct from all other stripes.
* for 'mirror_num > 2', select a stripe to fail on every retry.
*/
- if (mirror_num > 2) {
+ if (mirror_num > 2)
+ set_rbio_raid6_extra_error(rbio, mirror_num);
+
+ start_async_work(rbio, recover_rbio_work);
+}
+
+static void fill_data_csums(struct btrfs_raid_bio *rbio)
+{
+ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
+ rbio->bioc->raid_map[0]);
+ const u64 start = rbio->bioc->raid_map[0];
+ const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
+ fs_info->sectorsize_bits;
+ int ret;
+
+ /* The rbio should not have its csum buffer initialized. */
+ ASSERT(!rbio->csum_buf && !rbio->csum_bitmap);
+
+ /*
+ * Skip the csum search if:
+ *
+ * - The rbio doesn't belong to data block groups
+ * Then we are doing IO for tree blocks, no need to search csums.
+ *
+ * - The rbio belongs to mixed block groups
+ * This is to avoid deadlock, as we're already holding the full
+ * stripe lock, if we trigger a metadata read, and it needs to do
+ * raid56 recovery, we will deadlock.
+ */
+ if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) ||
+ rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA)
+ return;
+
+ rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors *
+ fs_info->csum_size, GFP_NOFS);
+ rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors,
+ GFP_NOFS);
+ if (!rbio->csum_buf || !rbio->csum_bitmap) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1,
+ rbio->csum_buf, rbio->csum_bitmap);
+ if (ret < 0)
+ goto error;
+ if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
+ goto no_csum;
+ return;
+
+error:
+ /*
+ * We failed to allocate memory or grab the csum, but it's not fatal,
+ * we can still continue. But better to warn users that RMW is no
+ * longer safe for this particular sub-stripe write.
+ */
+ btrfs_warn_rl(fs_info,
+"sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
+ rbio->bioc->raid_map[0], ret);
+no_csum:
+ kfree(rbio->csum_buf);
+ bitmap_free(rbio->csum_bitmap);
+ rbio->csum_buf = NULL;
+ rbio->csum_bitmap = NULL;
+}
+
+static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
+{
+ struct bio_list bio_list;
+ struct bio *bio;
+ int ret;
+
+ bio_list_init(&bio_list);
+
+ /*
+ * Fill the data csums we need for data verification. We need to fill
+ * the csum_bitmap/csum_buf first, as our endio function will try to
+ * verify the data sectors.
+ */
+ fill_data_csums(rbio);
+
+ ret = rmw_assemble_read_bios(rbio, &bio_list);
+ if (ret < 0)
+ goto out;
+
+ submit_read_bios(rbio, &bio_list);
+ wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+
+ /*
+ * We may or may not have any corrupted sectors (including missing dev
+ * and csum mismatch), just let recover_sectors() to handle them all.
+ */
+ ret = recover_sectors(rbio);
+ return ret;
+out:
+ while ((bio = bio_list_pop(&bio_list)))
+ bio_put(bio);
+
+ return ret;
+}
+
+static void raid_wait_write_end_io(struct bio *bio)
+{
+ struct btrfs_raid_bio *rbio = bio->bi_private;
+ blk_status_t err = bio->bi_status;
+
+ if (err)
+ rbio_update_error_bitmap(rbio, bio);
+ bio_put(bio);
+ if (atomic_dec_and_test(&rbio->stripes_pending))
+ wake_up(&rbio->io_wait);
+}
+
+static void submit_write_bios(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list)
+{
+ struct bio *bio;
+
+ atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
+ while ((bio = bio_list_pop(bio_list))) {
+ bio->bi_end_io = raid_wait_write_end_io;
+
+ if (trace_raid56_write_stripe_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_write_stripe(rbio, bio, &trace_info);
+ }
+ submit_bio(bio);
+ }
+}
+
+/*
+ * To determine if we need to read any sector from the disk.
+ * Should only be utilized in RMW path, to skip cached rbio.
+ */
+static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
+{
+ int i;
+
+ for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
+ struct sector_ptr *sector = &rbio->stripe_sectors[i];
+
/*
- * 'mirror == 3' is to fail the p stripe and
- * reconstruct from the q stripe. 'mirror > 3' is to
- * fail a data stripe and reconstruct from p+q stripe.
+ * We have a sector which doesn't have page nor uptodate,
+ * thus this rbio can not be cached one, as cached one must
+ * have all its data sectors present and uptodate.
*/
- rbio->failb = rbio->real_stripes - (mirror_num - 1);
- ASSERT(rbio->failb > 0);
- if (rbio->failb <= rbio->faila)
- rbio->failb--;
+ if (!sector->page || !sector->uptodate)
+ return true;
}
+ return false;
+}
- if (lock_stripe_add(rbio))
- return;
+static int rmw_rbio(struct btrfs_raid_bio *rbio)
+{
+ struct bio_list bio_list;
+ int sectornr;
+ int ret = 0;
/*
- * This adds our rbio to the list of rbios that will be handled after
- * the current lock owner is done.
+ * Allocate the pages for parity first, as P/Q pages will always be
+ * needed for both full-stripe and sub-stripe writes.
*/
- __raid56_parity_recover(rbio);
- return;
+ ret = alloc_rbio_parity_pages(rbio);
+ if (ret < 0)
+ return ret;
-out_end_bio:
- btrfs_bio_counter_dec(fs_info);
- btrfs_put_bioc(bioc);
- bio_endio(bio);
+ /*
+ * Either full stripe write, or we have every data sector already
+ * cached, can go to write path immediately.
+ */
+ if (rbio_is_full(rbio) || !need_read_stripe_sectors(rbio))
+ goto write;
+
+ /*
+ * Now we're doing sub-stripe write, also need all data stripes to do
+ * the full RMW.
+ */
+ ret = alloc_rbio_data_pages(rbio);
+ if (ret < 0)
+ return ret;
+
+ index_rbio_pages(rbio);
+
+ ret = rmw_read_wait_recover(rbio);
+ if (ret < 0)
+ return ret;
+
+write:
+ /*
+ * At this stage we're not allowed to add any new bios to the
+ * bio list any more, anyone else that wants to change this stripe
+ * needs to do their own rmw.
+ */
+ spin_lock_irq(&rbio->bio_list_lock);
+ set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+ spin_unlock_irq(&rbio->bio_list_lock);
+
+ bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
+
+ index_rbio_pages(rbio);
+
+ /*
+ * We don't cache full rbios because we're assuming
+ * the higher layers are unlikely to use this area of
+ * the disk again soon. If they do use it again,
+ * hopefully they will send another full bio.
+ */
+ if (!rbio_is_full(rbio))
+ cache_rbio_pages(rbio);
+ else
+ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
+ generate_pq_vertical(rbio, sectornr);
+
+ bio_list_init(&bio_list);
+ ret = rmw_assemble_write_bios(rbio, &bio_list);
+ if (ret < 0)
+ return ret;
+
+ /* We should have at least one bio assembled. */
+ ASSERT(bio_list_size(&bio_list));
+ submit_write_bios(rbio, &bio_list);
+ wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+
+ /* We may have more errors than our tolerance during the read. */
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ int found_errors;
+
+ found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
+ if (found_errors > rbio->bioc->max_errors) {
+ ret = -EIO;
+ break;
+ }
+ }
+ return ret;
}
-static void rmw_work(struct work_struct *work)
+static void rmw_rbio_work(struct work_struct *work)
{
struct btrfs_raid_bio *rbio;
+ int ret;
rbio = container_of(work, struct btrfs_raid_bio, work);
- raid56_rmw_stripe(rbio);
+
+ ret = lock_stripe_add(rbio);
+ if (ret == 0) {
+ ret = rmw_rbio(rbio);
+ rbio_orig_end_io(rbio, errno_to_blk_status(ret));
+ }
}
-static void read_rebuild_work(struct work_struct *work)
+static void rmw_rbio_work_locked(struct work_struct *work)
{
struct btrfs_raid_bio *rbio;
+ int ret;
rbio = container_of(work, struct btrfs_raid_bio, work);
- __raid56_parity_recover(rbio);
+
+ ret = rmw_rbio(rbio);
+ rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}
/*
@@ -2326,13 +2437,6 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
ASSERT(i < rbio->real_stripes);
bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
-
- /*
- * We have already increased bio_counter when getting bioc, record it
- * so we can free it at rbio_orig_end_io().
- */
- rbio->generic_bio_cnt = 1;
-
return rbio;
}
@@ -2381,8 +2485,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
return 0;
}
-static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
- int need_check)
+static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check)
{
struct btrfs_io_context *bioc = rbio->bioc;
const u32 sectorsize = bioc->fs_info->sectorsize;
@@ -2425,7 +2528,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
p_sector.page = alloc_page(GFP_NOFS);
if (!p_sector.page)
- goto cleanup;
+ return -ENOMEM;
p_sector.pgoff = 0;
p_sector.uptodate = 1;
@@ -2435,14 +2538,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
if (!q_sector.page) {
__free_page(p_sector.page);
p_sector.page = NULL;
- goto cleanup;
+ return -ENOMEM;
}
q_sector.pgoff = 0;
q_sector.uptodate = 1;
pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
}
- atomic_set(&rbio->error, 0);
+ bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
/* Map the parity stripe just once */
pointers[nr_data] = kmap_local_page(p_sector.page);
@@ -2522,33 +2625,13 @@ writeback:
}
submit_write:
- nr_data = bio_list_size(&bio_list);
- if (!nr_data) {
- /* Every parity is right */
- rbio_orig_end_io(rbio, BLK_STS_OK);
- return;
- }
-
- atomic_set(&rbio->stripes_pending, nr_data);
-
- while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid_write_end_io;
-
- if (trace_raid56_scrub_write_stripe_enabled()) {
- struct raid56_bio_trace_info trace_info = { 0 };
-
- bio_get_trace_info(rbio, bio, &trace_info);
- trace_raid56_scrub_write_stripe(rbio, bio, &trace_info);
- }
- submit_bio(bio);
- }
- return;
+ submit_write_bios(rbio, &bio_list);
+ return 0;
cleanup:
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
-
while ((bio = bio_list_pop(&bio_list)))
bio_put(bio);
+ return ret;
}
static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
@@ -2558,102 +2641,99 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
return 0;
}
-/*
- * While we're doing the parity check and repair, we could have errors
- * in reading pages off the disk. This checks for errors and if we're
- * not able to read the page it'll trigger parity reconstruction. The
- * parity scrub will be finished after we've reconstructed the failed
- * stripes
- */
-static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
+static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
{
- if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
- goto cleanup;
+ void **pointers = NULL;
+ void **unmap_array = NULL;
+ int sector_nr;
+ int ret;
- if (rbio->faila >= 0 || rbio->failb >= 0) {
+ /*
+ * @pointers array stores the pointer for each sector.
+ *
+ * @unmap_array stores copy of pointers that does not get reordered
+ * during reconstruction so that kunmap_local works.
+ */
+ pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+ unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+ if (!pointers || !unmap_array) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
int dfail = 0, failp = -1;
+ int faila;
+ int failb;
+ int found_errors;
+
+ found_errors = get_rbio_veritical_errors(rbio, sector_nr,
+ &faila, &failb);
+ if (found_errors > rbio->bioc->max_errors) {
+ ret = -EIO;
+ goto out;
+ }
+ if (found_errors == 0)
+ continue;
- if (is_data_stripe(rbio, rbio->faila))
- dfail++;
- else if (is_parity_stripe(rbio->faila))
- failp = rbio->faila;
+ /* We should have at least one error here. */
+ ASSERT(faila >= 0 || failb >= 0);
- if (is_data_stripe(rbio, rbio->failb))
+ if (is_data_stripe(rbio, faila))
dfail++;
- else if (is_parity_stripe(rbio->failb))
- failp = rbio->failb;
+ else if (is_parity_stripe(faila))
+ failp = faila;
+ if (is_data_stripe(rbio, failb))
+ dfail++;
+ else if (is_parity_stripe(failb))
+ failp = failb;
/*
- * Because we can not use a scrubbing parity to repair
- * the data, so the capability of the repair is declined.
- * (In the case of RAID5, we can not repair anything)
+ * Because we can not use a scrubbing parity to repair the
+ * data, so the capability of the repair is declined. (In the
+ * case of RAID5, we can not repair anything.)
*/
- if (dfail > rbio->bioc->max_errors - 1)
- goto cleanup;
-
+ if (dfail > rbio->bioc->max_errors - 1) {
+ ret = -EIO;
+ goto out;
+ }
/*
- * If all data is good, only parity is correctly, just
- * repair the parity.
+ * If all data is good, only parity is correctly, just repair
+ * the parity, no need to recover data stripes.
*/
- if (dfail == 0) {
- finish_parity_scrub(rbio, 0);
- return;
- }
+ if (dfail == 0)
+ continue;
/*
* Here means we got one corrupted data stripe and one
- * corrupted parity on RAID6, if the corrupted parity
- * is scrubbing parity, luckily, use the other one to repair
- * the data, or we can not repair the data stripe.
+ * corrupted parity on RAID6, if the corrupted parity is
+ * scrubbing parity, luckily, use the other one to repair the
+ * data, or we can not repair the data stripe.
*/
- if (failp != rbio->scrubp)
- goto cleanup;
+ if (failp != rbio->scrubp) {
+ ret = -EIO;
+ goto out;
+ }
- __raid_recover_end_io(rbio);
- } else {
- finish_parity_scrub(rbio, 1);
+ ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
+ if (ret < 0)
+ goto out;
}
- return;
-
-cleanup:
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
-}
-
-/*
- * end io for the read phase of the rmw cycle. All the bios here are physical
- * stripe bios we've read from the disk so we can recalculate the parity of the
- * stripe.
- *
- * This will usually kick off finish_rmw once all the bios are read in, but it
- * may trigger parity reconstruction if we had any errors along the way
- */
-static void raid56_parity_scrub_end_io_work(struct work_struct *work)
-{
- struct btrfs_raid_bio *rbio =
- container_of(work, struct btrfs_raid_bio, end_io_work);
-
- /*
- * This will normally call finish_rmw to start our write, but if there
- * are any failed stripes we'll reconstruct from parity first
- */
- validate_rbio_for_parity_scrub(rbio);
+out:
+ kfree(pointers);
+ kfree(unmap_array);
+ return ret;
}
-static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
+static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list)
{
- int bios_to_read = 0;
- struct bio_list bio_list;
- int ret;
- int total_sector_nr;
struct bio *bio;
+ int total_sector_nr;
+ int ret = 0;
- bio_list_init(&bio_list);
-
- ret = alloc_rbio_essential_pages(rbio);
- if (ret)
- goto cleanup;
+ ASSERT(bio_list_size(bio_list) == 0);
- atomic_set(&rbio->error, 0);
/* Build a list of bios to read all the missing parts. */
for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
total_sector_nr++) {
@@ -2682,67 +2762,84 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
if (sector->uptodate)
continue;
- ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
sectornr, REQ_OP_READ);
if (ret)
- goto cleanup;
+ goto error;
}
+ return 0;
+error:
+ while ((bio = bio_list_pop(bio_list)))
+ bio_put(bio);
+ return ret;
+}
- bios_to_read = bio_list_size(&bio_list);
- if (!bios_to_read) {
- /*
- * this can happen if others have merged with
- * us, it means there is nothing left to read.
- * But if there are missing devices it may not be
- * safe to do the full stripe write yet.
- */
- goto finish;
- }
+static int scrub_rbio(struct btrfs_raid_bio *rbio)
+{
+ bool need_check = false;
+ struct bio_list bio_list;
+ int sector_nr;
+ int ret;
+ struct bio *bio;
- /*
- * The bioc may be freed once we submit the last bio. Make sure not to
- * touch it after that.
- */
- atomic_set(&rbio->stripes_pending, bios_to_read);
- INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
- while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid56_bio_end_io;
+ bio_list_init(&bio_list);
- if (trace_raid56_scrub_read_enabled()) {
- struct raid56_bio_trace_info trace_info = { 0 };
+ ret = alloc_rbio_essential_pages(rbio);
+ if (ret)
+ goto cleanup;
- bio_get_trace_info(rbio, bio, &trace_info);
- trace_raid56_scrub_read(rbio, bio, &trace_info);
+ bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
+
+ ret = scrub_assemble_read_bios(rbio, &bio_list);
+ if (ret < 0)
+ goto cleanup;
+
+ submit_read_bios(rbio, &bio_list);
+ wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+
+ /* We may have some failures, recover the failed sectors first. */
+ ret = recover_scrub_rbio(rbio);
+ if (ret < 0)
+ goto cleanup;
+
+ /*
+ * We have every sector properly prepared. Can finish the scrub
+ * and writeback the good content.
+ */
+ ret = finish_parity_scrub(rbio, need_check);
+ wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+ for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
+ int found_errors;
+
+ found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL);
+ if (found_errors > rbio->bioc->max_errors) {
+ ret = -EIO;
+ break;
}
- submit_bio(bio);
}
- /* the actual write will happen once the reads are done */
- return;
+ return ret;
cleanup:
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
-
while ((bio = bio_list_pop(&bio_list)))
bio_put(bio);
- return;
-
-finish:
- validate_rbio_for_parity_scrub(rbio);
+ return ret;
}
-static void scrub_parity_work(struct work_struct *work)
+static void scrub_rbio_work_locked(struct work_struct *work)
{
struct btrfs_raid_bio *rbio;
+ int ret;
rbio = container_of(work, struct btrfs_raid_bio, work);
- raid56_parity_scrub_stripe(rbio);
+ ret = scrub_rbio(rbio);
+ rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{
if (!lock_stripe_add(rbio))
- start_async_work(rbio, scrub_parity_work);
+ start_async_work(rbio, scrub_rbio_work_locked);
}
/* The following code is used for dev replace of a missing RAID 5/6 device. */
@@ -2765,24 +2862,12 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc)
*/
ASSERT(!bio->bi_iter.bi_size);
- rbio->faila = find_logical_bio_stripe(rbio, bio);
- if (rbio->faila == -1) {
- BUG();
- kfree(rbio);
- return NULL;
- }
-
- /*
- * When we get bioc, we have already increased bio_counter, record it
- * so we can free it at rbio_orig_end_io()
- */
- rbio->generic_bio_cnt = 1;
+ set_rbio_range_error(rbio, bio);
return rbio;
}
void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
{
- if (!lock_stripe_add(rbio))
- start_async_work(rbio, read_rebuild_work);
+ start_async_work(rbio, recover_rbio_work);
}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 6f48f9e4c869..7c73a443939e 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -74,12 +74,6 @@ struct btrfs_raid_bio {
/* How many sectors there are for each stripe */
u8 stripe_nsectors;
- /* First bad stripe, -1 means no corruption */
- s8 faila;
-
- /* Second bad stripe (for RAID6 use) */
- s8 failb;
-
/* Stripe number that we're scrubbing */
u8 scrubp;
@@ -89,15 +83,11 @@ struct btrfs_raid_bio {
*/
int bio_list_bytes;
- int generic_bio_cnt;
-
refcount_t refs;
atomic_t stripes_pending;
- atomic_t error;
-
- struct work_struct end_io_work;
+ wait_queue_head_t io_wait;
/* Bitmap to record which horizontal stripe has data */
unsigned long dbitmap;
@@ -128,6 +118,29 @@ struct btrfs_raid_bio {
/* Allocated with real_stripes-many pointers for finish_*() calls */
void **finish_pointers;
+
+ /*
+ * The bitmap recording where IO errors happened.
+ * Each bit is corresponding to one sector in either bio_sectors[] or
+ * stripe_sectors[] array.
+ *
+ * The reason we don't use another bit in sector_ptr is, we have two
+ * arrays of sectors, and a lot of IO can use sectors in both arrays.
+ * Thus making it much harder to iterate.
+ */
+ unsigned long *error_bitmap;
+
+ /*
+ * Checksum buffer if the rbio is for data. The buffer should cover
+ * all data sectors (exlcuding P/Q sectors).
+ */
+ u8 *csum_buf;
+
+ /*
+ * Each bit represents if the corresponding sector has data csum found.
+ * Should only cover data sectors (excluding P/Q sectors).
+ */
+ unsigned long *csum_bitmap;
};
/*
@@ -166,7 +179,7 @@ static inline int nr_data_stripes(const struct map_lookup *map)
struct btrfs_device;
void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
- int mirror_num, bool generic_io);
+ int mirror_num);
void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc);
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h
index 5c1a617eb25d..5c2b66d155ef 100644
--- a/fs/btrfs/rcu-string.h
+++ b/fs/btrfs/rcu-string.h
@@ -18,7 +18,11 @@ static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask)
(len * sizeof(char)), mask);
if (!ret)
return ret;
- strncpy(ret->str, src, len);
+ /* Warn if the source got unexpectedly truncated. */
+ if (WARN_ON(strscpy(ret->str, src, len) < 0)) {
+ kfree(ret);
+ return NULL;
+ }
return ret;
}
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index a248f46cfe72..95d28497de7c 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -5,11 +5,14 @@
#include <linux/sched.h>
#include <linux/stacktrace.h>
+#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "locking.h"
#include "delayed-ref.h"
#include "ref-verify.h"
+#include "fs.h"
+#include "accessors.h"
/*
* Used to keep track the roots and number of refs each root has for a given
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 9acf47b11fe6..0474bbe39da7 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -2,13 +2,19 @@
#include <linux/blkdev.h>
#include <linux/iversion.h>
-#include "compression.h"
#include "ctree.h"
+#include "fs.h"
+#include "messages.h"
+#include "compression.h"
#include "delalloc-space.h"
#include "disk-io.h"
#include "reflink.h"
#include "transaction.h"
#include "subpage.h"
+#include "accessors.h"
+#include "file-item.h"
+#include "file.h"
+#include "super.h"
#define BTRFS_MAX_DEDUPE_LEN SZ_16M
@@ -92,7 +98,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
clear_extent_bit(&inode->io_tree, file_offset, range_end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, NULL);
+ NULL);
ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
if (ret)
goto out_unlock;
@@ -318,16 +324,16 @@ copy_to_page:
goto out;
}
-/**
- * btrfs_clone() - clone a range from inode file to another
+/*
+ * Clone a range from inode file to another.
*
- * @src: Inode to clone from
- * @inode: Inode to clone to
- * @off: Offset within source to start clone from
- * @olen: Original length, passed by user, of range to clone
- * @olen_aligned: Block-aligned value of olen
- * @destoff: Offset within @inode to start clone
- * @no_time_update: Whether to update mtime/ctime on the target inode
+ * @src: Inode to clone from
+ * @inode: Inode to clone to
+ * @off: Offset within source to start clone from
+ * @olen: Original length, passed by user, of range to clone
+ * @olen_aligned: Block-aligned value of olen
+ * @destoff: Offset within @inode to start clone
+ * @no_time_update: Whether to update mtime/ctime on the target inode
*/
static int btrfs_clone(struct inode *src, struct inode *inode,
const u64 off, const u64 olen, const u64 olen_aligned,
@@ -615,8 +621,8 @@ out:
static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
struct inode *inode2, u64 loff2, u64 len)
{
- unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
- unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+ unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1, NULL);
+ unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1, NULL);
}
static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
@@ -634,8 +640,8 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
swap(range1_end, range2_end);
}
- lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end);
- lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end);
+ lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end, NULL);
+ lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end, NULL);
btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end);
btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end);
@@ -887,7 +893,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
return -EINVAL;
if (same_inode) {
- btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_lock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP);
} else {
lock_two_nondirectories(src_inode, dst_inode);
btrfs_double_mmap_lock(src_inode, dst_inode);
@@ -905,7 +911,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
out_unlock:
if (same_inode) {
- btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP);
} else {
btrfs_double_mmap_unlock(src_inode, dst_inode);
unlock_two_nondirectories(src_inode, dst_inode);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 45c02aba2492..31ec4a7658ce 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -27,6 +27,15 @@
#include "subpage.h"
#include "zoned.h"
#include "inode-item.h"
+#include "space-info.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "root-tree.h"
+#include "file-item.h"
+#include "relocation.h"
+#include "super.h"
+#include "tree-checker.h"
/*
* Relocation overview
@@ -470,7 +479,7 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
int ret;
int err = 0;
- iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS);
+ iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info);
if (!iter)
return ERR_PTR(-ENOMEM);
path = btrfs_alloc_path();
@@ -1109,10 +1118,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
inode = find_next_inode(root, key.objectid);
first = 0;
} else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
- btrfs_add_delayed_iput(inode);
+ btrfs_add_delayed_iput(BTRFS_I(inode));
inode = find_next_inode(root, key.objectid);
}
if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
+ struct extent_state *cached_state = NULL;
+
end = key.offset +
btrfs_file_extent_num_bytes(leaf, fi);
WARN_ON(!IS_ALIGNED(key.offset,
@@ -1120,14 +1131,15 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
end--;
ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
- key.offset, end);
+ key.offset, end,
+ &cached_state);
if (!ret)
continue;
- btrfs_drop_extent_cache(BTRFS_I(inode),
- key.offset, end, 1);
+ btrfs_drop_extent_map_range(BTRFS_I(inode),
+ key.offset, end, true);
unlock_extent(&BTRFS_I(inode)->io_tree,
- key.offset, end);
+ key.offset, end, &cached_state);
}
}
@@ -1170,7 +1182,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
if (dirty)
btrfs_mark_buffer_dirty(leaf);
if (inode)
- btrfs_add_delayed_iput(inode);
+ btrfs_add_delayed_iput(BTRFS_I(inode));
return ret;
}
@@ -1516,6 +1528,8 @@ static int invalidate_extent_cache(struct btrfs_root *root,
objectid = min_key->objectid;
while (1) {
+ struct extent_state *cached_state = NULL;
+
cond_resched();
iput(inode);
@@ -1566,9 +1580,9 @@ static int invalidate_extent_cache(struct btrfs_root *root,
}
/* the lock_extent waits for read_folio to complete */
- lock_extent(&BTRFS_I(inode)->io_tree, start, end);
- btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1);
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
+ btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
}
return 0;
}
@@ -2597,10 +2611,14 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
static int get_tree_block_key(struct btrfs_fs_info *fs_info,
struct tree_block *block)
{
+ struct btrfs_tree_parent_check check = {
+ .level = block->level,
+ .owner_root = block->owner,
+ .transid = block->key.offset
+ };
struct extent_buffer *eb;
- eb = read_tree_block(fs_info, block->bytenr, block->owner,
- block->key.offset, block->level, NULL);
+ eb = read_tree_block(fs_info, block->bytenr, &check);
if (IS_ERR(eb))
return PTR_ERR(eb);
if (!extent_buffer_uptodate(eb)) {
@@ -2861,25 +2879,27 @@ static noinline_for_stack int prealloc_file_extent_cluster(
if (ret)
return ret;
- btrfs_inode_lock(&inode->vfs_inode, 0);
+ btrfs_inode_lock(inode, 0);
for (nr = 0; nr < cluster->nr; nr++) {
+ struct extent_state *cached_state = NULL;
+
start = cluster->boundary[nr] - offset;
if (nr + 1 < cluster->nr)
end = cluster->boundary[nr + 1] - 1 - offset;
else
end = cluster->end - offset;
- lock_extent(&inode->io_tree, start, end);
+ lock_extent(&inode->io_tree, start, end, &cached_state);
num_bytes = end + 1 - start;
ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
num_bytes, num_bytes,
end + 1, &alloc_hint);
cur_offset = end + 1;
- unlock_extent(&inode->io_tree, start, end);
+ unlock_extent(&inode->io_tree, start, end, &cached_state);
if (ret)
break;
}
- btrfs_inode_unlock(&inode->vfs_inode, 0);
+ btrfs_inode_unlock(inode, 0);
if (cur_offset < prealloc_end)
btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
@@ -2890,8 +2910,8 @@ static noinline_for_stack int prealloc_file_extent_cluster(
static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode,
u64 start, u64 end, u64 block_start)
{
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em;
+ struct extent_state *cached_state = NULL;
int ret = 0;
em = alloc_extent_map();
@@ -2904,18 +2924,11 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod
em->block_start = block_start;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
- lock_extent(&BTRFS_I(inode)->io_tree, start, end);
- while (1) {
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
- write_unlock(&em_tree->lock);
- if (ret != -EEXIST) {
- free_extent_map(em);
- break;
- }
- btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
- }
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
+ ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
+ free_extent_map(em);
+
return ret;
}
@@ -2991,6 +3004,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
*/
cur = max(page_start, cluster->boundary[*cluster_nr] - offset);
while (cur <= page_end) {
+ struct extent_state *cached_state = NULL;
u64 extent_start = cluster->boundary[*cluster_nr] - offset;
u64 extent_end = get_cluster_boundary_end(cluster,
*cluster_nr) - offset;
@@ -3006,13 +3020,15 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
goto release_page;
/* Mark the range delalloc and dirty for later writeback */
- lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end);
+ lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
+ &cached_state);
ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
- clamped_end, 0, NULL);
+ clamped_end, 0, &cached_state);
if (ret) {
- clear_extent_bits(&BTRFS_I(inode)->io_tree,
- clamped_start, clamped_end,
- EXTENT_LOCKED | EXTENT_BOUNDARY);
+ clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ clamped_start, clamped_end,
+ EXTENT_LOCKED | EXTENT_BOUNDARY,
+ &cached_state);
btrfs_delalloc_release_metadata(BTRFS_I(inode),
clamped_len, true);
btrfs_delalloc_release_extents(BTRFS_I(inode),
@@ -3039,7 +3055,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
boundary_start, boundary_end,
EXTENT_BOUNDARY);
}
- unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end);
+ unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
+ &cached_state);
btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len);
cur += clamped_len;
@@ -3396,24 +3413,28 @@ int add_data_references(struct reloc_control *rc,
struct btrfs_path *path,
struct rb_root *blocks)
{
- struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
- struct ulist *leaves = NULL;
+ struct btrfs_backref_walk_ctx ctx = { 0 };
struct ulist_iterator leaf_uiter;
struct ulist_node *ref_node = NULL;
- const u32 blocksize = fs_info->nodesize;
+ const u32 blocksize = rc->extent_root->fs_info->nodesize;
int ret = 0;
btrfs_release_path(path);
- ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid,
- 0, &leaves, NULL, true);
+
+ ctx.bytenr = extent_key->objectid;
+ ctx.ignore_extent_item_pos = true;
+ ctx.fs_info = rc->extent_root->fs_info;
+
+ ret = btrfs_find_all_leafs(&ctx);
if (ret < 0)
return ret;
ULIST_ITER_INIT(&leaf_uiter);
- while ((ref_node = ulist_next(leaves, &leaf_uiter))) {
+ while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) {
+ struct btrfs_tree_parent_check check = { 0 };
struct extent_buffer *eb;
- eb = read_tree_block(fs_info, ref_node->val, 0, 0, 0, NULL);
+ eb = read_tree_block(ctx.fs_info, ref_node->val, &check);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
break;
@@ -3429,7 +3450,7 @@ int add_data_references(struct reloc_control *rc,
}
if (ret < 0)
free_block_list(blocks);
- ulist_free(leaves);
+ ulist_free(ctx.refs);
return ret;
}
@@ -3913,8 +3934,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&rc->dirty_subvol_roots);
btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1);
mapping_tree_init(&rc->reloc_root_tree);
- extent_io_tree_init(fs_info, &rc->processed_blocks,
- IO_TREE_RELOC_BLOCKS, NULL);
+ extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS);
return rc;
}
@@ -4338,8 +4358,8 @@ int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
disk_bytenr = file_pos + inode->index_cnt;
csum_root = btrfs_csum_root(fs_info, disk_bytenr);
- ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
- disk_bytenr + len - 1, &list, 0);
+ ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
+ disk_bytenr + len - 1, &list, 0, false);
if (ret)
goto out;
diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
new file mode 100644
index 000000000000..2041a86186de
--- /dev/null
+++ b/fs/btrfs/relocation.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_RELOCATION_H
+#define BTRFS_RELOCATION_H
+
+int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start);
+int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_recover_relocation(struct btrfs_fs_info *fs_info);
+int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len);
+int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf,
+ struct extent_buffer *cow);
+void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
+ u64 *bytes_to_reserve);
+int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending);
+int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info);
+struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr);
+int btrfs_should_ignore_reloc_root(struct btrfs_root *root);
+
+#endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index d647cb2938c0..859874579456 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -6,11 +6,16 @@
#include <linux/err.h>
#include <linux/uuid.h>
#include "ctree.h"
+#include "fs.h"
+#include "messages.h"
#include "transaction.h"
#include "disk-io.h"
#include "print-tree.h"
#include "qgroup.h"
#include "space-info.h"
+#include "accessors.h"
+#include "root-tree.h"
+#include "orphan.h"
/*
* Read a root item from the tree. In case we detect a root item smaller then
@@ -327,9 +332,8 @@ out:
}
int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
- u64 ref_id, u64 dirid, u64 *sequence, const char *name,
- int name_len)
-
+ u64 ref_id, u64 dirid, u64 *sequence,
+ const struct fscrypt_str *name)
{
struct btrfs_root *tree_root = trans->fs_info->tree_root;
struct btrfs_path *path;
@@ -337,7 +341,6 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
struct extent_buffer *leaf;
struct btrfs_key key;
unsigned long ptr;
- int err = 0;
int ret;
path = btrfs_alloc_path();
@@ -350,7 +353,6 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
if (ret < 0) {
- err = ret;
goto out;
} else if (ret == 0) {
leaf = path->nodes[0];
@@ -358,20 +360,20 @@ again:
struct btrfs_root_ref);
ptr = (unsigned long)(ref + 1);
if ((btrfs_root_ref_dirid(leaf, ref) != dirid) ||
- (btrfs_root_ref_name_len(leaf, ref) != name_len) ||
- memcmp_extent_buffer(leaf, name, ptr, name_len)) {
- err = -ENOENT;
+ (btrfs_root_ref_name_len(leaf, ref) != name->len) ||
+ memcmp_extent_buffer(leaf, name->name, ptr, name->len)) {
+ ret = -ENOENT;
goto out;
}
*sequence = btrfs_root_ref_sequence(leaf, ref);
ret = btrfs_del_item(trans, tree_root, path);
- if (ret) {
- err = ret;
+ if (ret)
goto out;
- }
- } else
- err = -ENOENT;
+ } else {
+ ret = -ENOENT;
+ goto out;
+ }
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
btrfs_release_path(path);
@@ -383,7 +385,7 @@ again:
out:
btrfs_free_path(path);
- return err;
+ return ret;
}
/*
@@ -402,8 +404,8 @@ out:
* Will return 0, -ENOMEM, or anything from the CoW path
*/
int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
- u64 ref_id, u64 dirid, u64 sequence, const char *name,
- int name_len)
+ u64 ref_id, u64 dirid, u64 sequence,
+ const struct fscrypt_str *name)
{
struct btrfs_root *tree_root = trans->fs_info->tree_root;
struct btrfs_key key;
@@ -422,7 +424,7 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
key.offset = ref_id;
again:
ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
- sizeof(*ref) + name_len);
+ sizeof(*ref) + name->len);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_free_path(path);
@@ -433,9 +435,9 @@ again:
ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
btrfs_set_root_ref_dirid(leaf, ref, dirid);
btrfs_set_root_ref_sequence(leaf, ref, sequence);
- btrfs_set_root_ref_name_len(leaf, ref, name_len);
+ btrfs_set_root_ref_name_len(leaf, ref, name->len);
ptr = (unsigned long)(ref + 1);
- write_extent_buffer(leaf, name, ptr, name_len);
+ write_extent_buffer(leaf, name->name, ptr, name->len);
btrfs_mark_buffer_dirty(leaf);
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h
new file mode 100644
index 000000000000..cbbaca32126e
--- /dev/null
+++ b/fs/btrfs/root-tree.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_ROOT_TREE_H
+#define BTRFS_ROOT_TREE_H
+
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv,
+ int nitems, bool use_global_rsv);
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv);
+int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
+ u64 ref_id, u64 dirid, u64 sequence,
+ const struct fscrypt_str *name);
+int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
+ u64 ref_id, u64 dirid, u64 *sequence,
+ const struct fscrypt_str *name);
+int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key);
+int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ const struct btrfs_key *key,
+ struct btrfs_root_item *item);
+int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_key *key,
+ struct btrfs_root_item *item);
+int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key,
+ struct btrfs_path *path, struct btrfs_root_item *root_item,
+ struct btrfs_key *root_key);
+int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info);
+void btrfs_set_root_node(struct btrfs_root_item *item,
+ struct extent_buffer *node);
+void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
+void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+
+#endif
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 3afe5fa50a63..52b346795f66 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -17,10 +17,13 @@
#include "extent_io.h"
#include "dev-replace.h"
#include "check-integrity.h"
-#include "rcu-string.h"
#include "raid56.h"
#include "block-group.h"
#include "zoned.h"
+#include "fs.h"
+#include "accessors.h"
+#include "file-item.h"
+#include "scrub.h"
/*
* This is only the first step towards a full-features scrub. It reads all
@@ -54,6 +57,19 @@ struct scrub_ctx;
*/
#define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
+#define SCRUB_MAX_PAGES (DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE))
+
+/*
+ * Maximum number of mirrors that can be available for all profiles counting
+ * the target device of dev-replace as one. During an active device replace
+ * procedure, the target device of the copy operation is a mirror for the
+ * filesystem data as well that can be used to read data in order to repair
+ * read errors on other disks.
+ *
+ * Current value is derived from RAID1C4 with 4 copies.
+ */
+#define BTRFS_MAX_MIRRORS (4 + 1)
+
struct scrub_recover {
refcount_t refs;
struct btrfs_io_context *bioc;
@@ -62,16 +78,12 @@ struct scrub_recover {
struct scrub_sector {
struct scrub_block *sblock;
- struct page *page;
- struct btrfs_device *dev;
struct list_head list;
u64 flags; /* extent flags */
u64 generation;
- u64 logical;
- u64 physical;
- u64 physical_for_dev_replace;
+ /* Offset in bytes to @sblock. */
+ u32 offset;
atomic_t refs;
- u8 mirror_num;
unsigned int have_csum:1;
unsigned int io_error:1;
u8 csum[BTRFS_CSUM_SIZE];
@@ -94,8 +106,22 @@ struct scrub_bio {
};
struct scrub_block {
+ /*
+ * Each page will have its page::private used to record the logical
+ * bytenr.
+ */
+ struct page *pages[SCRUB_MAX_PAGES];
struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
+ struct btrfs_device *dev;
+ /* Logical bytenr of the sblock */
+ u64 logical;
+ u64 physical;
+ u64 physical_for_dev_replace;
+ /* Length of sblock in bytes */
+ u32 len;
int sector_count;
+ int mirror_num;
+
atomic_t outstanding_sectors;
refcount_t refs; /* free mem on transition to zero */
struct scrub_ctx *sctx;
@@ -202,8 +228,174 @@ struct full_stripe_lock {
struct mutex mutex;
};
+#ifndef CONFIG_64BIT
+/* This structure is for archtectures whose (void *) is smaller than u64 */
+struct scrub_page_private {
+ u64 logical;
+};
+#endif
+
+static int attach_scrub_page_private(struct page *page, u64 logical)
+{
+#ifdef CONFIG_64BIT
+ attach_page_private(page, (void *)logical);
+ return 0;
+#else
+ struct scrub_page_private *spp;
+
+ spp = kmalloc(sizeof(*spp), GFP_KERNEL);
+ if (!spp)
+ return -ENOMEM;
+ spp->logical = logical;
+ attach_page_private(page, (void *)spp);
+ return 0;
+#endif
+}
+
+static void detach_scrub_page_private(struct page *page)
+{
+#ifdef CONFIG_64BIT
+ detach_page_private(page);
+ return;
+#else
+ struct scrub_page_private *spp;
+
+ spp = detach_page_private(page);
+ kfree(spp);
+ return;
+#endif
+}
+
+static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx,
+ struct btrfs_device *dev,
+ u64 logical, u64 physical,
+ u64 physical_for_dev_replace,
+ int mirror_num)
+{
+ struct scrub_block *sblock;
+
+ sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
+ if (!sblock)
+ return NULL;
+ refcount_set(&sblock->refs, 1);
+ sblock->sctx = sctx;
+ sblock->logical = logical;
+ sblock->physical = physical;
+ sblock->physical_for_dev_replace = physical_for_dev_replace;
+ sblock->dev = dev;
+ sblock->mirror_num = mirror_num;
+ sblock->no_io_error_seen = 1;
+ /*
+ * Scrub_block::pages will be allocated at alloc_scrub_sector() when
+ * the corresponding page is not allocated.
+ */
+ return sblock;
+}
+
+/*
+ * Allocate a new scrub sector and attach it to @sblock.
+ *
+ * Will also allocate new pages for @sblock if needed.
+ */
+static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock,
+ u64 logical)
+{
+ const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT;
+ struct scrub_sector *ssector;
+
+ /* We must never have scrub_block exceed U32_MAX in size. */
+ ASSERT(logical - sblock->logical < U32_MAX);
+
+ ssector = kzalloc(sizeof(*ssector), GFP_KERNEL);
+ if (!ssector)
+ return NULL;
+
+ /* Allocate a new page if the slot is not allocated */
+ if (!sblock->pages[page_index]) {
+ int ret;
+
+ sblock->pages[page_index] = alloc_page(GFP_KERNEL);
+ if (!sblock->pages[page_index]) {
+ kfree(ssector);
+ return NULL;
+ }
+ ret = attach_scrub_page_private(sblock->pages[page_index],
+ sblock->logical + (page_index << PAGE_SHIFT));
+ if (ret < 0) {
+ kfree(ssector);
+ __free_page(sblock->pages[page_index]);
+ sblock->pages[page_index] = NULL;
+ return NULL;
+ }
+ }
+
+ atomic_set(&ssector->refs, 1);
+ ssector->sblock = sblock;
+ /* The sector to be added should not be used */
+ ASSERT(sblock->sectors[sblock->sector_count] == NULL);
+ ssector->offset = logical - sblock->logical;
+
+ /* The sector count must be smaller than the limit */
+ ASSERT(sblock->sector_count < SCRUB_MAX_SECTORS_PER_BLOCK);
+
+ sblock->sectors[sblock->sector_count] = ssector;
+ sblock->sector_count++;
+ sblock->len += sblock->sctx->fs_info->sectorsize;
+
+ return ssector;
+}
+
+static struct page *scrub_sector_get_page(struct scrub_sector *ssector)
+{
+ struct scrub_block *sblock = ssector->sblock;
+ pgoff_t index;
+ /*
+ * When calling this function, ssector must be alreaday attached to the
+ * parent sblock.
+ */
+ ASSERT(sblock);
+
+ /* The range should be inside the sblock range */
+ ASSERT(ssector->offset < sblock->len);
+
+ index = ssector->offset >> PAGE_SHIFT;
+ ASSERT(index < SCRUB_MAX_PAGES);
+ ASSERT(sblock->pages[index]);
+ ASSERT(PagePrivate(sblock->pages[index]));
+ return sblock->pages[index];
+}
+
+static unsigned int scrub_sector_get_page_offset(struct scrub_sector *ssector)
+{
+ struct scrub_block *sblock = ssector->sblock;
+
+ /*
+ * When calling this function, ssector must be already attached to the
+ * parent sblock.
+ */
+ ASSERT(sblock);
+
+ /* The range should be inside the sblock range */
+ ASSERT(ssector->offset < sblock->len);
+
+ return offset_in_page(ssector->offset);
+}
+
+static char *scrub_sector_get_kaddr(struct scrub_sector *ssector)
+{
+ return page_address(scrub_sector_get_page(ssector)) +
+ scrub_sector_get_page_offset(ssector);
+}
+
+static int bio_add_scrub_sector(struct bio *bio, struct scrub_sector *ssector,
+ unsigned int len)
+{
+ return bio_add_page(bio, scrub_sector_get_page(ssector), len,
+ scrub_sector_get_page_offset(ssector));
+}
+
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
- struct scrub_block *sblocks_for_recheck);
+ struct scrub_block *sblocks_for_recheck[]);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock,
int retry_failed_mirror);
@@ -533,10 +725,8 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
if (sctx->curr != -1) {
struct scrub_bio *sbio = sctx->bios[sctx->curr];
- for (i = 0; i < sbio->sector_count; i++) {
- WARN_ON(!sbio->sectors[i]->page);
+ for (i = 0; i < sbio->sector_count; i++)
scrub_block_put(sbio->sectors[i]->sblock);
- }
bio_put(sbio->bio);
}
@@ -618,8 +808,8 @@ nomem:
return ERR_PTR(-ENOMEM);
}
-static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
- void *warn_ctx)
+static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
+ u64 root, void *warn_ctx)
{
u32 nlink;
int ret;
@@ -686,7 +876,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
swarn->errstr, swarn->logical,
- rcu_str_deref(swarn->dev->name),
+ btrfs_dev_name(swarn->dev),
swarn->physical,
root, inum, offset,
fs_info->sectorsize, nlink,
@@ -700,7 +890,7 @@ err:
btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
swarn->errstr, swarn->logical,
- rcu_str_deref(swarn->dev->name),
+ btrfs_dev_name(swarn->dev),
swarn->physical,
root, inum, offset, ret);
@@ -718,7 +908,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
struct btrfs_extent_item *ei;
struct scrub_warning swarn;
unsigned long ptr = 0;
- u64 extent_item_pos;
u64 flags = 0;
u64 ref_root;
u32 item_size;
@@ -726,15 +915,21 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
int ret;
WARN_ON(sblock->sector_count < 1);
- dev = sblock->sectors[0]->dev;
+ dev = sblock->dev;
fs_info = sblock->sctx->fs_info;
+ /* Super block error, no need to search extent tree. */
+ if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+ btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
+ errstr, btrfs_dev_name(dev), sblock->physical);
+ return;
+ }
path = btrfs_alloc_path();
if (!path)
return;
- swarn.physical = sblock->sectors[0]->physical;
- swarn.logical = sblock->sectors[0]->logical;
+ swarn.physical = sblock->physical;
+ swarn.logical = sblock->logical;
swarn.errstr = errstr;
swarn.dev = NULL;
@@ -743,7 +938,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
if (ret < 0)
goto out;
- extent_item_pos = swarn.logical - found_key.objectid;
swarn.extent_item_size = found_key.offset;
eb = path->nodes[0];
@@ -758,7 +952,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
errstr, swarn.logical,
- rcu_str_deref(dev->name),
+ btrfs_dev_name(dev),
swarn.physical,
ref_level ? "node" : "leaf",
ret < 0 ? -1 : ref_level,
@@ -766,12 +960,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
} while (ret != 1);
btrfs_release_path(path);
} else {
+ struct btrfs_backref_walk_ctx ctx = { 0 };
+
btrfs_release_path(path);
+
+ ctx.bytenr = found_key.objectid;
+ ctx.extent_item_pos = swarn.logical - found_key.objectid;
+ ctx.fs_info = fs_info;
+
swarn.path = path;
swarn.dev = dev;
- iterate_extent_inodes(fs_info, found_key.objectid,
- extent_item_pos, 1,
- scrub_print_warning_inode, &swarn, false);
+
+ iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn);
}
out:
@@ -804,13 +1004,14 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
{
struct scrub_ctx *sctx = sblock_to_check->sctx;
- struct btrfs_device *dev;
+ struct btrfs_device *dev = sblock_to_check->dev;
struct btrfs_fs_info *fs_info;
u64 logical;
unsigned int failed_mirror_index;
unsigned int is_metadata;
unsigned int have_csum;
- struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
+ /* One scrub_block for each mirror */
+ struct scrub_block *sblocks_for_recheck[BTRFS_MAX_MIRRORS] = { 0 };
struct scrub_block *sblock_bad;
int ret;
int mirror_index;
@@ -825,22 +1026,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
fs_info = sctx->fs_info;
if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
/*
- * if we find an error in a super block, we just report it.
+ * If we find an error in a super block, we just report it.
* They will get written with the next transaction commit
* anyway
*/
+ scrub_print_warning("super block error", sblock_to_check);
spin_lock(&sctx->stat_lock);
++sctx->stat.super_errors;
spin_unlock(&sctx->stat_lock);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
return 0;
}
- logical = sblock_to_check->sectors[0]->logical;
- BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1);
- failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1;
+ logical = sblock_to_check->logical;
+ ASSERT(sblock_to_check->mirror_num);
+ failed_mirror_index = sblock_to_check->mirror_num - 1;
is_metadata = !(sblock_to_check->sectors[0]->flags &
BTRFS_EXTENT_FLAG_DATA);
have_csum = sblock_to_check->sectors[0]->have_csum;
- dev = sblock_to_check->sectors[0]->dev;
if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
return 0;
@@ -902,17 +1104,28 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* repaired area is verified in order to correctly maintain
* the statistics.
*/
-
- sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
- sizeof(*sblocks_for_recheck), GFP_KERNEL);
- if (!sblocks_for_recheck) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.malloc_errors++;
- sctx->stat.read_errors++;
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
- goto out;
+ for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
+ /*
+ * Note: the two members refs and outstanding_sectors are not
+ * used in the blocks that are used for the recheck procedure.
+ *
+ * But alloc_scrub_block() will initialize sblock::ref anyway,
+ * so we can use scrub_block_put() to clean them up.
+ *
+ * And here we don't setup the physical/dev for the sblock yet,
+ * they will be correctly initialized in scrub_setup_recheck_block().
+ */
+ sblocks_for_recheck[mirror_index] = alloc_scrub_block(sctx, NULL,
+ logical, 0, 0, mirror_index);
+ if (!sblocks_for_recheck[mirror_index]) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ sctx->stat.read_errors++;
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+ goto out;
+ }
}
/* Setup the context, map the logical blocks and alloc the sectors */
@@ -926,7 +1139,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
goto out;
}
BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
- sblock_bad = sblocks_for_recheck + failed_mirror_index;
+ sblock_bad = sblocks_for_recheck[failed_mirror_index];
/* build and submit the bios for the failed mirror, check checksums */
scrub_recheck_block(fs_info, sblock_bad, 1);
@@ -1011,22 +1224,22 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
if (mirror_index >= BTRFS_MAX_MIRRORS)
break;
- if (!sblocks_for_recheck[mirror_index].sector_count)
+ if (!sblocks_for_recheck[mirror_index]->sector_count)
break;
- sblock_other = sblocks_for_recheck + mirror_index;
+ sblock_other = sblocks_for_recheck[mirror_index];
} else {
struct scrub_recover *r = sblock_bad->sectors[0]->recover;
int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
if (mirror_index >= max_allowed)
break;
- if (!sblocks_for_recheck[1].sector_count)
+ if (!sblocks_for_recheck[1]->sector_count)
break;
ASSERT(failed_mirror_index == 0);
- sblock_other = sblocks_for_recheck + 1;
- sblock_other->sectors[0]->mirror_num = 1 + mirror_index;
+ sblock_other = sblocks_for_recheck[1];
+ sblock_other->mirror_num = 1 + mirror_index;
}
/* build and submit the bios, check checksums */
@@ -1097,12 +1310,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
/* Try to find no-io-error sector in mirrors */
for (mirror_index = 0;
mirror_index < BTRFS_MAX_MIRRORS &&
- sblocks_for_recheck[mirror_index].sector_count > 0;
+ sblocks_for_recheck[mirror_index]->sector_count > 0;
mirror_index++) {
- if (!sblocks_for_recheck[mirror_index].
+ if (!sblocks_for_recheck[mirror_index]->
sectors[sector_num]->io_error) {
- sblock_other = sblocks_for_recheck +
- mirror_index;
+ sblock_other = sblocks_for_recheck[mirror_index];
break;
}
}
@@ -1163,7 +1375,7 @@ corrected_error:
spin_unlock(&sctx->stat_lock);
btrfs_err_rl_in_rcu(fs_info,
"fixed up error at logical %llu on dev %s",
- logical, rcu_str_deref(dev->name));
+ logical, btrfs_dev_name(dev));
}
} else {
did_not_correct_error:
@@ -1172,29 +1384,32 @@ did_not_correct_error:
spin_unlock(&sctx->stat_lock);
btrfs_err_rl_in_rcu(fs_info,
"unable to fixup (regular) error at logical %llu on dev %s",
- logical, rcu_str_deref(dev->name));
+ logical, btrfs_dev_name(dev));
}
out:
- if (sblocks_for_recheck) {
- for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
- mirror_index++) {
- struct scrub_block *sblock = sblocks_for_recheck +
- mirror_index;
- struct scrub_recover *recover;
- int i;
-
- for (i = 0; i < sblock->sector_count; i++) {
- sblock->sectors[i]->sblock = NULL;
- recover = sblock->sectors[i]->recover;
- if (recover) {
- scrub_put_recover(fs_info, recover);
- sblock->sectors[i]->recover = NULL;
- }
- scrub_sector_put(sblock->sectors[i]);
+ for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
+ struct scrub_block *sblock = sblocks_for_recheck[mirror_index];
+ struct scrub_recover *recover;
+ int sector_index;
+
+ /* Not allocated, continue checking the next mirror */
+ if (!sblock)
+ continue;
+
+ for (sector_index = 0; sector_index < sblock->sector_count;
+ sector_index++) {
+ /*
+ * Here we just cleanup the recover, each sector will be
+ * properly cleaned up by later scrub_block_put()
+ */
+ recover = sblock->sectors[sector_index]->recover;
+ if (recover) {
+ scrub_put_recover(fs_info, recover);
+ sblock->sectors[sector_index]->recover = NULL;
}
}
- kfree(sblocks_for_recheck);
+ scrub_block_put(sblock);
}
ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
@@ -1244,12 +1459,12 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
}
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
- struct scrub_block *sblocks_for_recheck)
+ struct scrub_block *sblocks_for_recheck[])
{
struct scrub_ctx *sctx = original_sblock->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
+ u64 logical = original_sblock->logical;
u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
- u64 logical = original_sblock->sectors[0]->logical;
u64 generation = original_sblock->sectors[0]->generation;
u64 flags = original_sblock->sectors[0]->flags;
u64 have_csum = original_sblock->sectors[0]->have_csum;
@@ -1264,11 +1479,6 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
int nmirrors;
int ret;
- /*
- * Note: the two members refs and outstanding_sectors are not used (and
- * not set) in the blocks that are used for the recheck procedure.
- */
-
while (length > 0) {
sublen = min_t(u64, length, fs_info->sectorsize);
mapped_length = sublen;
@@ -1287,7 +1497,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
return -EIO;
}
- recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
+ recover = kzalloc(sizeof(struct scrub_recover), GFP_KERNEL);
if (!recover) {
btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
@@ -1307,24 +1517,19 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct scrub_block *sblock;
struct scrub_sector *sector;
- sblock = sblocks_for_recheck + mirror_index;
+ sblock = sblocks_for_recheck[mirror_index];
sblock->sctx = sctx;
- sector = kzalloc(sizeof(*sector), GFP_NOFS);
+ sector = alloc_scrub_sector(sblock, logical);
if (!sector) {
-leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
scrub_put_recover(fs_info, recover);
return -ENOMEM;
}
- scrub_sector_get(sector);
- sblock->sectors[sector_index] = sector;
- sector->sblock = sblock;
sector->flags = flags;
sector->generation = generation;
- sector->logical = logical;
sector->have_csum = have_csum;
if (have_csum)
memcpy(sector->csum,
@@ -1339,21 +1544,20 @@ leave_nomem:
mirror_index,
&stripe_index,
&stripe_offset);
- sector->physical = bioc->stripes[stripe_index].physical +
- stripe_offset;
- sector->dev = bioc->stripes[stripe_index].dev;
+ /*
+ * We're at the first sector, also populate @sblock
+ * physical and dev.
+ */
+ if (sector_index == 0) {
+ sblock->physical =
+ bioc->stripes[stripe_index].physical +
+ stripe_offset;
+ sblock->dev = bioc->stripes[stripe_index].dev;
+ sblock->physical_for_dev_replace =
+ original_sblock->physical_for_dev_replace;
+ }
BUG_ON(sector_index >= original_sblock->sector_count);
- sector->physical_for_dev_replace =
- original_sblock->sectors[sector_index]->
- physical_for_dev_replace;
- /* For missing devices, dev->bdev is NULL */
- sector->mirror_num = mirror_index + 1;
- sblock->sector_count++;
- sector->page = alloc_page(GFP_NOFS);
- if (!sector->page)
- goto leave_nomem;
-
scrub_get_recover(recover);
sector->recover = recover;
}
@@ -1377,11 +1581,11 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
{
DECLARE_COMPLETION_ONSTACK(done);
- bio->bi_iter.bi_sector = sector->logical >> 9;
+ bio->bi_iter.bi_sector = (sector->offset + sector->sblock->logical) >>
+ SECTOR_SHIFT;
bio->bi_private = &done;
bio->bi_end_io = scrub_bio_wait_endio;
- raid56_parity_recover(bio, sector->recover->bioc,
- sector->sblock->sectors[0]->mirror_num, false);
+ raid56_parity_recover(bio, sector->recover->bioc, sector->sblock->mirror_num);
wait_for_completion_io(&done);
return blk_status_to_errno(bio->bi_status);
@@ -1395,17 +1599,16 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
int i;
/* All sectors in sblock belong to the same stripe on the same device. */
- ASSERT(first_sector->dev);
- if (!first_sector->dev->bdev)
+ ASSERT(sblock->dev);
+ if (!sblock->dev->bdev)
goto out;
- bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
+ bio = bio_alloc(sblock->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
for (i = 0; i < sblock->sector_count; i++) {
struct scrub_sector *sector = sblock->sectors[i];
- WARN_ON(!sector->page);
- bio_add_page(bio, sector->page, PAGE_SIZE, 0);
+ bio_add_scrub_sector(bio, sector, fs_info->sectorsize);
}
if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
@@ -1449,16 +1652,16 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct bio bio;
struct bio_vec bvec;
- if (sector->dev->bdev == NULL) {
+ if (sblock->dev->bdev == NULL) {
sector->io_error = 1;
sblock->no_io_error_seen = 0;
continue;
}
- WARN_ON(!sector->page);
- bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ);
- bio_add_page(&bio, sector->page, fs_info->sectorsize, 0);
- bio.bi_iter.bi_sector = sector->physical >> 9;
+ bio_init(&bio, sblock->dev->bdev, &bvec, 1, REQ_OP_READ);
+ bio_add_scrub_sector(&bio, sector, fs_info->sectorsize);
+ bio.bi_iter.bi_sector = (sblock->physical + sector->offset) >>
+ SECTOR_SHIFT;
btrfsic_check_bio(&bio);
if (submit_bio_wait(&bio)) {
@@ -1475,7 +1678,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
{
- struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices;
+ struct btrfs_fs_devices *fs_devices = sector->sblock->dev->fs_devices;
int ret;
ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
@@ -1521,30 +1724,29 @@ static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
const u32 sectorsize = fs_info->sectorsize;
- BUG_ON(sector_bad->page == NULL);
- BUG_ON(sector_good->page == NULL);
if (force_write || sblock_bad->header_error ||
sblock_bad->checksum_error || sector_bad->io_error) {
struct bio bio;
struct bio_vec bvec;
int ret;
- if (!sector_bad->dev->bdev) {
+ if (!sblock_bad->dev->bdev) {
btrfs_warn_rl(fs_info,
"scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
return -EIO;
}
- bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
- bio.bi_iter.bi_sector = sector_bad->physical >> 9;
- __bio_add_page(&bio, sector_good->page, sectorsize, 0);
+ bio_init(&bio, sblock_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
+ bio.bi_iter.bi_sector = (sblock_bad->physical +
+ sector_bad->offset) >> SECTOR_SHIFT;
+ ret = bio_add_scrub_sector(&bio, sector_good, sectorsize);
btrfsic_check_bio(&bio);
ret = submit_bio_wait(&bio);
bio_uninit(&bio);
if (ret) {
- btrfs_dev_stat_inc_and_print(sector_bad->dev,
+ btrfs_dev_stat_inc_and_print(sblock_bad->dev,
BTRFS_DEV_STAT_WRITE_ERRS);
atomic64_inc(&fs_info->dev_replace.num_write_errors);
return -EIO;
@@ -1577,11 +1779,11 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
{
+ const u32 sectorsize = sblock->sctx->fs_info->sectorsize;
struct scrub_sector *sector = sblock->sectors[sector_num];
- BUG_ON(sector->page == NULL);
if (sector->io_error)
- clear_page(page_address(sector->page));
+ memset(scrub_sector_get_kaddr(sector), 0, sectorsize);
return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
}
@@ -1608,9 +1810,15 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
return ret;
}
+static void scrub_block_get(struct scrub_block *sblock)
+{
+ refcount_inc(&sblock->refs);
+}
+
static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
struct scrub_sector *sector)
{
+ struct scrub_block *sblock = sector->sblock;
struct scrub_bio *sbio;
int ret;
const u32 sectorsize = sctx->fs_info->sectorsize;
@@ -1629,14 +1837,15 @@ again:
}
sbio = sctx->wr_curr_bio;
if (sbio->sector_count == 0) {
- ret = fill_writer_pointer_gap(sctx, sector->physical_for_dev_replace);
+ ret = fill_writer_pointer_gap(sctx, sector->offset +
+ sblock->physical_for_dev_replace);
if (ret) {
mutex_unlock(&sctx->wr_lock);
return ret;
}
- sbio->physical = sector->physical_for_dev_replace;
- sbio->logical = sector->logical;
+ sbio->physical = sblock->physical_for_dev_replace + sector->offset;
+ sbio->logical = sblock->logical + sector->offset;
sbio->dev = sctx->wr_tgtdev;
if (!sbio->bio) {
sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
@@ -1647,14 +1856,14 @@ again:
sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
sbio->status = 0;
} else if (sbio->physical + sbio->sector_count * sectorsize !=
- sector->physical_for_dev_replace ||
+ sblock->physical_for_dev_replace + sector->offset ||
sbio->logical + sbio->sector_count * sectorsize !=
- sector->logical) {
+ sblock->logical + sector->offset) {
scrub_wr_submit(sctx);
goto again;
}
- ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
+ ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
if (ret != sectorsize) {
if (sbio->sector_count < 1) {
bio_put(sbio->bio);
@@ -1668,6 +1877,13 @@ again:
sbio->sectors[sbio->sector_count] = sector;
scrub_sector_get(sector);
+ /*
+ * Since ssector no longer holds a page, but uses sblock::pages, we
+ * have to ensure the sblock had not been freed before our write bio
+ * finished.
+ */
+ scrub_block_get(sector->sblock);
+
sbio->sector_count++;
if (sbio->sector_count == sctx->sectors_per_bio)
scrub_wr_submit(sctx);
@@ -1729,8 +1945,14 @@ static void scrub_wr_bio_end_io_worker(struct work_struct *work)
}
}
- for (i = 0; i < sbio->sector_count; i++)
+ /*
+ * In scrub_add_sector_to_wr_bio() we grab extra ref for sblock, now in
+ * endio we should put the sblock.
+ */
+ for (i = 0; i < sbio->sector_count; i++) {
+ scrub_block_put(sbio->sectors[i]->sblock);
scrub_sector_put(sbio->sectors[i]);
+ }
bio_put(sbio->bio);
kfree(sbio);
@@ -1762,7 +1984,7 @@ static int scrub_checksum(struct scrub_block *sblock)
else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
ret = scrub_checksum_tree_block(sblock);
else if (flags & BTRFS_EXTENT_FLAG_SUPER)
- (void)scrub_checksum_super(sblock);
+ ret = scrub_checksum_super(sblock);
else
WARN_ON(1);
if (ret)
@@ -1785,15 +2007,11 @@ static int scrub_checksum_data(struct scrub_block *sblock)
if (!sector->have_csum)
return 0;
- kaddr = page_address(sector->page);
+ kaddr = scrub_sector_get_kaddr(sector);
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
- /*
- * In scrub_sectors() and scrub_sectors_for_parity() we ensure each sector
- * only contains one sector of data.
- */
crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
if (memcmp(csum, sector->csum, fs_info->csum_size))
@@ -1826,7 +2044,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
ASSERT(sblock->sector_count == num_sectors);
sector = sblock->sectors[0];
- kaddr = page_address(sector->page);
+ kaddr = scrub_sector_get_kaddr(sector);
h = (struct btrfs_header *)kaddr;
memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
@@ -1835,7 +2053,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
* a) don't have an extent buffer and
* b) the page is already kmapped
*/
- if (sector->logical != btrfs_stack_header_bytenr(h))
+ if (sblock->logical != btrfs_stack_header_bytenr(h))
sblock->header_error = 1;
if (sector->generation != btrfs_stack_header_generation(h)) {
@@ -1856,7 +2074,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
sectorsize - BTRFS_CSUM_SIZE);
for (i = 1; i < num_sectors; i++) {
- kaddr = page_address(sblock->sectors[i]->page);
+ kaddr = scrub_sector_get_kaddr(sblock->sectors[i]);
crypto_shash_update(shash, kaddr, sectorsize);
}
@@ -1881,10 +2099,10 @@ static int scrub_checksum_super(struct scrub_block *sblock)
BUG_ON(sblock->sector_count < 1);
sector = sblock->sectors[0];
- kaddr = page_address(sector->page);
+ kaddr = scrub_sector_get_kaddr(sector);
s = (struct btrfs_super_block *)kaddr;
- if (sector->logical != btrfs_super_bytenr(s))
+ if (sblock->logical != btrfs_super_bytenr(s))
++fail_cor;
if (sector->generation != btrfs_super_generation(s))
@@ -1901,31 +2119,9 @@ static int scrub_checksum_super(struct scrub_block *sblock)
if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
++fail_cor;
- if (fail_cor + fail_gen) {
- /*
- * if we find an error in a super block, we just report it.
- * They will get written with the next transaction commit
- * anyway
- */
- spin_lock(&sctx->stat_lock);
- ++sctx->stat.super_errors;
- spin_unlock(&sctx->stat_lock);
- if (fail_cor)
- btrfs_dev_stat_inc_and_print(sector->dev,
- BTRFS_DEV_STAT_CORRUPTION_ERRS);
- else
- btrfs_dev_stat_inc_and_print(sector->dev,
- BTRFS_DEV_STAT_GENERATION_ERRS);
- }
-
return fail_cor + fail_gen;
}
-static void scrub_block_get(struct scrub_block *sblock)
-{
- refcount_inc(&sblock->refs);
-}
-
static void scrub_block_put(struct scrub_block *sblock)
{
if (refcount_dec_and_test(&sblock->refs)) {
@@ -1936,6 +2132,12 @@ static void scrub_block_put(struct scrub_block *sblock)
for (i = 0; i < sblock->sector_count; i++)
scrub_sector_put(sblock->sectors[i]);
+ for (i = 0; i < DIV_ROUND_UP(sblock->len, PAGE_SIZE); i++) {
+ if (sblock->pages[i]) {
+ detach_scrub_page_private(sblock->pages[i]);
+ __free_page(sblock->pages[i]);
+ }
+ }
kfree(sblock);
}
}
@@ -1947,11 +2149,8 @@ static void scrub_sector_get(struct scrub_sector *sector)
static void scrub_sector_put(struct scrub_sector *sector)
{
- if (atomic_dec_and_test(&sector->refs)) {
- if (sector->page)
- __free_page(sector->page);
+ if (atomic_dec_and_test(&sector->refs))
kfree(sector);
- }
}
/*
@@ -2056,9 +2255,9 @@ again:
}
sbio = sctx->bios[sctx->curr];
if (sbio->sector_count == 0) {
- sbio->physical = sector->physical;
- sbio->logical = sector->logical;
- sbio->dev = sector->dev;
+ sbio->physical = sblock->physical + sector->offset;
+ sbio->logical = sblock->logical + sector->offset;
+ sbio->dev = sblock->dev;
if (!sbio->bio) {
sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
REQ_OP_READ, GFP_NOFS);
@@ -2068,16 +2267,16 @@ again:
sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
sbio->status = 0;
} else if (sbio->physical + sbio->sector_count * sectorsize !=
- sector->physical ||
+ sblock->physical + sector->offset ||
sbio->logical + sbio->sector_count * sectorsize !=
- sector->logical ||
- sbio->dev != sector->dev) {
+ sblock->logical + sector->offset ||
+ sbio->dev != sblock->dev) {
scrub_submit(sctx);
goto again;
}
sbio->sectors[sbio->sector_count] = sector;
- ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
+ ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
if (ret != sectorsize) {
if (sbio->sector_count < 1) {
bio_put(sbio->bio);
@@ -2102,6 +2301,7 @@ static void scrub_missing_raid56_end_io(struct bio *bio)
struct scrub_block *sblock = bio->bi_private;
struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
+ btrfs_bio_counter_dec(fs_info);
if (bio->bi_status)
sblock->no_io_error_seen = 0;
@@ -2118,8 +2318,8 @@ static void scrub_missing_raid56_worker(struct work_struct *work)
u64 logical;
struct btrfs_device *dev;
- logical = sblock->sectors[0]->logical;
- dev = sblock->sectors[0]->dev;
+ logical = sblock->logical;
+ dev = sblock->dev;
if (sblock->no_io_error_seen)
scrub_recheck_block_checksum(sblock);
@@ -2130,14 +2330,14 @@ static void scrub_missing_raid56_worker(struct work_struct *work)
spin_unlock(&sctx->stat_lock);
btrfs_err_rl_in_rcu(fs_info,
"IO error rebuilding logical %llu for dev %s",
- logical, rcu_str_deref(dev->name));
+ logical, btrfs_dev_name(dev));
} else if (sblock->header_error || sblock->checksum_error) {
spin_lock(&sctx->stat_lock);
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
btrfs_err_rl_in_rcu(fs_info,
"failed to rebuild valid logical %llu for dev %s",
- logical, rcu_str_deref(dev->name));
+ logical, btrfs_dev_name(dev));
} else {
scrub_write_block_to_dev_replace(sblock);
}
@@ -2157,7 +2357,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
struct scrub_ctx *sctx = sblock->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
u64 length = sblock->sector_count << fs_info->sectorsize_bits;
- u64 logical = sblock->sectors[0]->logical;
+ u64 logical = sblock->logical;
struct btrfs_io_context *bioc = NULL;
struct bio *bio;
struct btrfs_raid_bio *rbio;
@@ -2193,17 +2393,16 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
for (i = 0; i < sblock->sector_count; i++) {
struct scrub_sector *sector = sblock->sectors[i];
- /*
- * For now, our scrub is still one page per sector, so pgoff
- * is always 0.
- */
- raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical);
+ raid56_add_scrub_pages(rbio, scrub_sector_get_page(sector),
+ scrub_sector_get_page_offset(sector),
+ sector->offset + sector->sblock->logical);
}
INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
scrub_block_get(sblock);
scrub_pending_bio_inc(sctx);
raid56_submit_missing_rbio(rbio);
+ btrfs_put_bioc(bioc);
return;
rbio_out:
@@ -2225,7 +2424,8 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
const u32 sectorsize = sctx->fs_info->sectorsize;
int index;
- sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
+ sblock = alloc_scrub_block(sctx, dev, logical, physical,
+ physical_for_dev_replace, mirror_num);
if (!sblock) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2233,12 +2433,6 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
return -ENOMEM;
}
- /* one ref inside this function, plus one for each page added to
- * a bio later on */
- refcount_set(&sblock->refs, 1);
- sblock->sctx = sctx;
- sblock->no_io_error_seen = 1;
-
for (index = 0; len > 0; index++) {
struct scrub_sector *sector;
/*
@@ -2248,36 +2442,22 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
*/
u32 l = min(sectorsize, len);
- sector = kzalloc(sizeof(*sector), GFP_KERNEL);
+ sector = alloc_scrub_sector(sblock, logical);
if (!sector) {
-leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
scrub_block_put(sblock);
return -ENOMEM;
}
- ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
- scrub_sector_get(sector);
- sblock->sectors[index] = sector;
- sector->sblock = sblock;
- sector->dev = dev;
sector->flags = flags;
sector->generation = gen;
- sector->logical = logical;
- sector->physical = physical;
- sector->physical_for_dev_replace = physical_for_dev_replace;
- sector->mirror_num = mirror_num;
if (csum) {
sector->have_csum = 1;
memcpy(sector->csum, csum, sctx->fs_info->csum_size);
} else {
sector->have_csum = 0;
}
- sblock->sector_count++;
- sector->page = alloc_page(GFP_KERNEL);
- if (!sector->page)
- goto leave_nomem;
len -= l;
logical += l;
physical += l;
@@ -2423,8 +2603,9 @@ static void scrub_block_complete(struct scrub_block *sblock)
}
if (sblock->sparity && corrupted && !sblock->data_corrected) {
- u64 start = sblock->sectors[0]->logical;
- u64 end = sblock->sectors[sblock->sector_count - 1]->logical +
+ u64 start = sblock->logical;
+ u64 end = sblock->logical +
+ sblock->sectors[sblock->sector_count - 1]->offset +
sblock->sctx->fs_info->sectorsize;
ASSERT(end - start <= U32_MAX);
@@ -2578,7 +2759,7 @@ static int scrub_sectors_for_parity(struct scrub_parity *sparity,
ASSERT(IS_ALIGNED(len, sectorsize));
- sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
+ sblock = alloc_scrub_block(sctx, dev, logical, physical, physical, mirror_num);
if (!sblock) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2586,51 +2767,32 @@ static int scrub_sectors_for_parity(struct scrub_parity *sparity,
return -ENOMEM;
}
- /* one ref inside this function, plus one for each page added to
- * a bio later on */
- refcount_set(&sblock->refs, 1);
- sblock->sctx = sctx;
- sblock->no_io_error_seen = 1;
sblock->sparity = sparity;
scrub_parity_get(sparity);
for (index = 0; len > 0; index++) {
struct scrub_sector *sector;
- sector = kzalloc(sizeof(*sector), GFP_KERNEL);
+ sector = alloc_scrub_sector(sblock, logical);
if (!sector) {
-leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
scrub_block_put(sblock);
return -ENOMEM;
}
- ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
- /* For scrub block */
- scrub_sector_get(sector);
sblock->sectors[index] = sector;
/* For scrub parity */
scrub_sector_get(sector);
list_add_tail(&sector->list, &sparity->sectors_list);
- sector->sblock = sblock;
- sector->dev = dev;
sector->flags = flags;
sector->generation = gen;
- sector->logical = logical;
- sector->physical = physical;
- sector->mirror_num = mirror_num;
if (csum) {
sector->have_csum = 1;
memcpy(sector->csum, csum, sctx->fs_info->csum_size);
} else {
sector->have_csum = 0;
}
- sblock->sector_count++;
- sector->page = alloc_page(GFP_KERNEL);
- if (!sector->page)
- goto leave_nomem;
-
/* Iterate over the stripe range in sectorsize steps */
len -= sectorsize;
@@ -2774,6 +2936,7 @@ static void scrub_parity_bio_endio_worker(struct work_struct *work)
work);
struct scrub_ctx *sctx = sparity->sctx;
+ btrfs_bio_counter_dec(sctx->fs_info);
scrub_free_parity(sparity);
scrub_pending_bio_dec(sctx);
}
@@ -2824,6 +2987,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
sparity->scrub_dev,
&sparity->dbitmap,
sparity->nsectors);
+ btrfs_put_bioc(bioc);
if (!rbio)
goto rbio_out;
@@ -2835,7 +2999,6 @@ rbio_out:
bio_put(bio);
bioc_out:
btrfs_bio_counter_dec(fs_info);
- btrfs_put_bioc(bioc);
bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
sparity->nsectors);
spin_lock(&sctx->stat_lock);
@@ -3075,9 +3238,9 @@ static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
extent_dev = bioc->stripes[0].dev;
btrfs_put_bioc(bioc);
- ret = btrfs_lookup_csums_range(csum_root, extent_start,
- extent_start + extent_size - 1,
- &sctx->csum_list, 1);
+ ret = btrfs_lookup_csums_list(csum_root, extent_start,
+ extent_start + extent_size - 1,
+ &sctx->csum_list, 1, false);
if (ret) {
scrub_parity_mark_sectors_error(sparity, extent_start,
extent_size);
@@ -3266,7 +3429,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
}
/* Block group removed? */
spin_lock(&bg->lock);
- if (bg->removed) {
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
spin_unlock(&bg->lock);
ret = 0;
break;
@@ -3301,9 +3464,9 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
cur_logical;
if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
- ret = btrfs_lookup_csums_range(csum_root, cur_logical,
+ ret = btrfs_lookup_csums_list(csum_root, cur_logical,
cur_logical + scrub_len - 1,
- &sctx->csum_list, 1);
+ &sctx->csum_list, 1, false);
if (ret)
break;
}
@@ -3606,7 +3769,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
* kthread or relocation.
*/
spin_lock(&bg->lock);
- if (!bg->removed)
+ if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags))
ret = -EINVAL;
spin_unlock(&bg->lock);
@@ -3764,13 +3927,10 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
}
if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
- spin_lock(&cache->lock);
- if (!cache->to_copy) {
- spin_unlock(&cache->lock);
+ if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
btrfs_put_block_group(cache);
goto skip;
}
- spin_unlock(&cache->lock);
}
/*
@@ -3782,7 +3942,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
* repair extents.
*/
spin_lock(&cache->lock);
- if (cache->removed) {
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
spin_unlock(&cache->lock);
btrfs_put_block_group(cache);
goto skip;
@@ -3942,8 +4102,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
* balance is triggered or it becomes used and unused again.
*/
spin_lock(&cache->lock);
- if (!cache->removed && !cache->ro && cache->reserved == 0 &&
- cache->used == 0) {
+ if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) &&
+ !cache->ro && cache->reserved == 0 && cache->used == 0) {
spin_unlock(&cache->lock);
if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
btrfs_discard_queue_work(&fs_info->discard_ctl,
@@ -4102,36 +4262,21 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
int ret;
struct btrfs_device *dev;
unsigned int nofs_flag;
+ bool need_commit = false;
if (btrfs_fs_closing(fs_info))
return -EAGAIN;
- if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
- /*
- * in this case scrub is unable to calculate the checksum
- * the way scrub is implemented. Do not handle this
- * situation at all because it won't ever happen.
- */
- btrfs_err(fs_info,
- "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
- fs_info->nodesize,
- BTRFS_STRIPE_LEN);
- return -EINVAL;
- }
+ /* At mount time we have ensured nodesize is in the range of [4K, 64K]. */
+ ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN);
- if (fs_info->nodesize >
- SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits ||
- fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) {
- /*
- * Would exhaust the array bounds of sectorv member in
- * struct scrub_block
- */
- btrfs_err(fs_info,
-"scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails",
- fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK,
- fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK);
- return -EINVAL;
- }
+ /*
+ * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible
+ * value (max nodesize / min sectorsize), thus nodesize should always
+ * be fine.
+ */
+ ASSERT(fs_info->nodesize <=
+ SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits);
/* Allocate outside of device_list_mutex */
sctx = scrub_setup_ctx(fs_info, is_dev_replace);
@@ -4156,7 +4301,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
btrfs_err_in_rcu(fs_info,
"scrub on devid %llu: filesystem on %s is not writable",
- devid, rcu_str_deref(dev->name));
+ devid, btrfs_dev_name(dev));
ret = -EROFS;
goto out;
}
@@ -4205,6 +4350,12 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
*/
nofs_flag = memalloc_nofs_save();
if (!is_dev_replace) {
+ u64 old_super_errors;
+
+ spin_lock(&sctx->stat_lock);
+ old_super_errors = sctx->stat.super_errors;
+ spin_unlock(&sctx->stat_lock);
+
btrfs_info(fs_info, "scrub: started on devid %llu", devid);
/*
* by holding device list mutex, we can
@@ -4213,6 +4364,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_lock(&fs_info->fs_devices->device_list_mutex);
ret = scrub_supers(sctx, dev);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+ spin_lock(&sctx->stat_lock);
+ /*
+ * Super block errors found, but we can not commit transaction
+ * at current context, since btrfs_commit_transaction() needs
+ * to pause the current running scrub (hold by ourselves).
+ */
+ if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
+ need_commit = true;
+ spin_unlock(&sctx->stat_lock);
}
if (!ret)
@@ -4239,6 +4400,25 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
scrub_workers_put(fs_info);
scrub_put_ctx(sctx);
+ /*
+ * We found some super block errors before, now try to force a
+ * transaction commit, as scrub has finished.
+ */
+ if (need_commit) {
+ struct btrfs_trans_handle *trans;
+
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_err(fs_info,
+ "scrub: failed to start transaction to fix super block errors: %d", ret);
+ return ret;
+ }
+ ret = btrfs_commit_transaction(trans);
+ if (ret < 0)
+ btrfs_err(fs_info,
+ "scrub: failed to commit transaction to fix super block errors: %d", ret);
+ }
return ret;
out:
scrub_workers_put(fs_info);
diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h
new file mode 100644
index 000000000000..7639103ebf9d
--- /dev/null
+++ b/fs/btrfs/scrub.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_SCRUB_H
+#define BTRFS_SCRUB_H
+
+int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
+ u64 end, struct btrfs_scrub_progress *progress,
+ int readonly, int is_dev_replace);
+void btrfs_scrub_pause(struct btrfs_fs_info *fs_info);
+void btrfs_scrub_continue(struct btrfs_fs_info *fs_info);
+int btrfs_scrub_cancel(struct btrfs_fs_info *info);
+int btrfs_scrub_cancel_dev(struct btrfs_device *dev);
+int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
+ struct btrfs_scrub_progress *progress);
+
+#endif
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e7671afcee4f..e65e6b6600a7 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -15,6 +15,7 @@
#include <linux/string.h>
#include <linux/compat.h>
#include <linux/crc32c.h>
+#include <linux/fsverity.h>
#include "send.h"
#include "ctree.h"
@@ -26,6 +27,11 @@
#include "compression.h"
#include "xattr.h"
#include "print-tree.h"
+#include "accessors.h"
+#include "dir-item.h"
+#include "file-item.h"
+#include "ioctl.h"
+#include "verity.h"
/*
* Maximum number of references an extent can have in order for us to attempt to
@@ -33,7 +39,7 @@
* avoid hitting limitations of the backreference walking code (taking a lot of
* time and using too much memory for extents with large number of references).
*/
-#define SEND_MAX_EXTENT_REFS 64
+#define SEND_MAX_EXTENT_REFS 1024
/*
* A fs_path is a helper to dynamically build path names with unknown size.
@@ -70,13 +76,46 @@ struct clone_root {
struct btrfs_root *root;
u64 ino;
u64 offset;
-
- u64 found_refs;
+ u64 num_bytes;
+ bool found_ref;
};
#define SEND_CTX_MAX_NAME_CACHE_SIZE 128
#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2)
+/*
+ * Limit the root_ids array of struct backref_cache_entry to 12 elements.
+ * This makes the size of a cache entry to be exactly 128 bytes on x86_64.
+ * The most common case is to have a single root for cloning, which corresponds
+ * to the send root. Having the user specify more than 11 clone roots is not
+ * common, and in such rare cases we simply don't use caching if the number of
+ * cloning roots that lead down to a leaf is more than 12.
+ */
+#define SEND_MAX_BACKREF_CACHE_ROOTS 12
+
+/*
+ * Max number of entries in the cache.
+ * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, the size in bytes, excluding
+ * maple tree's internal nodes, is 16K.
+ */
+#define SEND_MAX_BACKREF_CACHE_SIZE 128
+
+/*
+ * A backref cache entry maps a leaf to a list of IDs of roots from which the
+ * leaf is accessible and we can use for clone operations.
+ * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, each cache entry is 128 bytes (on
+ * x86_64).
+ */
+struct backref_cache_entry {
+ /* List to link to the cache's lru list. */
+ struct list_head list;
+ /* The key for this entry in the cache. */
+ u64 key;
+ u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS];
+ /* Number of valid elements in the root_ids array. */
+ int num_roots;
+};
+
struct send_ctx {
struct file *send_filp;
loff_t send_off;
@@ -127,6 +166,8 @@ struct send_ctx {
bool cur_inode_new_gen;
bool cur_inode_deleted;
bool ignore_cur_inode;
+ bool cur_inode_needs_verity;
+ void *verity_descriptor;
u64 send_progress;
@@ -243,6 +284,14 @@ struct send_ctx {
struct rb_root rbtree_new_refs;
struct rb_root rbtree_deleted_refs;
+
+ struct {
+ u64 last_reloc_trans;
+ struct list_head lru_list;
+ struct maple_tree entries;
+ /* Number of entries stored in the cache. */
+ int size;
+ } backref_cache;
};
struct pending_dir_move {
@@ -345,6 +394,7 @@ static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd)
switch (sctx->proto) {
case 1: return cmd <= BTRFS_SEND_C_MAX_V1;
case 2: return cmd <= BTRFS_SEND_C_MAX_V2;
+ case 3: return cmd <= BTRFS_SEND_C_MAX_V3;
default: return false;
}
}
@@ -436,6 +486,11 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
old_buf_len = p->buf_len;
/*
+ * Allocate to the next largest kmalloc bucket size, to let
+ * the fast path happen most of the time.
+ */
+ len = kmalloc_size_roundup(len);
+ /*
* First time the inline_buf does not suffice
*/
if (p->buf == p->inline_buf) {
@@ -448,11 +503,7 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
if (!tmp_buf)
return -ENOMEM;
p->buf = tmp_buf;
- /*
- * The real size of the buffer is bigger, this will let the fast path
- * happen most of the time
- */
- p->buf_len = ksize(p->buf);
+ p->buf_len = len;
if (p->reversed) {
tmp_buf = p->buf + old_buf_len - path_len - 1;
@@ -624,6 +675,7 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \
}
+TLV_PUT_DEFINE_INT(8)
TLV_PUT_DEFINE_INT(32)
TLV_PUT_DEFINE_INT(64)
@@ -842,17 +894,32 @@ out:
return ret;
}
+struct btrfs_inode_info {
+ u64 size;
+ u64 gen;
+ u64 mode;
+ u64 uid;
+ u64 gid;
+ u64 rdev;
+ u64 fileattr;
+ u64 nlink;
+};
+
/*
* Helper function to retrieve some fields from an inode item.
*/
-static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
- u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid,
- u64 *gid, u64 *rdev, u64 *fileattr)
+static int get_inode_info(struct btrfs_root *root, u64 ino,
+ struct btrfs_inode_info *info)
{
int ret;
+ struct btrfs_path *path;
struct btrfs_inode_item *ii;
struct btrfs_key key;
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
key.objectid = ino;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
@@ -860,47 +927,43 @@ static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
if (ret) {
if (ret > 0)
ret = -ENOENT;
- return ret;
+ goto out;
}
+ if (!info)
+ goto out;
+
ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
- if (size)
- *size = btrfs_inode_size(path->nodes[0], ii);
- if (gen)
- *gen = btrfs_inode_generation(path->nodes[0], ii);
- if (mode)
- *mode = btrfs_inode_mode(path->nodes[0], ii);
- if (uid)
- *uid = btrfs_inode_uid(path->nodes[0], ii);
- if (gid)
- *gid = btrfs_inode_gid(path->nodes[0], ii);
- if (rdev)
- *rdev = btrfs_inode_rdev(path->nodes[0], ii);
+ info->size = btrfs_inode_size(path->nodes[0], ii);
+ info->gen = btrfs_inode_generation(path->nodes[0], ii);
+ info->mode = btrfs_inode_mode(path->nodes[0], ii);
+ info->uid = btrfs_inode_uid(path->nodes[0], ii);
+ info->gid = btrfs_inode_gid(path->nodes[0], ii);
+ info->rdev = btrfs_inode_rdev(path->nodes[0], ii);
+ info->nlink = btrfs_inode_nlink(path->nodes[0], ii);
/*
* Transfer the unchanged u64 value of btrfs_inode_item::flags, that's
* otherwise logically split to 32/32 parts.
*/
- if (fileattr)
- *fileattr = btrfs_inode_flags(path->nodes[0], ii);
+ info->fileattr = btrfs_inode_flags(path->nodes[0], ii);
+out:
+ btrfs_free_path(path);
return ret;
}
-static int get_inode_info(struct btrfs_root *root,
- u64 ino, u64 *size, u64 *gen,
- u64 *mode, u64 *uid, u64 *gid,
- u64 *rdev, u64 *fileattr)
+static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen)
{
- struct btrfs_path *path;
int ret;
+ struct btrfs_inode_info info;
- path = alloc_path_for_send();
- if (!path)
- return -ENOMEM;
- ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid,
- rdev, fileattr);
- btrfs_free_path(path);
+ if (!gen)
+ return -EPERM;
+
+ ret = get_inode_info(root, ino, &info);
+ if (!ret)
+ *gen = info.gen;
return ret;
}
@@ -1077,7 +1140,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
data_len = btrfs_dir_data_len(eb, di);
btrfs_dir_item_key_to_cpu(eb, di, &di_key);
- if (btrfs_dir_type(eb, di) == BTRFS_FT_XATTR) {
+ if (btrfs_dir_ftype(eb, di) == BTRFS_FT_XATTR) {
if (name_len > XATTR_NAME_MAX) {
ret = -ENAMETOOLONG;
goto out;
@@ -1220,8 +1283,12 @@ struct backref_ctx {
/* may be truncated in case it's the last extent in a file */
u64 extent_len;
- /* Just to check for bugs in backref resolving */
- int found_itself;
+ /* The bytenr the file extent item we are processing refers to. */
+ u64 bytenr;
+ /* The owner (root id) of the data backref for the current extent. */
+ u64 backref_owner;
+ /* The offset of the data backref for the current extent. */
+ u64 backref_offset;
};
static int __clone_root_cmp_bsearch(const void *key, const void *elt)
@@ -1250,32 +1317,33 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2)
/*
* Called for every backref that is found for the current extent.
- * Results are collected in sctx->clone_roots->ino/offset/found_refs
+ * Results are collected in sctx->clone_roots->ino/offset.
*/
-static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
+static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id,
+ void *ctx_)
{
struct backref_ctx *bctx = ctx_;
- struct clone_root *found;
+ struct clone_root *clone_root;
/* First check if the root is in the list of accepted clone sources */
- found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
- bctx->sctx->clone_roots_cnt,
- sizeof(struct clone_root),
- __clone_root_cmp_bsearch);
- if (!found)
+ clone_root = bsearch((void *)(uintptr_t)root_id, bctx->sctx->clone_roots,
+ bctx->sctx->clone_roots_cnt,
+ sizeof(struct clone_root),
+ __clone_root_cmp_bsearch);
+ if (!clone_root)
return 0;
- if (found->root == bctx->sctx->send_root &&
+ /* This is our own reference, bail out as we can't clone from it. */
+ if (clone_root->root == bctx->sctx->send_root &&
ino == bctx->cur_objectid &&
- offset == bctx->cur_offset) {
- bctx->found_itself = 1;
- }
+ offset == bctx->cur_offset)
+ return 0;
/*
* Make sure we don't consider clones from send_root that are
* behind the current inode/offset.
*/
- if (found->root == bctx->sctx->send_root) {
+ if (clone_root->root == bctx->sctx->send_root) {
/*
* If the source inode was not yet processed we can't issue a
* clone operation, as the source extent does not exist yet at
@@ -1296,21 +1364,217 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
}
bctx->found++;
- found->found_refs++;
- if (ino < found->ino) {
- found->ino = ino;
- found->offset = offset;
- } else if (found->ino == ino) {
+ clone_root->found_ref = true;
+
+ /*
+ * If the given backref refers to a file extent item with a larger
+ * number of bytes than what we found before, use the new one so that
+ * we clone more optimally and end up doing less writes and getting
+ * less exclusive, non-shared extents at the destination.
+ */
+ if (num_bytes > clone_root->num_bytes) {
+ clone_root->ino = ino;
+ clone_root->offset = offset;
+ clone_root->num_bytes = num_bytes;
+
/*
- * same extent found more then once in the same file.
+ * Found a perfect candidate, so there's no need to continue
+ * backref walking.
*/
- if (found->offset > offset + bctx->extent_len)
- found->offset = offset;
+ if (num_bytes >= bctx->extent_len)
+ return BTRFS_ITERATE_EXTENT_INODES_STOP;
}
return 0;
}
+static void empty_backref_cache(struct send_ctx *sctx)
+{
+ struct backref_cache_entry *entry;
+ struct backref_cache_entry *tmp;
+
+ list_for_each_entry_safe(entry, tmp, &sctx->backref_cache.lru_list, list)
+ kfree(entry);
+
+ INIT_LIST_HEAD(&sctx->backref_cache.lru_list);
+ mtree_destroy(&sctx->backref_cache.entries);
+ sctx->backref_cache.size = 0;
+}
+
+static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx,
+ const u64 **root_ids_ret, int *root_count_ret)
+{
+ struct backref_ctx *bctx = ctx;
+ struct send_ctx *sctx = bctx->sctx;
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ const u64 key = leaf_bytenr >> fs_info->sectorsize_bits;
+ struct backref_cache_entry *entry;
+
+ if (sctx->backref_cache.size == 0)
+ return false;
+
+ /*
+ * If relocation happened since we first filled the cache, then we must
+ * empty the cache and can not use it, because even though we operate on
+ * read-only roots, their leaves and nodes may have been reallocated and
+ * now be used for different nodes/leaves of the same tree or some other
+ * tree.
+ *
+ * We are called from iterate_extent_inodes() while either holding a
+ * transaction handle or holding fs_info->commit_root_sem, so no need
+ * to take any lock here.
+ */
+ if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) {
+ empty_backref_cache(sctx);
+ return false;
+ }
+
+ entry = mtree_load(&sctx->backref_cache.entries, key);
+ if (!entry)
+ return false;
+
+ *root_ids_ret = entry->root_ids;
+ *root_count_ret = entry->num_roots;
+ list_move_tail(&entry->list, &sctx->backref_cache.lru_list);
+
+ return true;
+}
+
+static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids,
+ void *ctx)
+{
+ struct backref_ctx *bctx = ctx;
+ struct send_ctx *sctx = bctx->sctx;
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ struct backref_cache_entry *new_entry;
+ struct ulist_iterator uiter;
+ struct ulist_node *node;
+ int ret;
+
+ /*
+ * We're called while holding a transaction handle or while holding
+ * fs_info->commit_root_sem (at iterate_extent_inodes()), so must do a
+ * NOFS allocation.
+ */
+ new_entry = kmalloc(sizeof(struct backref_cache_entry), GFP_NOFS);
+ /* No worries, cache is optional. */
+ if (!new_entry)
+ return;
+
+ new_entry->key = leaf_bytenr >> fs_info->sectorsize_bits;
+ new_entry->num_roots = 0;
+ ULIST_ITER_INIT(&uiter);
+ while ((node = ulist_next(root_ids, &uiter)) != NULL) {
+ const u64 root_id = node->val;
+ struct clone_root *root;
+
+ root = bsearch((void *)(uintptr_t)root_id, sctx->clone_roots,
+ sctx->clone_roots_cnt, sizeof(struct clone_root),
+ __clone_root_cmp_bsearch);
+ if (!root)
+ continue;
+
+ /* Too many roots, just exit, no worries as caching is optional. */
+ if (new_entry->num_roots >= SEND_MAX_BACKREF_CACHE_ROOTS) {
+ kfree(new_entry);
+ return;
+ }
+
+ new_entry->root_ids[new_entry->num_roots] = root_id;
+ new_entry->num_roots++;
+ }
+
+ /*
+ * We may have not added any roots to the new cache entry, which means
+ * none of the roots is part of the list of roots from which we are
+ * allowed to clone. Cache the new entry as it's still useful to avoid
+ * backref walking to determine which roots have a path to the leaf.
+ */
+
+ if (sctx->backref_cache.size >= SEND_MAX_BACKREF_CACHE_SIZE) {
+ struct backref_cache_entry *lru_entry;
+ struct backref_cache_entry *mt_entry;
+
+ lru_entry = list_first_entry(&sctx->backref_cache.lru_list,
+ struct backref_cache_entry, list);
+ mt_entry = mtree_erase(&sctx->backref_cache.entries, lru_entry->key);
+ ASSERT(mt_entry == lru_entry);
+ list_del(&mt_entry->list);
+ kfree(mt_entry);
+ sctx->backref_cache.size--;
+ }
+
+ ret = mtree_insert(&sctx->backref_cache.entries, new_entry->key,
+ new_entry, GFP_NOFS);
+ ASSERT(ret == 0 || ret == -ENOMEM);
+ if (ret) {
+ /* Caching is optional, no worries. */
+ kfree(new_entry);
+ return;
+ }
+
+ list_add_tail(&new_entry->list, &sctx->backref_cache.lru_list);
+
+ /*
+ * We are called from iterate_extent_inodes() while either holding a
+ * transaction handle or holding fs_info->commit_root_sem, so no need
+ * to take any lock here.
+ */
+ if (sctx->backref_cache.size == 0)
+ sctx->backref_cache.last_reloc_trans = fs_info->last_reloc_trans;
+
+ sctx->backref_cache.size++;
+}
+
+static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei,
+ const struct extent_buffer *leaf, void *ctx)
+{
+ const u64 refs = btrfs_extent_refs(leaf, ei);
+ const struct backref_ctx *bctx = ctx;
+ const struct send_ctx *sctx = bctx->sctx;
+
+ if (bytenr == bctx->bytenr) {
+ const u64 flags = btrfs_extent_flags(leaf, ei);
+
+ if (WARN_ON(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
+ return -EUCLEAN;
+
+ /*
+ * If we have only one reference and only the send root as a
+ * clone source - meaning no clone roots were given in the
+ * struct btrfs_ioctl_send_args passed to the send ioctl - then
+ * it's our reference and there's no point in doing backref
+ * walking which is expensive, so exit early.
+ */
+ if (refs == 1 && sctx->clone_roots_cnt == 1)
+ return -ENOENT;
+ }
+
+ /*
+ * Backreference walking (iterate_extent_inodes() below) is currently
+ * too expensive when an extent has a large number of references, both
+ * in time spent and used memory. So for now just fallback to write
+ * operations instead of clone operations when an extent has more than
+ * a certain amount of references.
+ */
+ if (refs > SEND_MAX_EXTENT_REFS)
+ return -ENOENT;
+
+ return 0;
+}
+
+static bool skip_self_data_ref(u64 root, u64 ino, u64 offset, void *ctx)
+{
+ const struct backref_ctx *bctx = ctx;
+
+ if (ino == bctx->cur_objectid &&
+ root == bctx->backref_owner &&
+ offset == bctx->backref_offset)
+ return true;
+
+ return false;
+}
+
/*
* Given an inode, offset and extent item, it finds a good clone for a clone
* instruction. Returns -ENOENT when none could be found. The function makes
@@ -1332,79 +1596,36 @@ static int find_extent_clone(struct send_ctx *sctx,
u64 logical;
u64 disk_byte;
u64 num_bytes;
- u64 extent_item_pos;
- u64 flags = 0;
struct btrfs_file_extent_item *fi;
struct extent_buffer *eb = path->nodes[0];
- struct backref_ctx backref_ctx = {0};
+ struct backref_ctx backref_ctx = { 0 };
+ struct btrfs_backref_walk_ctx backref_walk_ctx = { 0 };
struct clone_root *cur_clone_root;
- struct btrfs_key found_key;
- struct btrfs_path *tmp_path;
- struct btrfs_extent_item *ei;
int compressed;
u32 i;
- tmp_path = alloc_path_for_send();
- if (!tmp_path)
- return -ENOMEM;
+ /*
+ * With fallocate we can get prealloc extents beyond the inode's i_size,
+ * so we don't do anything here because clone operations can not clone
+ * to a range beyond i_size without increasing the i_size of the
+ * destination inode.
+ */
+ if (data_offset >= ino_size)
+ return 0;
- /* We only use this path under the commit sem */
- tmp_path->need_commit_sem = 0;
+ fi = btrfs_item_ptr(eb, path->slots[0], struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(eb, fi);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ return -ENOENT;
- if (data_offset >= ino_size) {
- /*
- * There may be extents that lie behind the file's size.
- * I at least had this in combination with snapshotting while
- * writing large files.
- */
- ret = 0;
- goto out;
- }
+ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+ if (disk_byte == 0)
+ return -ENOENT;
- fi = btrfs_item_ptr(eb, path->slots[0],
- struct btrfs_file_extent_item);
- extent_type = btrfs_file_extent_type(eb, fi);
- if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- ret = -ENOENT;
- goto out;
- }
compressed = btrfs_file_extent_compression(eb, fi);
-
num_bytes = btrfs_file_extent_num_bytes(eb, fi);
- disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
- if (disk_byte == 0) {
- ret = -ENOENT;
- goto out;
- }
logical = disk_byte + btrfs_file_extent_offset(eb, fi);
- down_read(&fs_info->commit_root_sem);
- ret = extent_from_logical(fs_info, disk_byte, tmp_path,
- &found_key, &flags);
- up_read(&fs_info->commit_root_sem);
-
- if (ret < 0)
- goto out;
- if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- ret = -EIO;
- goto out;
- }
-
- ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0],
- struct btrfs_extent_item);
- /*
- * Backreference walking (iterate_extent_inodes() below) is currently
- * too expensive when an extent has a large number of references, both
- * in time spent and used memory. So for now just fallback to write
- * operations instead of clone operations when an extent has more than
- * a certain amount of references.
- */
- if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) {
- ret = -ENOENT;
- goto out;
- }
- btrfs_release_path(tmp_path);
-
/*
* Setup the clone roots.
*/
@@ -1412,37 +1633,59 @@ static int find_extent_clone(struct send_ctx *sctx,
cur_clone_root = sctx->clone_roots + i;
cur_clone_root->ino = (u64)-1;
cur_clone_root->offset = 0;
- cur_clone_root->found_refs = 0;
+ cur_clone_root->num_bytes = 0;
+ cur_clone_root->found_ref = false;
}
backref_ctx.sctx = sctx;
- backref_ctx.found = 0;
backref_ctx.cur_objectid = ino;
backref_ctx.cur_offset = data_offset;
- backref_ctx.found_itself = 0;
- backref_ctx.extent_len = num_bytes;
+ backref_ctx.bytenr = disk_byte;
+ /*
+ * Use the header owner and not the send root's id, because in case of a
+ * snapshot we can have shared subtrees.
+ */
+ backref_ctx.backref_owner = btrfs_header_owner(eb);
+ backref_ctx.backref_offset = data_offset - btrfs_file_extent_offset(eb, fi);
/*
* The last extent of a file may be too large due to page alignment.
* We need to adjust extent_len in this case so that the checks in
- * __iterate_backrefs work.
+ * iterate_backrefs() work.
*/
if (data_offset + num_bytes >= ino_size)
backref_ctx.extent_len = ino_size - data_offset;
+ else
+ backref_ctx.extent_len = num_bytes;
/*
* Now collect all backrefs.
*/
+ backref_walk_ctx.bytenr = disk_byte;
if (compressed == BTRFS_COMPRESS_NONE)
- extent_item_pos = logical - found_key.objectid;
- else
- extent_item_pos = 0;
- ret = iterate_extent_inodes(fs_info, found_key.objectid,
- extent_item_pos, 1, __iterate_backrefs,
- &backref_ctx, false);
+ backref_walk_ctx.extent_item_pos = btrfs_file_extent_offset(eb, fi);
+ backref_walk_ctx.fs_info = fs_info;
+ backref_walk_ctx.cache_lookup = lookup_backref_cache;
+ backref_walk_ctx.cache_store = store_backref_cache;
+ backref_walk_ctx.indirect_ref_iterator = iterate_backrefs;
+ backref_walk_ctx.check_extent_item = check_extent_item;
+ backref_walk_ctx.user_ctx = &backref_ctx;
+
+ /*
+ * If have a single clone root, then it's the send root and we can tell
+ * the backref walking code to skip our own backref and not resolve it,
+ * since we can not use it for cloning - the source and destination
+ * ranges can't overlap and in case the leaf is shared through a subtree
+ * due to snapshots, we can't use those other roots since they are not
+ * in the list of clone roots.
+ */
+ if (sctx->clone_roots_cnt == 1)
+ backref_walk_ctx.skip_data_ref = skip_self_data_ref;
+ ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs,
+ &backref_ctx);
if (ret < 0)
- goto out;
+ return ret;
down_read(&fs_info->commit_root_sem);
if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
@@ -1459,37 +1702,42 @@ static int find_extent_clone(struct send_ctx *sctx,
* was already reallocated after the relocation.
*/
up_read(&fs_info->commit_root_sem);
- ret = -ENOENT;
- goto out;
+ return -ENOENT;
}
up_read(&fs_info->commit_root_sem);
- if (!backref_ctx.found_itself) {
- /* found a bug in backref code? */
- ret = -EIO;
- btrfs_err(fs_info,
- "did not find backref in send_root. inode=%llu, offset=%llu, disk_byte=%llu found extent=%llu",
- ino, data_offset, disk_byte, found_key.objectid);
- goto out;
- }
-
btrfs_debug(fs_info,
"find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
data_offset, ino, num_bytes, logical);
- if (!backref_ctx.found)
+ if (!backref_ctx.found) {
btrfs_debug(fs_info, "no clones found");
+ return -ENOENT;
+ }
cur_clone_root = NULL;
for (i = 0; i < sctx->clone_roots_cnt; i++) {
- if (sctx->clone_roots[i].found_refs) {
- if (!cur_clone_root)
- cur_clone_root = sctx->clone_roots + i;
- else if (sctx->clone_roots[i].root == sctx->send_root)
- /* prefer clones from send_root over others */
- cur_clone_root = sctx->clone_roots + i;
- }
+ struct clone_root *clone_root = &sctx->clone_roots[i];
+
+ if (!clone_root->found_ref)
+ continue;
+
+ /*
+ * Choose the root from which we can clone more bytes, to
+ * minimize write operations and therefore have more extent
+ * sharing at the destination (the same as in the source).
+ */
+ if (!cur_clone_root ||
+ clone_root->num_bytes > cur_clone_root->num_bytes) {
+ cur_clone_root = clone_root;
+ /*
+ * We found an optimal clone candidate (any inode from
+ * any root is fine), so we're done.
+ */
+ if (clone_root->num_bytes >= backref_ctx.extent_len)
+ break;
+ }
}
if (cur_clone_root) {
@@ -1499,8 +1747,6 @@ static int find_extent_clone(struct send_ctx *sctx,
ret = -ENOENT;
}
-out:
- btrfs_free_path(tmp_path);
return ret;
}
@@ -1580,13 +1826,17 @@ static int gen_unique_name(struct send_ctx *sctx,
return -ENOMEM;
while (1) {
+ struct fscrypt_str tmp_name;
+
len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
ino, gen, idx);
ASSERT(len < sizeof(tmp));
+ tmp_name.name = tmp;
+ tmp_name.len = strlen(tmp);
di = btrfs_lookup_dir_item(NULL, sctx->send_root,
path, BTRFS_FIRST_FREE_OBJECTID,
- tmp, strlen(tmp), 0);
+ &tmp_name, 0);
btrfs_release_path(path);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
@@ -1606,7 +1856,7 @@ static int gen_unique_name(struct send_ctx *sctx,
di = btrfs_lookup_dir_item(NULL, sctx->parent_root,
path, BTRFS_FIRST_FREE_OBJECTID,
- tmp, strlen(tmp), 0);
+ &tmp_name, 0);
btrfs_release_path(path);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
@@ -1643,21 +1893,22 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
int right_ret;
u64 left_gen;
u64 right_gen;
+ struct btrfs_inode_info info;
- ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
- NULL, NULL, NULL);
+ ret = get_inode_info(sctx->send_root, ino, &info);
if (ret < 0 && ret != -ENOENT)
goto out;
- left_ret = ret;
+ left_ret = (info.nlink == 0) ? -ENOENT : ret;
+ left_gen = info.gen;
if (!sctx->parent_root) {
right_ret = -ENOENT;
} else {
- ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
- NULL, NULL, NULL, NULL, NULL);
+ ret = get_inode_info(sctx->parent_root, ino, &info);
if (ret < 0 && ret != -ENOENT)
goto out;
- right_ret = ret;
+ right_ret = (info.nlink == 0) ? -ENOENT : ret;
+ right_gen = info.gen;
}
if (!left_ret && !right_ret) {
@@ -1735,13 +1986,13 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
struct btrfs_dir_item *di;
struct btrfs_key key;
struct btrfs_path *path;
+ struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len);
path = alloc_path_for_send();
if (!path)
return -ENOMEM;
- di = btrfs_lookup_dir_item(NULL, root, path,
- dir, name, name_len, 0);
+ di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0);
if (IS_ERR_OR_NULL(di)) {
ret = di ? PTR_ERR(di) : -ENOENT;
goto out;
@@ -1816,8 +2067,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
btrfs_release_path(path);
if (dir_gen) {
- ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL,
- NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(root, parent_dir, dir_gen);
if (ret < 0)
goto out;
}
@@ -1874,6 +2124,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
int ret = 0;
u64 gen;
u64 other_inode = 0;
+ struct btrfs_inode_info info;
if (!sctx->parent_root)
goto out;
@@ -1888,8 +2139,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
* and we can just unlink this entry.
*/
if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) {
- ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL,
- NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->parent_root, dir, &gen);
if (ret < 0 && ret != -ENOENT)
goto out;
if (ret) {
@@ -1916,13 +2166,14 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
*/
if (other_inode > sctx->send_progress ||
is_waiting_for_move(sctx, other_inode)) {
- ret = get_inode_info(sctx->parent_root, other_inode, NULL,
- who_gen, who_mode, NULL, NULL, NULL, NULL);
+ ret = get_inode_info(sctx->parent_root, other_inode, &info);
if (ret < 0)
goto out;
ret = 1;
*who_ino = other_inode;
+ *who_gen = info.gen;
+ *who_mode = info.mode;
} else {
ret = 0;
}
@@ -1955,8 +2206,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
goto out;
if (dir != BTRFS_FIRST_FREE_OBJECTID) {
- ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL,
- NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->send_root, dir, &gen);
if (ret < 0 && ret != -ENOENT)
goto out;
if (ret) {
@@ -1978,8 +2228,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
goto out;
}
- ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
- NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->send_root, ow_inode, &gen);
if (ret < 0)
goto out;
@@ -2645,6 +2894,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
int ret = 0;
struct fs_path *p;
int cmd;
+ struct btrfs_inode_info info;
u64 gen;
u64 mode;
u64 rdev;
@@ -2656,10 +2906,12 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
return -ENOMEM;
if (ino != sctx->cur_ino) {
- ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
- NULL, NULL, &rdev, NULL);
+ ret = get_inode_info(sctx->send_root, ino, &info);
if (ret < 0)
goto out;
+ gen = info.gen;
+ mode = info.mode;
+ rdev = info.rdev;
} else {
gen = sctx->cur_inode_gen;
mode = sctx->cur_inode_mode;
@@ -3359,8 +3611,7 @@ finish:
/*
* The parent inode might have been deleted in the send snapshot
*/
- ret = get_inode_info(sctx->send_root, cur->dir, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL);
+ ret = get_inode_info(sctx->send_root, cur->dir, NULL);
if (ret == -ENOENT) {
ret = 0;
continue;
@@ -3534,12 +3785,10 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
goto out;
}
- ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL,
- &left_gen, NULL, NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->parent_root, di_key.objectid, &left_gen);
if (ret < 0)
goto out;
- ret = get_inode_info(sctx->send_root, di_key.objectid, NULL,
- &right_gen, NULL, NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->send_root, di_key.objectid, &right_gen);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
@@ -3669,8 +3918,7 @@ static int is_ancestor(struct btrfs_root *root,
cur_offset = item_size;
}
- ret = get_inode_info(root, parent, NULL, &parent_gen,
- NULL, NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(root, parent, &parent_gen);
if (ret < 0)
goto out;
ret = check_ino_in_path(root, ino1, ino1_gen,
@@ -3760,9 +4008,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
memcmp(path_before->start, path_after->start, len1))) {
u64 parent_ino_gen;
- ret = get_inode_info(sctx->parent_root, ino, NULL,
- &parent_ino_gen, NULL, NULL, NULL,
- NULL, NULL);
+ ret = get_inode_gen(sctx->parent_root, ino, &parent_ino_gen);
if (ret < 0)
goto out;
if (ino_gen == parent_ino_gen) {
@@ -4441,8 +4687,7 @@ static int record_new_ref_if_needed(int num, u64 dir, int index,
struct recorded_ref *ref;
u64 dir_gen;
- ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL,
- NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->send_root, dir, &dir_gen);
if (ret < 0)
goto out;
@@ -4472,8 +4717,7 @@ static int record_deleted_ref_if_needed(int num, u64 dir, int index,
struct recorded_ref *ref;
u64 dir_gen;
- ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL,
- NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->parent_root, dir, &dir_gen);
if (ret < 0)
goto out;
@@ -4886,6 +5130,84 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
return ret;
}
+static int send_verity(struct send_ctx *sctx, struct fs_path *path,
+ struct fsverity_descriptor *desc)
+{
+ int ret;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+ TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM,
+ le8_to_cpu(desc->hash_algorithm));
+ TLV_PUT_U32(sctx, BTRFS_SEND_A_VERITY_BLOCK_SIZE,
+ 1U << le8_to_cpu(desc->log_blocksize));
+ TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SALT_DATA, desc->salt,
+ le8_to_cpu(desc->salt_size));
+ TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SIG_DATA, desc->signature,
+ le32_to_cpu(desc->sig_size));
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ return ret;
+}
+
+static int process_verity(struct send_ctx *sctx)
+{
+ int ret = 0;
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ struct inode *inode;
+ struct fs_path *p;
+
+ inode = btrfs_iget(fs_info->sb, sctx->cur_ino, sctx->send_root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ ret = btrfs_get_verity_descriptor(inode, NULL, 0);
+ if (ret < 0)
+ goto iput;
+
+ if (ret > FS_VERITY_MAX_DESCRIPTOR_SIZE) {
+ ret = -EMSGSIZE;
+ goto iput;
+ }
+ if (!sctx->verity_descriptor) {
+ sctx->verity_descriptor = kvmalloc(FS_VERITY_MAX_DESCRIPTOR_SIZE,
+ GFP_KERNEL);
+ if (!sctx->verity_descriptor) {
+ ret = -ENOMEM;
+ goto iput;
+ }
+ }
+
+ ret = btrfs_get_verity_descriptor(inode, sctx->verity_descriptor, ret);
+ if (ret < 0)
+ goto iput;
+
+ p = fs_path_alloc();
+ if (!p) {
+ ret = -ENOMEM;
+ goto iput;
+ }
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto free_path;
+
+ ret = send_verity(sctx, p, sctx->verity_descriptor);
+ if (ret < 0)
+ goto free_path;
+
+free_path:
+ fs_path_free(p);
+iput:
+ iput(inode);
+ return ret;
+}
+
static inline u64 max_send_read_size(const struct send_ctx *sctx)
{
return sctx->send_max_size - SZ_16K;
@@ -5056,8 +5378,7 @@ static int send_clone(struct send_ctx *sctx,
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
if (clone_root->root == sctx->send_root) {
- ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
- &gen, NULL, NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen);
if (ret < 0)
goto out;
ret = get_cur_path(sctx, clone_root->ino, gen, p);
@@ -5536,6 +5857,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
struct btrfs_path *path;
struct btrfs_key key;
int ret;
+ struct btrfs_inode_info info;
u64 clone_src_i_size = 0;
/*
@@ -5565,12 +5887,11 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
* There are inodes that have extents that lie behind its i_size. Don't
* accept clones from these extents.
*/
- ret = __get_inode_info(clone_root->root, path, clone_root->ino,
- &clone_src_i_size, NULL, NULL, NULL, NULL, NULL,
- NULL);
+ ret = get_inode_info(clone_root->root, clone_root->ino, &info);
btrfs_release_path(path);
if (ret < 0)
goto out;
+ clone_src_i_size = info.size;
/*
* We can't send a clone operation for the entire range if we find
@@ -5615,6 +5936,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
u64 ext_len;
u64 clone_len;
u64 clone_data_offset;
+ bool crossed_src_i_size = false;
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(clone_root->root, path);
@@ -5672,8 +5994,10 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
if (key.offset >= clone_src_i_size)
break;
- if (key.offset + ext_len > clone_src_i_size)
+ if (key.offset + ext_len > clone_src_i_size) {
ext_len = clone_src_i_size - key.offset;
+ crossed_src_i_size = true;
+ }
clone_data_offset = btrfs_file_extent_offset(leaf, ei);
if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) {
@@ -5734,6 +6058,25 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
ret = send_clone(sctx, offset, clone_len,
clone_root);
}
+ } else if (crossed_src_i_size && clone_len < len) {
+ /*
+ * If we are at i_size of the clone source inode and we
+ * can not clone from it, terminate the loop. This is
+ * to avoid sending two write operations, one with a
+ * length matching clone_len and the final one after
+ * this loop with a length of len - clone_len.
+ *
+ * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED
+ * was passed to the send ioctl), this helps avoid
+ * sending an encoded write for an offset that is not
+ * sector size aligned, in case the i_size of the source
+ * inode is not sector size aligned. That will make the
+ * receiver fallback to decompression of the data and
+ * writing it using regular buffered IO, therefore while
+ * not incorrect, it's not optimal due decompression and
+ * possible re-compression at the receiver.
+ */
+ break;
} else {
ret = send_extent_data(sctx, dst_path, offset,
clone_len);
@@ -6259,6 +6602,7 @@ out:
static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
{
int ret = 0;
+ struct btrfs_inode_info info;
u64 left_mode;
u64 left_uid;
u64 left_gid;
@@ -6301,11 +6645,13 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
goto out;
if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
goto out;
-
- ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
- &left_mode, &left_uid, &left_gid, NULL, &left_fileattr);
+ ret = get_inode_info(sctx->send_root, sctx->cur_ino, &info);
if (ret < 0)
goto out;
+ left_mode = info.mode;
+ left_uid = info.uid;
+ left_gid = info.gid;
+ left_fileattr = info.fileattr;
if (!sctx->parent_root || sctx->cur_inode_new) {
need_chown = 1;
@@ -6316,11 +6662,14 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
} else {
u64 old_size;
- ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
- &old_size, NULL, &right_mode, &right_uid,
- &right_gid, NULL, &right_fileattr);
+ ret = get_inode_info(sctx->parent_root, sctx->cur_ino, &info);
if (ret < 0)
goto out;
+ old_size = info.size;
+ right_mode = info.mode;
+ right_uid = info.uid;
+ right_gid = info.gid;
+ right_fileattr = info.fileattr;
if (left_uid != right_uid || left_gid != right_gid)
need_chown = 1;
@@ -6378,6 +6727,13 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
goto out;
}
+ if (proto_cmd_ok(sctx, BTRFS_SEND_C_ENABLE_VERITY)
+ && sctx->cur_inode_needs_verity) {
+ ret = process_verity(sctx);
+ if (ret < 0)
+ goto out;
+ }
+
ret = send_capabilities(sctx);
if (ret < 0)
goto out;
@@ -6407,86 +6763,6 @@ out:
return ret;
}
-struct parent_paths_ctx {
- struct list_head *refs;
- struct send_ctx *sctx;
-};
-
-static int record_parent_ref(int num, u64 dir, int index, struct fs_path *name,
- void *ctx)
-{
- struct parent_paths_ctx *ppctx = ctx;
-
- /*
- * Pass 0 as the generation for the directory, we don't care about it
- * here as we have no new references to add, we just want to delete all
- * references for an inode.
- */
- return record_ref_in_tree(&ppctx->sctx->rbtree_deleted_refs, ppctx->refs,
- name, dir, 0, ppctx->sctx);
-}
-
-/*
- * Issue unlink operations for all paths of the current inode found in the
- * parent snapshot.
- */
-static int btrfs_unlink_all_paths(struct send_ctx *sctx)
-{
- LIST_HEAD(deleted_refs);
- struct btrfs_path *path;
- struct btrfs_root *root = sctx->parent_root;
- struct btrfs_key key;
- struct btrfs_key found_key;
- struct parent_paths_ctx ctx;
- int iter_ret = 0;
- int ret;
-
- path = alloc_path_for_send();
- if (!path)
- return -ENOMEM;
-
- key.objectid = sctx->cur_ino;
- key.type = BTRFS_INODE_REF_KEY;
- key.offset = 0;
-
- ctx.refs = &deleted_refs;
- ctx.sctx = sctx;
-
- btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
- if (found_key.objectid != key.objectid)
- break;
- if (found_key.type != key.type &&
- found_key.type != BTRFS_INODE_EXTREF_KEY)
- break;
-
- ret = iterate_inode_ref(root, path, &found_key, 1,
- record_parent_ref, &ctx);
- if (ret < 0)
- goto out;
- }
- /* Catch error found during iteration */
- if (iter_ret < 0) {
- ret = iter_ret;
- goto out;
- }
-
- while (!list_empty(&deleted_refs)) {
- struct recorded_ref *ref;
-
- ref = list_first_entry(&deleted_refs, struct recorded_ref, list);
- ret = send_unlink(sctx, ref->full_path);
- if (ret < 0)
- goto out;
- recorded_ref_free(ref);
- }
- ret = 0;
-out:
- btrfs_free_path(path);
- if (ret)
- __free_recorded_refs(&deleted_refs);
- return ret;
-}
-
static void close_current_inode(struct send_ctx *sctx)
{
u64 i_size;
@@ -6577,25 +6853,36 @@ static int changed_inode(struct send_ctx *sctx,
* file descriptor against it or turning a RO snapshot into RW mode,
* keep an open file descriptor against a file, delete it and then
* turn the snapshot back to RO mode before using it for a send
- * operation. So if we find such cases, ignore the inode and all its
- * items completely if it's a new inode, or if it's a changed inode
- * make sure all its previous paths (from the parent snapshot) are all
- * unlinked and all other the inode items are ignored.
+ * operation. The former is what the receiver operation does.
+ * Therefore, if we want to send these snapshots soon after they're
+ * received, we need to handle orphan inodes as well. Moreover, orphans
+ * can appear not only in the send snapshot but also in the parent
+ * snapshot. Here are several cases:
+ *
+ * Case 1: BTRFS_COMPARE_TREE_NEW
+ * | send snapshot | action
+ * --------------------------------
+ * nlink | 0 | ignore
+ *
+ * Case 2: BTRFS_COMPARE_TREE_DELETED
+ * | parent snapshot | action
+ * ----------------------------------
+ * nlink | 0 | as usual
+ * Note: No unlinks will be sent because there're no paths for it.
+ *
+ * Case 3: BTRFS_COMPARE_TREE_CHANGED
+ * | | parent snapshot | send snapshot | action
+ * -----------------------------------------------------------------------
+ * subcase 1 | nlink | 0 | 0 | ignore
+ * subcase 2 | nlink | >0 | 0 | new_gen(deletion)
+ * subcase 3 | nlink | 0 | >0 | new_gen(creation)
+ *
*/
- if (result == BTRFS_COMPARE_TREE_NEW ||
- result == BTRFS_COMPARE_TREE_CHANGED) {
- u32 nlinks;
-
- nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii);
- if (nlinks == 0) {
+ if (result == BTRFS_COMPARE_TREE_NEW) {
+ if (btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii) == 0) {
sctx->ignore_cur_inode = true;
- if (result == BTRFS_COMPARE_TREE_CHANGED)
- ret = btrfs_unlink_all_paths(sctx);
goto out;
}
- }
-
- if (result == BTRFS_COMPARE_TREE_NEW) {
sctx->cur_inode_gen = left_gen;
sctx->cur_inode_new = true;
sctx->cur_inode_deleted = false;
@@ -6616,6 +6903,16 @@ static int changed_inode(struct send_ctx *sctx,
sctx->cur_inode_mode = btrfs_inode_mode(
sctx->right_path->nodes[0], right_ii);
} else if (result == BTRFS_COMPARE_TREE_CHANGED) {
+ u32 new_nlinks, old_nlinks;
+
+ new_nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii);
+ old_nlinks = btrfs_inode_nlink(sctx->right_path->nodes[0], right_ii);
+ if (new_nlinks == 0 && old_nlinks == 0) {
+ sctx->ignore_cur_inode = true;
+ goto out;
+ } else if (new_nlinks == 0 || old_nlinks == 0) {
+ sctx->cur_inode_new_gen = 1;
+ }
/*
* We need to do some special handling in case the inode was
* reported as changed with a changed generation number. This
@@ -6627,53 +6924,61 @@ static int changed_inode(struct send_ctx *sctx,
/*
* First, process the inode as if it was deleted.
*/
- sctx->cur_inode_gen = right_gen;
- sctx->cur_inode_new = false;
- sctx->cur_inode_deleted = true;
- sctx->cur_inode_size = btrfs_inode_size(
- sctx->right_path->nodes[0], right_ii);
- sctx->cur_inode_mode = btrfs_inode_mode(
- sctx->right_path->nodes[0], right_ii);
- ret = process_all_refs(sctx,
- BTRFS_COMPARE_TREE_DELETED);
- if (ret < 0)
- goto out;
+ if (old_nlinks > 0) {
+ sctx->cur_inode_gen = right_gen;
+ sctx->cur_inode_new = false;
+ sctx->cur_inode_deleted = true;
+ sctx->cur_inode_size = btrfs_inode_size(
+ sctx->right_path->nodes[0], right_ii);
+ sctx->cur_inode_mode = btrfs_inode_mode(
+ sctx->right_path->nodes[0], right_ii);
+ ret = process_all_refs(sctx,
+ BTRFS_COMPARE_TREE_DELETED);
+ if (ret < 0)
+ goto out;
+ }
/*
* Now process the inode as if it was new.
*/
- sctx->cur_inode_gen = left_gen;
- sctx->cur_inode_new = true;
- sctx->cur_inode_deleted = false;
- sctx->cur_inode_size = btrfs_inode_size(
- sctx->left_path->nodes[0], left_ii);
- sctx->cur_inode_mode = btrfs_inode_mode(
- sctx->left_path->nodes[0], left_ii);
- sctx->cur_inode_rdev = btrfs_inode_rdev(
- sctx->left_path->nodes[0], left_ii);
- ret = send_create_inode_if_needed(sctx);
- if (ret < 0)
- goto out;
+ if (new_nlinks > 0) {
+ sctx->cur_inode_gen = left_gen;
+ sctx->cur_inode_new = true;
+ sctx->cur_inode_deleted = false;
+ sctx->cur_inode_size = btrfs_inode_size(
+ sctx->left_path->nodes[0],
+ left_ii);
+ sctx->cur_inode_mode = btrfs_inode_mode(
+ sctx->left_path->nodes[0],
+ left_ii);
+ sctx->cur_inode_rdev = btrfs_inode_rdev(
+ sctx->left_path->nodes[0],
+ left_ii);
+ ret = send_create_inode_if_needed(sctx);
+ if (ret < 0)
+ goto out;
- ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
- if (ret < 0)
- goto out;
- /*
- * Advance send_progress now as we did not get into
- * process_recorded_refs_if_needed in the new_gen case.
- */
- sctx->send_progress = sctx->cur_ino + 1;
+ ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
+ if (ret < 0)
+ goto out;
+ /*
+ * Advance send_progress now as we did not get
+ * into process_recorded_refs_if_needed in the
+ * new_gen case.
+ */
+ sctx->send_progress = sctx->cur_ino + 1;
- /*
- * Now process all extents and xattrs of the inode as if
- * they were all new.
- */
- ret = process_all_extents(sctx);
- if (ret < 0)
- goto out;
- ret = process_all_new_xattrs(sctx);
- if (ret < 0)
- goto out;
+ /*
+ * Now process all extents and xattrs of the
+ * inode as if they were all new.
+ */
+ ret = process_all_extents(sctx);
+ if (ret < 0)
+ goto out;
+ ret = process_all_new_xattrs(sctx);
+ if (ret < 0)
+ goto out;
+ }
} else {
sctx->cur_inode_gen = left_gen;
sctx->cur_inode_new = false;
@@ -6785,18 +7090,27 @@ static int changed_extent(struct send_ctx *sctx,
return ret;
}
+static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result)
+{
+ int ret = 0;
+
+ if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
+ if (result == BTRFS_COMPARE_TREE_NEW)
+ sctx->cur_inode_needs_verity = true;
+ }
+ return ret;
+}
+
static int dir_changed(struct send_ctx *sctx, u64 dir)
{
u64 orig_gen, new_gen;
int ret;
- ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL,
- NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->send_root, dir, &new_gen);
if (ret)
return ret;
- ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL,
- NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->parent_root, dir, &orig_gen);
if (ret)
return ret;
@@ -6939,6 +7253,9 @@ static int changed_cb(struct btrfs_path *left_path,
ret = changed_xattr(sctx, result);
else if (key->type == BTRFS_EXTENT_DATA_KEY)
ret = changed_extent(sctx, result);
+ else if (key->type == BTRFS_VERITY_DESC_ITEM_KEY &&
+ key->offset == 0)
+ ret = changed_verity(sctx, result);
}
out:
@@ -7780,6 +8097,9 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL);
INIT_LIST_HEAD(&sctx->name_cache_list);
+ INIT_LIST_HEAD(&sctx->backref_cache.lru_list);
+ mt_init(&sctx->backref_cache.entries);
+
sctx->flags = arg->flags;
if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
@@ -7818,7 +8138,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
if (sctx->proto >= 2) {
u32 send_buf_num_pages;
- sctx->send_max_size = ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE);
+ sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V2;
sctx->send_buf = vmalloc(sctx->send_max_size);
if (!sctx->send_buf) {
ret = -ENOMEM;
@@ -8036,11 +8356,14 @@ out:
kvfree(sctx->clone_roots);
kfree(sctx->send_buf_pages);
kvfree(sctx->send_buf);
+ kvfree(sctx->verity_descriptor);
name_cache_free(sctx);
close_current_inode(sctx);
+ empty_backref_cache(sctx);
+
kfree(sctx);
}
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 4bb4e6a638cb..4f5509cb1803 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -10,13 +10,20 @@
#include <linux/types.h>
#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
+/* Conditional support for the upcoming protocol version. */
+#ifdef CONFIG_BTRFS_DEBUG
+#define BTRFS_SEND_STREAM_VERSION 3
+#else
#define BTRFS_SEND_STREAM_VERSION 2
+#endif
/*
- * In send stream v1, no command is larger than 64K. In send stream v2, no limit
- * should be assumed.
+ * In send stream v1, no command is larger than 64K. In send stream v2, no
+ * limit should be assumed, the buffer size is set to be a header with
+ * compressed extent size.
*/
#define BTRFS_SEND_BUF_SIZE_V1 SZ_64K
+#define BTRFS_SEND_BUF_SIZE_V2 ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE)
struct inode;
struct btrfs_ioctl_send_args;
@@ -92,8 +99,11 @@ enum btrfs_send_cmd {
BTRFS_SEND_C_ENCODED_WRITE = 25,
BTRFS_SEND_C_MAX_V2 = 25,
+ /* Version 3 */
+ BTRFS_SEND_C_ENABLE_VERITY = 26,
+ BTRFS_SEND_C_MAX_V3 = 26,
/* End */
- BTRFS_SEND_C_MAX = 25,
+ BTRFS_SEND_C_MAX = 26,
};
/* attributes in send stream */
@@ -160,8 +170,14 @@ enum {
BTRFS_SEND_A_ENCRYPTION = 31,
BTRFS_SEND_A_MAX_V2 = 31,
- /* End */
- BTRFS_SEND_A_MAX = 31,
+ /* Version 3 */
+ BTRFS_SEND_A_VERITY_ALGORITHM = 32,
+ BTRFS_SEND_A_VERITY_BLOCK_SIZE = 33,
+ BTRFS_SEND_A_VERITY_SALT_DATA = 34,
+ BTRFS_SEND_A_VERITY_SIG_DATA = 35,
+ BTRFS_SEND_A_MAX_V3 = 35,
+
+ __BTRFS_SEND_A_MAX = 35,
};
long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg);
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 435559ba94fa..d28ee4e36f3d 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -10,6 +10,9 @@
#include "transaction.h"
#include "block-group.h"
#include "zoned.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
/*
* HOW DOES SPACE RESERVATION WORK
@@ -293,32 +296,36 @@ out:
return ret;
}
-void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
- u64 total_bytes, u64 bytes_used,
- u64 bytes_readonly, u64 bytes_zone_unusable,
- bool active, struct btrfs_space_info **space_info)
+void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
+ struct btrfs_block_group *block_group)
{
struct btrfs_space_info *found;
- int factor;
+ int factor, index;
- factor = btrfs_bg_type_to_factor(flags);
+ factor = btrfs_bg_type_to_factor(block_group->flags);
- found = btrfs_find_space_info(info, flags);
+ found = btrfs_find_space_info(info, block_group->flags);
ASSERT(found);
spin_lock(&found->lock);
- found->total_bytes += total_bytes;
- if (active)
- found->active_total_bytes += total_bytes;
- found->disk_total += total_bytes * factor;
- found->bytes_used += bytes_used;
- found->disk_used += bytes_used * factor;
- found->bytes_readonly += bytes_readonly;
- found->bytes_zone_unusable += bytes_zone_unusable;
- if (total_bytes > 0)
+ found->total_bytes += block_group->length;
+ if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
+ found->active_total_bytes += block_group->length;
+ found->disk_total += block_group->length * factor;
+ found->bytes_used += block_group->used;
+ found->disk_used += block_group->used * factor;
+ found->bytes_readonly += block_group->bytes_super;
+ found->bytes_zone_unusable += block_group->zone_unusable;
+ if (block_group->length > 0)
found->full = 0;
btrfs_try_granting_tickets(info, found);
spin_unlock(&found->lock);
- *space_info = found;
+
+ block_group->space_info = found;
+
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
+ down_write(&found->groups_sem);
+ list_add_tail(&block_group->list, &found->block_groups[index]);
+ up_write(&found->groups_sem);
}
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
@@ -472,28 +479,47 @@ do { \
spin_unlock(&__rsv->lock); \
} while (0)
+static const char *space_info_flag_to_str(const struct btrfs_space_info *space_info)
+{
+ switch (space_info->flags) {
+ case BTRFS_BLOCK_GROUP_SYSTEM:
+ return "SYSTEM";
+ case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA:
+ return "DATA+METADATA";
+ case BTRFS_BLOCK_GROUP_DATA:
+ return "DATA";
+ case BTRFS_BLOCK_GROUP_METADATA:
+ return "METADATA";
+ default:
+ return "UNKNOWN";
+ }
+}
+
+static void dump_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+ DUMP_BLOCK_RSV(fs_info, global_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
+}
+
static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info)
{
+ const char *flag_str = space_info_flag_to_str(info);
lockdep_assert_held(&info->lock);
/* The free space could be negative in case of overcommit */
- btrfs_info(fs_info, "space_info %llu has %lld free, is %sfull",
- info->flags,
+ btrfs_info(fs_info, "space_info %s has %lld free, is %sfull",
+ flag_str,
(s64)(info->total_bytes - btrfs_space_info_used(info, true)),
info->full ? "" : "not ");
btrfs_info(fs_info,
- "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
+"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
info->total_bytes, info->bytes_used, info->bytes_pinned,
info->bytes_reserved, info->bytes_may_use,
info->bytes_readonly, info->bytes_zone_unusable);
-
- DUMP_BLOCK_RSV(fs_info, global_block_rsv);
- DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
- DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
- DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
- DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
-
}
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
@@ -505,6 +531,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
spin_lock(&info->lock);
__btrfs_dump_space_info(fs_info, info);
+ dump_global_block_rsv(fs_info);
spin_unlock(&info->lock);
if (!dump_block_groups)
@@ -832,7 +859,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
u64 thresh;
u64 used;
- thresh = div_factor_fine(total, 90);
+ thresh = mult_perc(total, 90);
lockdep_assert_held(&space_info->lock);
@@ -950,7 +977,7 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
return false;
spin_lock(&global_rsv->lock);
- min_bytes = div_factor(global_rsv->size, 1);
+ min_bytes = mult_perc(global_rsv->size, 10);
if (global_rsv->reserved < min_bytes + ticket->bytes) {
spin_unlock(&global_rsv->lock);
return false;
@@ -1466,8 +1493,8 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
spin_unlock(&space_info->lock);
}
-/**
- * Do the appropriate flushing and waiting for a ticket
+/*
+ * Do the appropriate flushing and waiting for a ticket.
*
* @fs_info: the filesystem
* @space_info: space info for the reservation
@@ -1559,8 +1586,18 @@ static inline bool can_steal(enum btrfs_reserve_flush_enum flush)
flush == BTRFS_RESERVE_FLUSH_EVICT);
}
-/**
- * Try to reserve bytes from the block_rsv's space
+/*
+ * NO_FLUSH and FLUSH_EMERGENCY don't want to create a ticket, they just want to
+ * fail as quickly as possible.
+ */
+static inline bool can_ticket(enum btrfs_reserve_flush_enum flush)
+{
+ return (flush != BTRFS_RESERVE_NO_FLUSH &&
+ flush != BTRFS_RESERVE_FLUSH_EMERGENCY);
+}
+
+/*
+ * Try to reserve bytes from the block_rsv's space.
*
* @fs_info: the filesystem
* @space_info: space info we want to allocate from
@@ -1621,13 +1658,28 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
}
/*
+ * Things are dire, we need to make a reservation so we don't abort. We
+ * will let this reservation go through as long as we have actual space
+ * left to allocate for the block.
+ */
+ if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) {
+ used = btrfs_space_info_used(space_info, false);
+ if (used + orig_bytes <=
+ writable_total_bytes(fs_info, space_info)) {
+ btrfs_space_info_update_bytes_may_use(fs_info, space_info,
+ orig_bytes);
+ ret = 0;
+ }
+ }
+
+ /*
* If we couldn't make a reservation then setup our reservation ticket
* and kick the async worker if it's not already running.
*
* If we are a priority flusher then we just need to add our ticket to
* the list and we will do our own flushing further down.
*/
- if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
+ if (ret && can_ticket(flush)) {
ticket.bytes = orig_bytes;
ticket.error = 0;
space_info->reclaim_size += ticket.bytes;
@@ -1662,7 +1714,6 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
&space_info->priority_tickets);
}
} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
- used += orig_bytes;
/*
* We will do the space reservation dance during log replay,
* which means we won't have fs_info->fs_root set, so don't do
@@ -1678,15 +1729,15 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
}
}
spin_unlock(&space_info->lock);
- if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
+ if (!ret || !can_ticket(flush))
return ret;
return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns,
orig_bytes, flush);
}
-/**
- * Trye to reserve metadata bytes from the block_rsv's space
+/*
+ * Try to reserve metadata bytes from the block_rsv's space.
*
* @fs_info: the filesystem
* @block_rsv: block_rsv we're allocating for
@@ -1720,8 +1771,8 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
return ret;
}
-/**
- * Try to reserve data bytes for an allocation
+/*
+ * Try to reserve data bytes for an allocation.
*
* @fs_info: the filesystem
* @bytes: number of bytes we need
@@ -1737,7 +1788,8 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
int ret;
ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
- flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE);
+ flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE ||
+ flush == BTRFS_RESERVE_NO_FLUSH);
ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);
ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
@@ -1749,3 +1801,51 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
}
return ret;
}
+
+/* Dump all the space infos when we abort a transaction due to ENOSPC. */
+__cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *space_info;
+
+ btrfs_info(fs_info, "dumping space info:");
+ list_for_each_entry(space_info, &fs_info->space_info, list) {
+ spin_lock(&space_info->lock);
+ __btrfs_dump_space_info(fs_info, space_info);
+ spin_unlock(&space_info->lock);
+ }
+ dump_global_block_rsv(fs_info);
+}
+
+/*
+ * Account the unused space of all the readonly block group in the space_info.
+ * takes mirrors into account.
+ */
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
+{
+ struct btrfs_block_group *block_group;
+ u64 free_bytes = 0;
+ int factor;
+
+ /* It's df, we don't care if it's racy */
+ if (list_empty(&sinfo->ro_bgs))
+ return 0;
+
+ spin_lock(&sinfo->lock);
+ list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
+ spin_lock(&block_group->lock);
+
+ if (!block_group->ro) {
+ spin_unlock(&block_group->lock);
+ continue;
+ }
+
+ factor = btrfs_bg_type_to_factor(block_group->flags);
+ free_bytes += (block_group->length -
+ block_group->used) * factor;
+
+ spin_unlock(&block_group->lock);
+ }
+ spin_unlock(&sinfo->lock);
+
+ return free_bytes;
+}
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 12fd6147f92d..fc99ea2b0c34 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -5,6 +5,83 @@
#include "volumes.h"
+/*
+ * Different levels for to flush space when doing space reservations.
+ *
+ * The higher the level, the more methods we try to reclaim space.
+ */
+enum btrfs_reserve_flush_enum {
+ /* If we are in the transaction, we can't flush anything.*/
+ BTRFS_RESERVE_NO_FLUSH,
+
+ /*
+ * Flush space by:
+ * - Running delayed inode items
+ * - Allocating a new chunk
+ */
+ BTRFS_RESERVE_FLUSH_LIMIT,
+
+ /*
+ * Flush space by:
+ * - Running delayed inode items
+ * - Running delayed refs
+ * - Running delalloc and waiting for ordered extents
+ * - Allocating a new chunk
+ */
+ BTRFS_RESERVE_FLUSH_EVICT,
+
+ /*
+ * Flush space by above mentioned methods and by:
+ * - Running delayed iputs
+ * - Committing transaction
+ *
+ * Can be interrupted by a fatal signal.
+ */
+ BTRFS_RESERVE_FLUSH_DATA,
+ BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE,
+ BTRFS_RESERVE_FLUSH_ALL,
+
+ /*
+ * Pretty much the same as FLUSH_ALL, but can also steal space from
+ * global rsv.
+ *
+ * Can be interrupted by a fatal signal.
+ */
+ BTRFS_RESERVE_FLUSH_ALL_STEAL,
+
+ /*
+ * This is for btrfs_use_block_rsv only. We have exhausted our block
+ * rsv and our global block rsv. This can happen for things like
+ * delalloc where we are overwriting a lot of extents with a single
+ * extent and didn't reserve enough space. Alternatively it can happen
+ * with delalloc where we reserve 1 extents worth for a large extent but
+ * fragmentation leads to multiple extents being created. This will
+ * give us the reservation in the case of
+ *
+ * if (num_bytes < (space_info->total_bytes -
+ * btrfs_space_info_used(space_info, false))
+ *
+ * Which ignores bytes_may_use. This is potentially dangerous, but our
+ * reservation system is generally pessimistic so is able to absorb this
+ * style of mistake.
+ */
+ BTRFS_RESERVE_FLUSH_EMERGENCY,
+};
+
+enum btrfs_flush_state {
+ FLUSH_DELAYED_ITEMS_NR = 1,
+ FLUSH_DELAYED_ITEMS = 2,
+ FLUSH_DELAYED_REFS_NR = 3,
+ FLUSH_DELAYED_REFS = 4,
+ FLUSH_DELALLOC = 5,
+ FLUSH_DELALLOC_WAIT = 6,
+ FLUSH_DELALLOC_FULL = 7,
+ ALLOC_CHUNK = 8,
+ ALLOC_CHUNK_FORCE = 9,
+ RUN_DELAYED_IPUTS = 10,
+ COMMIT_TRANS = 11,
+};
+
struct btrfs_space_info {
spinlock_t lock;
@@ -123,10 +200,8 @@ DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info");
DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned");
int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
-void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
- u64 total_bytes, u64 bytes_used,
- u64 bytes_readonly, u64 bytes_zone_unusable,
- bool active, struct btrfs_space_info **space_info);
+void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
+ struct btrfs_block_group *block_group);
void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
u64 chunk_size);
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
@@ -159,4 +234,8 @@ static inline void btrfs_space_info_free_bytes_may_use(
}
int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
enum btrfs_reserve_flush_enum flush);
+void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info);
+void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
+
#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 6fc2b77ae5c3..dd46b978ac2c 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
+#include "messages.h"
#include "ctree.h"
#include "subpage.h"
#include "btrfs_inode.h"
@@ -337,7 +338,7 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
*
* Even with 0 returned, the page still need extra check to make sure
* it's really the correct page, as the caller is using
- * find_get_pages_contig(), which can race with page invalidating.
+ * filemap_get_folios_contig(), which can race with page invalidating.
*/
int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f89beac3c665..93f52ee85f6f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -26,6 +26,7 @@
#include <linux/ratelimit.h>
#include <linux/crc32c.h>
#include <linux/btrfs.h>
+#include "messages.h"
#include "delayed-inode.h"
#include "ctree.h"
#include "disk-io.h"
@@ -34,7 +35,7 @@
#include "print-tree.h"
#include "props.h"
#include "xattr.h"
-#include "volumes.h"
+#include "bio.h"
#include "export.h"
#include "compression.h"
#include "rcu-string.h"
@@ -49,6 +50,14 @@
#include "discard.h"
#include "qgroup.h"
#include "raid56.h"
+#include "fs.h"
+#include "accessors.h"
+#include "defrag.h"
+#include "dir-item.h"
+#include "ioctl.h"
+#include "scrub.h"
+#include "verity.h"
+#include "super.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>
@@ -67,326 +76,6 @@ static struct file_system_type btrfs_root_fs_type;
static int btrfs_remount(struct super_block *sb, int *flags, char *data);
-#ifdef CONFIG_PRINTK
-
-#define STATE_STRING_PREFACE ": state "
-#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT)
-
-/*
- * Characters to print to indicate error conditions or uncommon filesystem state.
- * RO is not an error.
- */
-static const char fs_state_chars[] = {
- [BTRFS_FS_STATE_ERROR] = 'E',
- [BTRFS_FS_STATE_REMOUNTING] = 'M',
- [BTRFS_FS_STATE_RO] = 0,
- [BTRFS_FS_STATE_TRANS_ABORTED] = 'A',
- [BTRFS_FS_STATE_DEV_REPLACING] = 'R',
- [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0,
- [BTRFS_FS_STATE_NO_CSUMS] = 'C',
- [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L',
-};
-
-static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf)
-{
- unsigned int bit;
- bool states_printed = false;
- unsigned long fs_state = READ_ONCE(info->fs_state);
- char *curr = buf;
-
- memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE));
- curr += sizeof(STATE_STRING_PREFACE) - 1;
-
- for_each_set_bit(bit, &fs_state, sizeof(fs_state)) {
- WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT);
- if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) {
- *curr++ = fs_state_chars[bit];
- states_printed = true;
- }
- }
-
- /* If no states were printed, reset the buffer */
- if (!states_printed)
- curr = buf;
-
- *curr++ = 0;
-}
-#endif
-
-/*
- * Generally the error codes correspond to their respective errors, but there
- * are a few special cases.
- *
- * EUCLEAN: Any sort of corruption that we encounter. The tree-checker for
- * instance will return EUCLEAN if any of the blocks are corrupted in
- * a way that is problematic. We want to reserve EUCLEAN for these
- * sort of corruptions.
- *
- * EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we
- * need to use EROFS for this case. We will have no idea of the
- * original failure, that will have been reported at the time we tripped
- * over the error. Each subsequent error that doesn't have any context
- * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR.
- */
-const char * __attribute_const__ btrfs_decode_error(int errno)
-{
- char *errstr = "unknown";
-
- switch (errno) {
- case -ENOENT: /* -2 */
- errstr = "No such entry";
- break;
- case -EIO: /* -5 */
- errstr = "IO failure";
- break;
- case -ENOMEM: /* -12*/
- errstr = "Out of memory";
- break;
- case -EEXIST: /* -17 */
- errstr = "Object already exists";
- break;
- case -ENOSPC: /* -28 */
- errstr = "No space left";
- break;
- case -EROFS: /* -30 */
- errstr = "Readonly filesystem";
- break;
- case -EOPNOTSUPP: /* -95 */
- errstr = "Operation not supported";
- break;
- case -EUCLEAN: /* -117 */
- errstr = "Filesystem corrupted";
- break;
- case -EDQUOT: /* -122 */
- errstr = "Quota exceeded";
- break;
- }
-
- return errstr;
-}
-
-/*
- * __btrfs_handle_fs_error decodes expected errors from the caller and
- * invokes the appropriate error response.
- */
-__cold
-void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...)
-{
- struct super_block *sb = fs_info->sb;
-#ifdef CONFIG_PRINTK
- char statestr[STATE_STRING_BUF_LEN];
- const char *errstr;
-#endif
-
- /*
- * Special case: if the error is EROFS, and we're already
- * under SB_RDONLY, then it is safe here.
- */
- if (errno == -EROFS && sb_rdonly(sb))
- return;
-
-#ifdef CONFIG_PRINTK
- errstr = btrfs_decode_error(errno);
- btrfs_state_to_string(fs_info, statestr);
- if (fmt) {
- struct va_format vaf;
- va_list args;
-
- va_start(args, fmt);
- vaf.fmt = fmt;
- vaf.va = &args;
-
- pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n",
- sb->s_id, statestr, function, line, errno, errstr, &vaf);
- va_end(args);
- } else {
- pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n",
- sb->s_id, statestr, function, line, errno, errstr);
- }
-#endif
-
- /*
- * Today we only save the error info to memory. Long term we'll
- * also send it down to the disk
- */
- set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
-
- /* Don't go through full error handling during mount */
- if (!(sb->s_flags & SB_BORN))
- return;
-
- if (sb_rdonly(sb))
- return;
-
- btrfs_discard_stop(fs_info);
-
- /* btrfs handle error by forcing the filesystem readonly */
- btrfs_set_sb_rdonly(sb);
- btrfs_info(fs_info, "forced readonly");
- /*
- * Note that a running device replace operation is not canceled here
- * although there is no way to update the progress. It would add the
- * risk of a deadlock, therefore the canceling is omitted. The only
- * penalty is that some I/O remains active until the procedure
- * completes. The next time when the filesystem is mounted writable
- * again, the device replace operation continues.
- */
-}
-
-#ifdef CONFIG_PRINTK
-static const char * const logtypes[] = {
- "emergency",
- "alert",
- "critical",
- "error",
- "warning",
- "notice",
- "info",
- "debug",
-};
-
-
-/*
- * Use one ratelimit state per log level so that a flood of less important
- * messages doesn't cause more important ones to be dropped.
- */
-static struct ratelimit_state printk_limits[] = {
- RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100),
- RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100),
- RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100),
- RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100),
- RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100),
- RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100),
- RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100),
- RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100),
-};
-
-void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
-{
- char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
- struct va_format vaf;
- va_list args;
- int kern_level;
- const char *type = logtypes[4];
- struct ratelimit_state *ratelimit = &printk_limits[4];
-
- va_start(args, fmt);
-
- while ((kern_level = printk_get_level(fmt)) != 0) {
- size_t size = printk_skip_level(fmt) - fmt;
-
- if (kern_level >= '0' && kern_level <= '7') {
- memcpy(lvl, fmt, size);
- lvl[size] = '\0';
- type = logtypes[kern_level - '0'];
- ratelimit = &printk_limits[kern_level - '0'];
- }
- fmt += size;
- }
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- if (__ratelimit(ratelimit)) {
- if (fs_info) {
- char statestr[STATE_STRING_BUF_LEN];
-
- btrfs_state_to_string(fs_info, statestr);
- _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type,
- fs_info->sb->s_id, statestr, &vaf);
- } else {
- _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf);
- }
- }
-
- va_end(args);
-}
-#endif
-
-#if BITS_PER_LONG == 32
-void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info)
-{
- if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) {
- btrfs_warn(fs_info, "reaching 32bit limit for logical addresses");
- btrfs_warn(fs_info,
-"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT",
- BTRFS_32BIT_MAX_FILE_SIZE >> 40);
- btrfs_warn(fs_info,
- "please consider upgrading to 64bit kernel/hardware");
- }
-}
-
-void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
-{
- if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) {
- btrfs_err(fs_info, "reached 32bit limit for logical addresses");
- btrfs_err(fs_info,
-"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed",
- BTRFS_32BIT_MAX_FILE_SIZE >> 40);
- btrfs_err(fs_info,
- "please consider upgrading to 64bit kernel/hardware");
- }
-}
-#endif
-
-/*
- * We only mark the transaction aborted and then set the file system read-only.
- * This will prevent new transactions from starting or trying to join this
- * one.
- *
- * This means that error recovery at the call site is limited to freeing
- * any local memory allocations and passing the error code up without
- * further cleanup. The transaction should complete as it normally would
- * in the call path but will return -EIO.
- *
- * We'll complete the cleanup in btrfs_end_transaction and
- * btrfs_commit_transaction.
- */
-__cold
-void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
- const char *function,
- unsigned int line, int errno)
-{
- struct btrfs_fs_info *fs_info = trans->fs_info;
-
- WRITE_ONCE(trans->aborted, errno);
- WRITE_ONCE(trans->transaction->aborted, errno);
- /* Wake up anybody who may be waiting on this transaction */
- wake_up(&fs_info->transaction_wait);
- wake_up(&fs_info->transaction_blocked_wait);
- __btrfs_handle_fs_error(fs_info, function, line, errno, NULL);
-}
-/*
- * __btrfs_panic decodes unexpected, fatal errors from the caller,
- * issues an alert, and either panics or BUGs, depending on mount options.
- */
-__cold
-void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...)
-{
- char *s_id = "<unknown>";
- const char *errstr;
- struct va_format vaf = { .fmt = fmt };
- va_list args;
-
- if (fs_info)
- s_id = fs_info->sb->s_id;
-
- va_start(args, fmt);
- vaf.va = &args;
-
- errstr = btrfs_decode_error(errno);
- if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR)))
- panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
- s_id, function, line, &vaf, errno, errstr);
-
- btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)",
- function, line, &vaf, errno, errstr);
- va_end(args);
- /* Caller calls BUG() */
-}
-
static void btrfs_put_super(struct super_block *sb)
{
close_ctree(btrfs_sb(sb));
@@ -626,6 +315,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
int saved_compress_level;
bool saved_compress_force;
int no_compress = 0;
+ const bool remounting = test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state);
if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
@@ -915,12 +605,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
ret = -EINVAL;
goto out;
}
+ btrfs_clear_opt(info->mount_opt, NODISCARD);
break;
case Opt_nodiscard:
btrfs_clear_and_info(info, DISCARD_SYNC,
"turning off discard");
btrfs_clear_and_info(info, DISCARD_ASYNC,
"turning off async discard");
+ btrfs_set_opt(info->mount_opt, NODISCARD);
break;
case Opt_space_cache:
case Opt_space_cache_version:
@@ -1137,10 +829,12 @@ out:
}
if (!ret)
ret = btrfs_check_mountopts_zoned(info);
- if (!ret && btrfs_test_opt(info, SPACE_CACHE))
- btrfs_info(info, "disk space caching is enabled");
- if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE))
- btrfs_info(info, "using free space tree");
+ if (!ret && !remounting) {
+ if (btrfs_test_opt(info, SPACE_CACHE))
+ btrfs_info(info, "disk space caching is enabled");
+ if (btrfs_test_opt(info, FREE_SPACE_TREE))
+ btrfs_info(info, "using free space tree");
+ }
return ret;
}
@@ -1389,6 +1083,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
struct btrfs_dir_item *di;
struct btrfs_path *path;
struct btrfs_key location;
+ struct fscrypt_str name = FSTR_INIT("default", 7);
u64 dir_id;
path = btrfs_alloc_path();
@@ -1401,7 +1096,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
* to mount.
*/
dir_id = btrfs_super_root_dir(fs_info->super_copy);
- di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
+ di = btrfs_lookup_dir_item(NULL, root, path, dir_id, &name, 0);
if (IS_ERR(di)) {
btrfs_free_path(path);
return PTR_ERR(di);
@@ -1502,7 +1197,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
* Exit unless we have some pending changes
* that need to go through commit
*/
- if (fs_info->pending_changes == 0)
+ if (!test_bit(BTRFS_FS_NEED_TRANS_COMMIT,
+ &fs_info->flags))
return 0;
/*
* A non-blocking test if the fs is frozen. We must not
@@ -2009,14 +1705,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (ret)
goto restore;
- /* V1 cache is not supported for subpage mount. */
- if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
- btrfs_warn(fs_info,
- "v1 space cache is not supported for page size %lu with sectorsize %u",
- PAGE_SIZE, fs_info->sectorsize);
- ret = -EINVAL;
+ ret = btrfs_check_features(fs_info, sb);
+ if (ret < 0)
goto restore;
- }
+
btrfs_remount_begin(fs_info, old_opts, *flags);
btrfs_resize_thread_pool(fs_info,
fs_info->thread_pool_size, old_thread_pool_size);
@@ -2550,11 +2242,87 @@ static int btrfs_freeze(struct super_block *sb)
return btrfs_commit_transaction(trans);
}
+static int check_dev_super(struct btrfs_device *dev)
+{
+ struct btrfs_fs_info *fs_info = dev->fs_info;
+ struct btrfs_super_block *sb;
+ u16 csum_type;
+ int ret = 0;
+
+ /* This should be called with fs still frozen. */
+ ASSERT(test_bit(BTRFS_FS_FROZEN, &fs_info->flags));
+
+ /* Missing dev, no need to check. */
+ if (!dev->bdev)
+ return 0;
+
+ /* Only need to check the primary super block. */
+ sb = btrfs_read_dev_one_super(dev->bdev, 0, true);
+ if (IS_ERR(sb))
+ return PTR_ERR(sb);
+
+ /* Verify the checksum. */
+ csum_type = btrfs_super_csum_type(sb);
+ if (csum_type != btrfs_super_csum_type(fs_info->super_copy)) {
+ btrfs_err(fs_info, "csum type changed, has %u expect %u",
+ csum_type, btrfs_super_csum_type(fs_info->super_copy));
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ if (btrfs_check_super_csum(fs_info, sb)) {
+ btrfs_err(fs_info, "csum for on-disk super block no longer matches");
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ /* Btrfs_validate_super() includes fsid check against super->fsid. */
+ ret = btrfs_validate_super(fs_info, sb, 0);
+ if (ret < 0)
+ goto out;
+
+ if (btrfs_super_generation(sb) != fs_info->last_trans_committed) {
+ btrfs_err(fs_info, "transid mismatch, has %llu expect %llu",
+ btrfs_super_generation(sb),
+ fs_info->last_trans_committed);
+ ret = -EUCLEAN;
+ goto out;
+ }
+out:
+ btrfs_release_disk_super(sb);
+ return ret;
+}
+
static int btrfs_unfreeze(struct super_block *sb)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_device *device;
+ int ret = 0;
+ /*
+ * Make sure the fs is not changed by accident (like hibernation then
+ * modified by other OS).
+ * If we found anything wrong, we mark the fs error immediately.
+ *
+ * And since the fs is frozen, no one can modify the fs yet, thus
+ * we don't need to hold device_list_mutex.
+ */
+ list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+ ret = check_dev_super(device);
+ if (ret < 0) {
+ btrfs_handle_fs_error(fs_info, ret,
+ "super block on devid %llu got modified unexpectedly",
+ device->devid);
+ break;
+ }
+ }
clear_bit(BTRFS_FS_FROZEN, &fs_info->flags);
+
+ /*
+ * We still return 0, to allow VFS layer to unfreeze the fs even the
+ * above checks failed. Since the fs is either fine or read-only, we're
+ * safe to continue, without causing further damage.
+ */
return 0;
}
@@ -2568,7 +2336,7 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
* the end of RCU grace period.
*/
rcu_read_lock();
- seq_escape(m, rcu_str_deref(fs_info->fs_devices->latest_dev->name), " \t\n\\");
+ seq_escape(m, btrfs_dev_name(fs_info->fs_devices->latest_dev), " \t\n\\");
rcu_read_unlock();
return 0;
@@ -2617,7 +2385,7 @@ static __cold void btrfs_interface_exit(void)
misc_deregister(&btrfs_misc);
}
-static void __init btrfs_print_mod_info(void)
+static int __init btrfs_print_mod_info(void)
{
static const char options[] = ""
#ifdef CONFIG_BTRFS_DEBUG
@@ -2644,115 +2412,125 @@ static void __init btrfs_print_mod_info(void)
#endif
;
pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options);
+ return 0;
}
-static int __init init_btrfs_fs(void)
+static int register_btrfs(void)
{
- int err;
-
- btrfs_props_init();
-
- err = btrfs_init_sysfs();
- if (err)
- return err;
-
- btrfs_init_compress();
-
- err = btrfs_init_cachep();
- if (err)
- goto free_compress;
-
- err = extent_io_init();
- if (err)
- goto free_cachep;
-
- err = extent_state_cache_init();
- if (err)
- goto free_extent_io;
-
- err = extent_map_init();
- if (err)
- goto free_extent_state_cache;
-
- err = ordered_data_init();
- if (err)
- goto free_extent_map;
-
- err = btrfs_delayed_inode_init();
- if (err)
- goto free_ordered_data;
-
- err = btrfs_auto_defrag_init();
- if (err)
- goto free_delayed_inode;
-
- err = btrfs_delayed_ref_init();
- if (err)
- goto free_auto_defrag;
-
- err = btrfs_prelim_ref_init();
- if (err)
- goto free_delayed_ref;
-
- err = btrfs_interface_init();
- if (err)
- goto free_prelim_ref;
+ return register_filesystem(&btrfs_fs_type);
+}
- btrfs_print_mod_info();
+static void unregister_btrfs(void)
+{
+ unregister_filesystem(&btrfs_fs_type);
+}
- err = btrfs_run_sanity_tests();
- if (err)
- goto unregister_ioctl;
+/* Helper structure for long init/exit functions. */
+struct init_sequence {
+ int (*init_func)(void);
+ /* Can be NULL if the init_func doesn't need cleanup. */
+ void (*exit_func)(void);
+};
- err = register_filesystem(&btrfs_fs_type);
- if (err)
- goto unregister_ioctl;
+static const struct init_sequence mod_init_seq[] = {
+ {
+ .init_func = btrfs_props_init,
+ .exit_func = NULL,
+ }, {
+ .init_func = btrfs_init_sysfs,
+ .exit_func = btrfs_exit_sysfs,
+ }, {
+ .init_func = btrfs_init_compress,
+ .exit_func = btrfs_exit_compress,
+ }, {
+ .init_func = btrfs_init_cachep,
+ .exit_func = btrfs_destroy_cachep,
+ }, {
+ .init_func = btrfs_transaction_init,
+ .exit_func = btrfs_transaction_exit,
+ }, {
+ .init_func = btrfs_ctree_init,
+ .exit_func = btrfs_ctree_exit,
+ }, {
+ .init_func = btrfs_free_space_init,
+ .exit_func = btrfs_free_space_exit,
+ }, {
+ .init_func = extent_state_init_cachep,
+ .exit_func = extent_state_free_cachep,
+ }, {
+ .init_func = extent_buffer_init_cachep,
+ .exit_func = extent_buffer_free_cachep,
+ }, {
+ .init_func = btrfs_bioset_init,
+ .exit_func = btrfs_bioset_exit,
+ }, {
+ .init_func = extent_map_init,
+ .exit_func = extent_map_exit,
+ }, {
+ .init_func = ordered_data_init,
+ .exit_func = ordered_data_exit,
+ }, {
+ .init_func = btrfs_delayed_inode_init,
+ .exit_func = btrfs_delayed_inode_exit,
+ }, {
+ .init_func = btrfs_auto_defrag_init,
+ .exit_func = btrfs_auto_defrag_exit,
+ }, {
+ .init_func = btrfs_delayed_ref_init,
+ .exit_func = btrfs_delayed_ref_exit,
+ }, {
+ .init_func = btrfs_prelim_ref_init,
+ .exit_func = btrfs_prelim_ref_exit,
+ }, {
+ .init_func = btrfs_interface_init,
+ .exit_func = btrfs_interface_exit,
+ }, {
+ .init_func = btrfs_print_mod_info,
+ .exit_func = NULL,
+ }, {
+ .init_func = btrfs_run_sanity_tests,
+ .exit_func = NULL,
+ }, {
+ .init_func = register_btrfs,
+ .exit_func = unregister_btrfs,
+ }
+};
- return 0;
+static bool mod_init_result[ARRAY_SIZE(mod_init_seq)];
-unregister_ioctl:
- btrfs_interface_exit();
-free_prelim_ref:
- btrfs_prelim_ref_exit();
-free_delayed_ref:
- btrfs_delayed_ref_exit();
-free_auto_defrag:
- btrfs_auto_defrag_exit();
-free_delayed_inode:
- btrfs_delayed_inode_exit();
-free_ordered_data:
- ordered_data_exit();
-free_extent_map:
- extent_map_exit();
-free_extent_state_cache:
- extent_state_cache_exit();
-free_extent_io:
- extent_io_exit();
-free_cachep:
- btrfs_destroy_cachep();
-free_compress:
- btrfs_exit_compress();
- btrfs_exit_sysfs();
+static __always_inline void btrfs_exit_btrfs_fs(void)
+{
+ int i;
- return err;
+ for (i = ARRAY_SIZE(mod_init_seq) - 1; i >= 0; i--) {
+ if (!mod_init_result[i])
+ continue;
+ if (mod_init_seq[i].exit_func)
+ mod_init_seq[i].exit_func();
+ mod_init_result[i] = false;
+ }
}
static void __exit exit_btrfs_fs(void)
{
- btrfs_destroy_cachep();
- btrfs_delayed_ref_exit();
- btrfs_auto_defrag_exit();
- btrfs_delayed_inode_exit();
- btrfs_prelim_ref_exit();
- ordered_data_exit();
- extent_map_exit();
- extent_state_cache_exit();
- extent_io_exit();
- btrfs_interface_exit();
- unregister_filesystem(&btrfs_fs_type);
- btrfs_exit_sysfs();
- btrfs_cleanup_fs_uuids();
- btrfs_exit_compress();
+ btrfs_exit_btrfs_fs();
+}
+
+static int __init init_btrfs_fs(void)
+{
+ int ret;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mod_init_seq); i++) {
+ ASSERT(!mod_init_result[i]);
+ ret = mod_init_seq[i].init_func();
+ if (ret < 0) {
+ btrfs_exit_btrfs_fs();
+ return ret;
+ }
+ mod_init_result[i] = true;
+ }
+ return 0;
}
late_initcall(init_btrfs_fs);
diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h
new file mode 100644
index 000000000000..8dbb909b364f
--- /dev/null
+++ b/fs/btrfs/super.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_SUPER_H
+#define BTRFS_SUPER_H
+
+int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
+ unsigned long new_flags);
+int btrfs_sync_fs(struct super_block *sb, int wait);
+char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
+ u64 subvol_objectid);
+
+static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
+static inline void btrfs_set_sb_rdonly(struct super_block *sb)
+{
+ sb->s_flags |= SB_RDONLY;
+ set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state);
+}
+
+static inline void btrfs_clear_sb_rdonly(struct super_block *sb)
+{
+ sb->s_flags &= ~SB_RDONLY;
+ clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state);
+}
+
+#endif
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index d5d0717fd09a..45615ce36498 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -10,7 +10,7 @@
#include <linux/completion.h>
#include <linux/bug.h>
#include <crypto/hash.h>
-
+#include "messages.h"
#include "ctree.h"
#include "discard.h"
#include "disk-io.h"
@@ -22,6 +22,8 @@
#include "block-group.h"
#include "qgroup.h"
#include "misc.h"
+#include "fs.h"
+#include "accessors.h"
/*
* Structure name Path
@@ -35,12 +37,12 @@
* qgroup_attrs /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid>
* space_info_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type>
* raid_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile>
+ * discard_attrs /sys/fs/btrfs/<uuid>/discard
*
* When built with BTRFS_CONFIG_DEBUG:
*
* btrfs_debug_feature_attrs /sys/fs/btrfs/debug
* btrfs_debug_mount_attrs /sys/fs/btrfs/<uuid>/debug
- * discard_debug_attrs /sys/fs/btrfs/<uuid>/debug/discard
*/
struct btrfs_feature_attr {
@@ -248,7 +250,7 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
/*
* We don't want to do full transaction commit from inside sysfs
*/
- btrfs_set_pending(fs_info, COMMIT);
+ set_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags);
wake_up_process(fs_info->transaction_kthread);
return count;
@@ -286,6 +288,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);
BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
+BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE);
BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
@@ -317,6 +320,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(metadata_uuid),
BTRFS_FEAT_ATTR_PTR(free_space_tree),
BTRFS_FEAT_ATTR_PTR(raid1c34),
+ BTRFS_FEAT_ATTR_PTR(block_group_tree),
#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_PTR(zoned),
#endif
@@ -429,12 +433,10 @@ static const struct attribute_group btrfs_static_feature_attr_group = {
.attrs = btrfs_supported_static_feature_attrs,
};
-#ifdef CONFIG_BTRFS_DEBUG
-
/*
* Discard statistics and tunables
*/
-#define discard_to_fs_info(_kobj) to_fs_info((_kobj)->parent->parent)
+#define discard_to_fs_info(_kobj) to_fs_info(get_btrfs_kobj(_kobj))
static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj,
struct kobj_attribute *a,
@@ -583,11 +585,11 @@ BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show,
btrfs_discard_max_discard_size_store);
/*
- * Per-filesystem debugging of discard (when mounted with discard=async).
+ * Per-filesystem stats for discard (when mounted with discard=async).
*
- * Path: /sys/fs/btrfs/<uuid>/debug/discard/
+ * Path: /sys/fs/btrfs/<uuid>/discard/
*/
-static const struct attribute *discard_debug_attrs[] = {
+static const struct attribute *discard_attrs[] = {
BTRFS_ATTR_PTR(discard, discardable_bytes),
BTRFS_ATTR_PTR(discard, discardable_extents),
BTRFS_ATTR_PTR(discard, discard_bitmap_bytes),
@@ -599,6 +601,8 @@ static const struct attribute *discard_debug_attrs[] = {
NULL,
};
+#ifdef CONFIG_BTRFS_DEBUG
+
/*
* Per-filesystem runtime debugging exported via sysfs.
*
@@ -760,7 +764,7 @@ static ssize_t btrfs_chunk_size_store(struct kobject *kobj,
val = min(val, BTRFS_MAX_DATA_CHUNK_SIZE);
/* Limit stripe size to 10% of available space. */
- val = min(div_factor(fs_info->fs_devices->total_rw_bytes, 1), val);
+ val = min(mult_perc(fs_info->fs_devices->total_rw_bytes, 10), val);
/* Must be multiple of 256M. */
val &= ~((u64)SZ_256M - 1);
@@ -837,11 +841,8 @@ static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
char *buf)
{
struct btrfs_space_info *space_info = to_space_info(kobj);
- ssize_t ret;
-
- ret = sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold));
- return ret;
+ return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold));
}
static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj,
@@ -960,7 +961,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
/*
* We don't want to do full transaction commit from inside sysfs
*/
- btrfs_set_pending(fs_info, COMMIT);
+ set_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags);
wake_up_process(fs_info->transaction_kthread);
return len;
@@ -1150,25 +1151,6 @@ static ssize_t btrfs_generation_show(struct kobject *kobj,
}
BTRFS_ATTR(, generation, btrfs_generation_show);
-/*
- * Look for an exact string @string in @buffer with possible leading or
- * trailing whitespace
- */
-static bool strmatch(const char *buffer, const char *string)
-{
- const size_t len = strlen(string);
-
- /* Skip leading whitespace */
- buffer = skip_spaces(buffer);
-
- /* Match entire string, check if the rest is whitespace or empty */
- if (strncmp(string, buffer, len) == 0 &&
- strlen(skip_spaces(buffer + len)) == 0)
- return true;
-
- return false;
-}
-
static const char * const btrfs_read_policy_name[] = { "pid" };
static ssize_t btrfs_read_policy_show(struct kobject *kobj,
@@ -1180,16 +1162,16 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj,
for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
if (fs_devices->read_policy == i)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s[%s]",
+ ret += sysfs_emit_at(buf, ret, "%s[%s]",
(ret == 0 ? "" : " "),
btrfs_read_policy_name[i]);
else
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
+ ret += sysfs_emit_at(buf, ret, "%s%s",
(ret == 0 ? "" : " "),
btrfs_read_policy_name[i]);
}
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+ ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}
@@ -1202,7 +1184,7 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
int i;
for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
- if (strmatch(buf, btrfs_read_policy_name[i])) {
+ if (sysfs_streq(buf, btrfs_read_policy_name[i])) {
if (i != fs_devices->read_policy) {
fs_devices->read_policy = i;
btrfs_info(fs_devices->fs_info,
@@ -1222,11 +1204,8 @@ static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj,
char *buf)
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- ssize_t ret;
- ret = sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold));
-
- return ret;
+ return sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold));
}
static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
@@ -1427,13 +1406,12 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
kobject_del(fs_info->space_info_kobj);
kobject_put(fs_info->space_info_kobj);
}
-#ifdef CONFIG_BTRFS_DEBUG
- if (fs_info->discard_debug_kobj) {
- sysfs_remove_files(fs_info->discard_debug_kobj,
- discard_debug_attrs);
- kobject_del(fs_info->discard_debug_kobj);
- kobject_put(fs_info->discard_debug_kobj);
+ if (fs_info->discard_kobj) {
+ sysfs_remove_files(fs_info->discard_kobj, discard_attrs);
+ kobject_del(fs_info->discard_kobj);
+ kobject_put(fs_info->discard_kobj);
}
+#ifdef CONFIG_BTRFS_DEBUG
if (fs_info->debug_kobj) {
sysfs_remove_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
kobject_del(fs_info->debug_kobj);
@@ -2001,20 +1979,18 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
error = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
if (error)
goto failure;
+#endif
/* Discard directory */
- fs_info->discard_debug_kobj = kobject_create_and_add("discard",
- fs_info->debug_kobj);
- if (!fs_info->discard_debug_kobj) {
+ fs_info->discard_kobj = kobject_create_and_add("discard", fsid_kobj);
+ if (!fs_info->discard_kobj) {
error = -ENOMEM;
goto failure;
}
- error = sysfs_create_files(fs_info->discard_debug_kobj,
- discard_debug_attrs);
+ error = sysfs_create_files(fs_info->discard_kobj, discard_attrs);
if (error)
goto failure;
-#endif
error = addrm_unknown_feature_attrs(fs_info, true);
if (error)
@@ -2041,6 +2017,98 @@ failure:
return error;
}
+static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+ bool enabled;
+
+ spin_lock(&fs_info->qgroup_lock);
+ enabled = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return sysfs_emit(buf, "%d\n", enabled);
+}
+BTRFS_ATTR(qgroups, enabled, qgroup_enabled_show);
+
+static ssize_t qgroup_inconsistent_show(struct kobject *qgroups_kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+ bool inconsistent;
+
+ spin_lock(&fs_info->qgroup_lock);
+ inconsistent = (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT);
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return sysfs_emit(buf, "%d\n", inconsistent);
+}
+BTRFS_ATTR(qgroups, inconsistent, qgroup_inconsistent_show);
+
+static ssize_t qgroup_drop_subtree_thres_show(struct kobject *qgroups_kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+ u8 result;
+
+ spin_lock(&fs_info->qgroup_lock);
+ result = fs_info->qgroup_drop_subtree_thres;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return sysfs_emit(buf, "%d\n", result);
+}
+
+static ssize_t qgroup_drop_subtree_thres_store(struct kobject *qgroups_kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+ u8 new_thres;
+ int ret;
+
+ ret = kstrtou8(buf, 10, &new_thres);
+ if (ret)
+ return -EINVAL;
+
+ if (new_thres > BTRFS_MAX_LEVEL)
+ return -EINVAL;
+
+ spin_lock(&fs_info->qgroup_lock);
+ fs_info->qgroup_drop_subtree_thres = new_thres;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return len;
+}
+BTRFS_ATTR_RW(qgroups, drop_subtree_threshold, qgroup_drop_subtree_thres_show,
+ qgroup_drop_subtree_thres_store);
+
+/*
+ * Qgroups global info
+ *
+ * Path: /sys/fs/btrfs/<uuid>/qgroups/
+ */
+static struct attribute *qgroups_attrs[] = {
+ BTRFS_ATTR_PTR(qgroups, enabled),
+ BTRFS_ATTR_PTR(qgroups, inconsistent),
+ BTRFS_ATTR_PTR(qgroups, drop_subtree_threshold),
+ NULL
+};
+ATTRIBUTE_GROUPS(qgroups);
+
+static void qgroups_release(struct kobject *kobj)
+{
+ kfree(kobj);
+}
+
+static struct kobj_type qgroups_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = qgroups_groups,
+ .release = qgroups_release,
+};
+
static inline struct btrfs_fs_info *qgroup_kobj_to_fs_info(struct kobject *kobj)
{
return to_fs_info(kobj->parent->parent);
@@ -2166,11 +2234,15 @@ int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info)
if (fs_info->qgroups_kobj)
return 0;
- fs_info->qgroups_kobj = kobject_create_and_add("qgroups", fsid_kobj);
- if (!fs_info->qgroups_kobj) {
- ret = -ENOMEM;
+ fs_info->qgroups_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
+ if (!fs_info->qgroups_kobj)
+ return -ENOMEM;
+
+ ret = kobject_init_and_add(fs_info->qgroups_kobj, &qgroups_ktype,
+ fsid_kobj, "qgroups");
+ if (ret < 0)
goto out;
- }
+
rbtree_postorder_for_each_entry_safe(qgroup, next,
&fs_info->qgroup_tree, node) {
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
@@ -2251,8 +2323,11 @@ int __init btrfs_init_sysfs(void)
#ifdef CONFIG_BTRFS_DEBUG
ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group);
- if (ret)
- goto out2;
+ if (ret) {
+ sysfs_unmerge_group(&btrfs_kset->kobj,
+ &btrfs_static_feature_attr_group);
+ goto out_remove_group;
+ }
#endif
return 0;
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index cc9377cf56a3..181469fc0bb3 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -16,6 +16,7 @@
#include "../disk-io.h"
#include "../qgroup.h"
#include "../block-group.h"
+#include "../fs.h"
static struct vfsmount *test_mnt = NULL;
@@ -101,7 +102,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info)
if (!dev)
return ERR_PTR(-ENOMEM);
- extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
+ extent_io_tree_init(NULL, &dev->alloc_state, 0);
INIT_LIST_HEAD(&dev->dev_list);
list_add(&dev->dev_list, &fs_info->fs_devices->devices);
@@ -200,7 +201,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
void btrfs_free_dummy_root(struct btrfs_root *root)
{
- if (!root)
+ if (IS_ERR_OR_NULL(root))
return;
/* Will be freed by btrfs_free_fs_roots */
if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
@@ -243,7 +244,7 @@ void btrfs_free_dummy_block_group(struct btrfs_block_group *cache)
{
if (!cache)
return;
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
kfree(cache->free_space_ctl);
kfree(cache);
}
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index b7d181a08eab..5ef0b90e25c3 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -8,6 +8,7 @@
#include "../ctree.h"
#include "../extent_io.h"
#include "../disk-io.h"
+#include "../accessors.h"
static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
{
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index a232b15b8021..dfc5c7fa6038 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -4,6 +4,7 @@
*/
#include <linux/pagemap.h>
+#include <linux/pagevec.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/sizes.h>
@@ -20,39 +21,40 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
unsigned long flags)
{
int ret;
- struct page *pages[16];
+ struct folio_batch fbatch;
unsigned long index = start >> PAGE_SHIFT;
unsigned long end_index = end >> PAGE_SHIFT;
- unsigned long nr_pages = end_index - index + 1;
int i;
int count = 0;
int loops = 0;
- while (nr_pages > 0) {
- ret = find_get_pages_contig(inode->i_mapping, index,
- min_t(unsigned long, nr_pages,
- ARRAY_SIZE(pages)), pages);
+ folio_batch_init(&fbatch);
+
+ while (index <= end_index) {
+ ret = filemap_get_folios_contig(inode->i_mapping, &index,
+ end_index, &fbatch);
for (i = 0; i < ret; i++) {
+ struct folio *folio = fbatch.folios[i];
+
if (flags & PROCESS_TEST_LOCKED &&
- !PageLocked(pages[i]))
+ !folio_test_locked(folio))
count++;
- if (flags & PROCESS_UNLOCK && PageLocked(pages[i]))
- unlock_page(pages[i]);
- put_page(pages[i]);
+ if (flags & PROCESS_UNLOCK && folio_test_locked(folio))
+ folio_unlock(folio);
if (flags & PROCESS_RELEASE)
- put_page(pages[i]);
+ folio_put(folio);
}
- nr_pages -= ret;
- index += ret;
+ folio_batch_release(&fbatch);
cond_resched();
loops++;
if (loops > 100000) {
printk(KERN_ERR
- "stuck in a loop, start %llu, end %llu, nr_pages %lu, ret %d\n",
- start, end, nr_pages, ret);
+ "stuck in a loop, start %llu, end %llu, ret %d\n",
+ start, end, ret);
break;
}
}
+
return count;
}
@@ -80,7 +82,6 @@ static void extent_flag_to_str(const struct extent_state *state, char *dest)
PRINT_ONE_FLAG(state, dest, cur, NODATASUM);
PRINT_ONE_FLAG(state, dest, cur, CLEAR_META_RESV);
PRINT_ONE_FLAG(state, dest, cur, NEED_WAIT);
- PRINT_ONE_FLAG(state, dest, cur, DAMAGED);
PRINT_ONE_FLAG(state, dest, cur, NORESERVE);
PRINT_ONE_FLAG(state, dest, cur, QGROUP_RESERVED);
PRINT_ONE_FLAG(state, dest, cur, CLEAR_DATA_RESV);
@@ -131,7 +132,7 @@ static int test_find_delalloc(u32 sectorsize)
* Passing NULL as we don't have fs_info but tracepoints are not used
* at this point
*/
- extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST, NULL);
+ extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST);
/*
* First go through and create and mark all of our pages dirty, we pin
@@ -172,7 +173,7 @@ static int test_find_delalloc(u32 sectorsize)
sectorsize - 1, start, end);
goto out_bits;
}
- unlock_extent(tmp, start, end);
+ unlock_extent(tmp, start, end, NULL);
unlock_page(locked_page);
put_page(locked_page);
@@ -208,7 +209,7 @@ static int test_find_delalloc(u32 sectorsize)
test_err("there were unlocked pages in the range");
goto out_bits;
}
- unlock_extent(tmp, start, end);
+ unlock_extent(tmp, start, end, NULL);
/* locked_page was unlocked above */
put_page(locked_page);
@@ -263,7 +264,7 @@ static int test_find_delalloc(u32 sectorsize)
test_err("pages in range were not all locked");
goto out_bits;
}
- unlock_extent(tmp, start, end);
+ unlock_extent(tmp, start, end, NULL);
/*
* Now to test where we run into a page that is no longer dirty in the
@@ -488,7 +489,7 @@ static int test_find_first_clear_extent_bit(void)
test_msg("running find_first_clear_extent_bit test");
- extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL);
+ extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST);
/* Test correct handling of empty tree */
find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED);
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 5930cdcae5cb..ebf68fcd2149 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -82,7 +82,7 @@ static int test_extents(struct btrfs_block_group *cache)
}
/* Cleanup */
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
return 0;
}
@@ -149,7 +149,7 @@ static int test_bitmaps(struct btrfs_block_group *cache, u32 sectorsize)
return -1;
}
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
return 0;
}
@@ -230,7 +230,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache,
return -1;
}
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
/* Now with the extent entry offset into the bitmap */
ret = test_add_free_space_entry(cache, SZ_4M, SZ_4M, 1);
@@ -266,7 +266,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache,
* [ bitmap ]
* [ del ]
*/
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
ret = test_add_free_space_entry(cache, bitmap_offset + SZ_4M, SZ_4M, 1);
if (ret) {
test_err("couldn't add bitmap %d", ret);
@@ -291,7 +291,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache,
return -1;
}
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
/*
* This blew up before, we have part of the free space in a bitmap and
@@ -317,7 +317,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache,
return ret;
}
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
return 0;
}
@@ -629,7 +629,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
if (ret)
return ret;
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
/*
* Now test a similar scenario, but where our extent entry is located
@@ -819,7 +819,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
return ret;
cache->free_space_ctl->op = orig_free_space_ops;
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
return 0;
}
@@ -868,7 +868,7 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize)
}
/* Now validate bitmaps do the correct thing. */
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
for (i = 0; i < 2; i++) {
offset = i * BITS_PER_BITMAP * sectorsize;
bytes = (i + 1) * SZ_1M;
@@ -891,7 +891,7 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize)
}
/* Now validate bitmaps with different ->max_extent_size. */
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
orig_free_space_ops = cache->free_space_ctl->op;
cache->free_space_ctl->op = &test_free_space_ops;
@@ -998,7 +998,7 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize)
}
cache->free_space_ctl->op = orig_free_space_ops;
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
return 0;
}
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index 13734ed43bfc..b61972046feb 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -10,6 +10,7 @@
#include "../free-space-tree.h"
#include "../transaction.h"
#include "../block-group.h"
+#include "../accessors.h"
struct free_space_extent {
u64 start;
@@ -470,7 +471,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
}
cache->bitmap_low_thresh = 0;
cache->bitmap_high_thresh = (u32)-1;
- cache->needs_free_space = 1;
+ set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags);
cache->fs_info = root->fs_info;
btrfs_init_dummy_trans(&trans, root->fs_info);
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index cac89c388131..05b03f5eab83 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -11,6 +11,7 @@
#include "../extent_io.h"
#include "../volumes.h"
#include "../compression.h"
+#include "../accessors.h"
static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
u64 ram_bytes, u64 offset, u64 disk_bytenr,
@@ -72,8 +73,8 @@ static void insert_inode_item_key(struct btrfs_root *root)
* diagram of how the extents will look though this may not be possible we still
* want to make sure everything acts normally (the last number is not inclusive)
*
- * [0 - 5][5 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291]
- * [hole ][inline][hole but no extent][ hole ][ regular ][regular1 split]
+ * [0 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291]
+ * [inline][hole but no extent][ hole ][ regular ][regular1 split]
*
* [12291 - 16387][16387 - 24579][24579 - 28675][ 28675 - 32771][32771 - 36867 ]
* [ hole ][regular1 split][ prealloc ][ prealloc1 ][prealloc1 written]
@@ -90,19 +91,12 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
u64 disk_bytenr = SZ_1M;
u64 offset = 0;
- /* First we want a hole */
- insert_extent(root, offset, 5, 5, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0,
- slot);
- slot++;
- offset += 5;
-
/*
- * Now we want an inline extent, I don't think this is possible but hey
- * why not? Also keep in mind if we have an inline extent it counts as
- * the whole first page. If we were to expand it we would have to cow
- * and we wouldn't have an inline extent anymore.
+ * Tree-checker has strict limits on inline extents that they can only
+ * exist at file offset 0, thus we can only have one inline file extent
+ * at most.
*/
- insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0,
+ insert_extent(root, offset, 6, 6, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0,
slot);
slot++;
offset = sectorsize;
@@ -267,7 +261,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
free_extent_map(em);
- btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
+ btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
/*
* All of the magic numbers are based on the mapping setup in
@@ -281,37 +275,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != EXTENT_MAP_HOLE) {
- test_err("expected a hole, got %llu", em->block_start);
- goto out;
- }
- if (em->start != 0 || em->len != 5) {
- test_err(
- "unexpected extent wanted start 0 len 5, got start %llu len %llu",
- em->start, em->len);
- goto out;
- }
- if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
- goto out;
- }
- offset = em->start + em->len;
- free_extent_map(em);
-
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
- if (IS_ERR(em)) {
- test_err("got an error when we shouldn't have");
- goto out;
- }
if (em->block_start != EXTENT_MAP_INLINE) {
test_err("expected an inline, got %llu", em->block_start);
goto out;
}
- if (em->start != offset || em->len != (sectorsize - 5)) {
+ /*
+ * For inline extent, we always round up the em to sectorsize, as
+ * they are either:
+ *
+ * a) a hidden hole
+ * The range will be zeroed at inline extent read time.
+ *
+ * b) a file extent with unaligned bytenr
+ * Tree checker will reject it.
+ */
+ if (em->start != 0 || em->len != sectorsize) {
test_err(
- "unexpected extent wanted start %llu len 1, got start %llu len %llu",
- offset, em->start, em->len);
+ "unexpected extent wanted start 0 len %u, got start %llu len %llu",
+ sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
@@ -975,7 +957,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
BTRFS_MAX_EXTENT_SIZE >> 1,
(BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_UPTODATE, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1043,7 +1025,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_UPTODATE, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1076,7 +1058,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
/* Empty */
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_UPTODATE, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1092,7 +1074,7 @@ out:
if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_UPTODATE, NULL);
iput(inode);
btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index eee1e4459541..3fc8dc3fd980 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -10,6 +10,8 @@
#include "../disk-io.h"
#include "../qgroup.h"
#include "../backref.h"
+#include "../fs.h"
+#include "../accessors.h"
static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
u64 num_bytes, u64 parent, u64 root_objectid)
@@ -203,6 +205,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
static int test_no_shared_qgroup(struct btrfs_root *root,
u32 sectorsize, u32 nodesize)
{
+ struct btrfs_backref_walk_ctx ctx = { 0 };
struct btrfs_trans_handle trans;
struct btrfs_fs_info *fs_info = root->fs_info;
struct ulist *old_roots = NULL;
@@ -218,30 +221,38 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
return ret;
}
+ ctx.bytenr = nodesize;
+ ctx.trans = &trans;
+ ctx.fs_info = fs_info;
+
/*
* Since the test trans doesn't have the complicated delayed refs,
* we can only call btrfs_qgroup_account_extent() directly to test
* quota.
*/
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
+ old_roots = ctx.roots;
+ ctx.roots = NULL;
ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FS_TREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
+ new_roots = ctx.roots;
+ ctx.roots = NULL;
ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots,
new_roots);
@@ -250,32 +261,38 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
return ret;
}
+ /* btrfs_qgroup_account_extent() always frees the ulists passed to it. */
+ old_roots = NULL;
+ new_roots = NULL;
+
if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
nodesize, nodesize)) {
test_err("qgroup counts didn't match expected values");
return -EINVAL;
}
- old_roots = NULL;
- new_roots = NULL;
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
+ old_roots = ctx.roots;
+ ctx.roots = NULL;
ret = remove_extent_item(root, nodesize, nodesize);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return -EINVAL;
+ }
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
+ new_roots = ctx.roots;
+ ctx.roots = NULL;
ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots,
new_roots);
@@ -300,6 +317,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
static int test_multiple_refs(struct btrfs_root *root,
u32 sectorsize, u32 nodesize)
{
+ struct btrfs_backref_walk_ctx ctx = { 0 };
struct btrfs_trans_handle trans;
struct btrfs_fs_info *fs_info = root->fs_info;
struct ulist *old_roots = NULL;
@@ -320,25 +338,33 @@ static int test_multiple_refs(struct btrfs_root *root,
return ret;
}
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
+ ctx.bytenr = nodesize;
+ ctx.trans = &trans;
+ ctx.fs_info = fs_info;
+
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
+ old_roots = ctx.roots;
+ ctx.roots = NULL;
ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FS_TREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
+ new_roots = ctx.roots;
+ ctx.roots = NULL;
ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots,
new_roots);
@@ -353,25 +379,29 @@ static int test_multiple_refs(struct btrfs_root *root,
return -EINVAL;
}
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
+ old_roots = ctx.roots;
+ ctx.roots = NULL;
ret = add_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FIRST_FREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
+ new_roots = ctx.roots;
+ ctx.roots = NULL;
ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots,
new_roots);
@@ -392,25 +422,29 @@ static int test_multiple_refs(struct btrfs_root *root,
return -EINVAL;
}
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
+ old_roots = ctx.roots;
+ ctx.roots = NULL;
ret = remove_extent_ref(root, nodesize, nodesize, 0,
BTRFS_FIRST_FREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
+ ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
+ new_roots = ctx.roots;
+ ctx.roots = NULL;
ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots,
new_roots);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0bec10740ad3..b8c52e89688c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -6,6 +6,7 @@
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
@@ -23,6 +24,18 @@
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "root-tree.h"
+#include "defrag.h"
+#include "dir-item.h"
+#include "uuid-tree.h"
+#include "ioctl.h"
+#include "relocation.h"
+#include "scrub.h"
+
+static struct kmem_cache *btrfs_trans_handle_cachep;
#define BTRFS_ROOT_TRANS_TAG 0
@@ -161,7 +174,6 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root, *tmp;
- struct btrfs_caching_control *caching_ctl, *next;
/*
* At this point no one can be using this transaction to modify any tree
@@ -196,46 +208,6 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
}
spin_unlock(&cur_trans->dropped_roots_lock);
- /*
- * We have to update the last_byte_to_unpin under the commit_root_sem,
- * at the same time we swap out the commit roots.
- *
- * This is because we must have a real view of the last spot the caching
- * kthreads were while caching. Consider the following views of the
- * extent tree for a block group
- *
- * commit root
- * +----+----+----+----+----+----+----+
- * |\\\\| |\\\\|\\\\| |\\\\|\\\\|
- * +----+----+----+----+----+----+----+
- * 0 1 2 3 4 5 6 7
- *
- * new commit root
- * +----+----+----+----+----+----+----+
- * | | | |\\\\| | |\\\\|
- * +----+----+----+----+----+----+----+
- * 0 1 2 3 4 5 6 7
- *
- * If the cache_ctl->progress was at 3, then we are only allowed to
- * unpin [0,1) and [2,3], because the caching thread has already
- * processed those extents. We are not allowed to unpin [5,6), because
- * the caching thread will re-start it's search from 3, and thus find
- * the hole from [4,6) to add to the free space cache.
- */
- write_lock(&fs_info->block_group_cache_lock);
- list_for_each_entry_safe(caching_ctl, next,
- &fs_info->caching_block_groups, list) {
- struct btrfs_block_group *cache = caching_ctl->block_group;
-
- if (btrfs_block_group_done(cache)) {
- cache->last_byte_to_unpin = (u64)-1;
- list_del_init(&caching_ctl->list);
- btrfs_put_caching_control(caching_ctl);
- } else {
- cache->last_byte_to_unpin = caching_ctl->progress;
- }
- }
- write_unlock(&fs_info->block_group_cache_lock);
up_write(&fs_info->commit_root_sem);
}
@@ -313,6 +285,8 @@ loop:
atomic_inc(&cur_trans->num_writers);
extwriter_counter_inc(cur_trans, type);
spin_unlock(&fs_info->trans_lock);
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers);
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters);
return 0;
}
spin_unlock(&fs_info->trans_lock);
@@ -334,16 +308,23 @@ loop:
if (!cur_trans)
return -ENOMEM;
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers);
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters);
+
spin_lock(&fs_info->trans_lock);
if (fs_info->running_transaction) {
/*
* someone started a transaction after we unlocked. Make sure
* to redo the checks above
*/
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
kfree(cur_trans);
goto loop;
} else if (BTRFS_FS_ERROR(fs_info)) {
spin_unlock(&fs_info->trans_lock);
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
kfree(cur_trans);
return -EROFS;
}
@@ -397,9 +378,9 @@ loop:
spin_lock_init(&cur_trans->releasing_ebs_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
- IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
+ IO_TREE_TRANS_DIRTY_PAGES);
extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
- IO_TREE_FS_PINNED_EXTENTS, NULL);
+ IO_TREE_FS_PINNED_EXTENTS);
fs_info->generation++;
cur_trans->transid = fs_info->generation;
fs_info->running_transaction = cur_trans;
@@ -541,6 +522,7 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info)
refcount_inc(&cur_trans->use_count);
spin_unlock(&fs_info->trans_lock);
+ btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
wait_event(fs_info->transaction_wait,
cur_trans->state >= TRANS_STATE_UNBLOCKED ||
TRANS_ABORTED(cur_trans));
@@ -625,7 +607,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
*/
num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
if (flush == BTRFS_RESERVE_FLUSH_ALL &&
- delayed_refs_rsv->full == 0) {
+ btrfs_block_rsv_full(delayed_refs_rsv) == 0) {
delayed_refs_bytes = num_bytes;
num_bytes <<= 1;
}
@@ -650,7 +632,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
if (rsv->space_info->force_alloc)
do_chunk_alloc = true;
} else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
- !delayed_refs_rsv->full) {
+ !btrfs_block_rsv_full(delayed_refs_rsv)) {
/*
* Some people call with btrfs_start_transaction(root, 0)
* because they can be throttled, but have some other mechanism
@@ -859,6 +841,15 @@ static noinline void wait_for_commit(struct btrfs_transaction *commit,
u64 transid = commit->transid;
bool put = false;
+ /*
+ * At the moment this function is called with min_state either being
+ * TRANS_STATE_COMPLETED or TRANS_STATE_SUPER_COMMITTED.
+ */
+ if (min_state == TRANS_STATE_COMPLETED)
+ btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
+ else
+ btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+
while (1) {
wait_event(commit->commit_wait, commit->state >= min_state);
if (put)
@@ -958,7 +949,7 @@ static bool should_end_transaction(struct btrfs_trans_handle *trans)
if (btrfs_check_space_for_delayed_refs(fs_info))
return true;
- return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
+ return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 50);
}
bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
@@ -1022,6 +1013,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
extwriter_counter_dec(cur_trans, trans->type);
cond_wake_up(&cur_trans->writer_wait);
+
+ btrfs_lockdep_release(info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_release(info, btrfs_trans_num_writers);
+
btrfs_put_transaction(cur_trans);
if (current->journal_info == trans)
@@ -1134,7 +1129,7 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
* it's safe to do it (through extent_io_tree_release()).
*/
err = clear_extent_bit(dirty_pages, start, end,
- EXTENT_NEED_WAIT, 0, 0, &cached_state);
+ EXTENT_NEED_WAIT, &cached_state);
if (err == -ENOMEM)
err = 0;
if (!err)
@@ -1625,10 +1620,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_root *root = pending->root;
struct btrfs_root *parent_root;
struct btrfs_block_rsv *rsv;
- struct inode *parent_inode;
+ struct inode *parent_inode = pending->dir;
struct btrfs_path *path;
struct btrfs_dir_item *dir_item;
- struct dentry *dentry;
struct extent_buffer *tmp;
struct extent_buffer *old;
struct timespec64 cur_time;
@@ -1637,6 +1631,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
u64 index = 0;
u64 objectid;
u64 root_flags;
+ unsigned int nofs_flags;
+ struct fscrypt_name fname;
ASSERT(pending->path);
path = pending->path;
@@ -1644,9 +1640,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ASSERT(pending->root_item);
new_root_item = pending->root_item;
+ /*
+ * We're inside a transaction and must make sure that any potential
+ * allocations with GFP_KERNEL in fscrypt won't recurse back to
+ * filesystem.
+ */
+ nofs_flags = memalloc_nofs_save();
+ pending->error = fscrypt_setup_filename(parent_inode,
+ &pending->dentry->d_name, 0,
+ &fname);
+ memalloc_nofs_restore(nofs_flags);
+ if (pending->error)
+ goto free_pending;
+
pending->error = btrfs_get_free_objectid(tree_root, &objectid);
if (pending->error)
- goto no_free_objectid;
+ goto free_fname;
/*
* Make qgroup to skip current new snapshot's qgroupid, as it is
@@ -1675,8 +1684,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
trace_btrfs_space_reservation(fs_info, "transaction",
trans->transid,
trans->bytes_reserved, 1);
- dentry = pending->dentry;
- parent_inode = pending->dir;
parent_root = BTRFS_I(parent_inode)->root;
ret = record_root_in_trans(trans, parent_root, 0);
if (ret)
@@ -1692,8 +1699,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
/* check if there is a file/dir which has the same name. */
dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
btrfs_ino(BTRFS_I(parent_inode)),
- dentry->d_name.name,
- dentry->d_name.len, 0);
+ &fname.disk_name, 0);
if (dir_item != NULL && !IS_ERR(dir_item)) {
pending->error = -EEXIST;
goto dir_item_existed;
@@ -1788,7 +1794,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_add_root_ref(trans, objectid,
parent_root->root_key.objectid,
btrfs_ino(BTRFS_I(parent_inode)), index,
- dentry->d_name.name, dentry->d_name.len);
+ &fname.disk_name);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -1820,9 +1826,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
if (ret < 0)
goto fail;
- ret = btrfs_insert_dir_item(trans, dentry->d_name.name,
- dentry->d_name.len, BTRFS_I(parent_inode),
- &key, BTRFS_FT_DIR, index);
+ ret = btrfs_insert_dir_item(trans, &fname.disk_name,
+ BTRFS_I(parent_inode), &key, BTRFS_FT_DIR,
+ index);
/* We have check then name at the beginning, so it is impossible. */
BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
if (ret) {
@@ -1831,7 +1837,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
- dentry->d_name.len * 2);
+ fname.disk_name.len * 2);
parent_inode->i_mtime = current_time(parent_inode);
parent_inode->i_ctime = parent_inode->i_mtime;
ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode));
@@ -1863,7 +1869,9 @@ dir_item_existed:
trans->bytes_reserved = 0;
clear_skip_qgroup:
btrfs_clear_skip_qgroup(trans);
-no_free_objectid:
+free_fname:
+ fscrypt_free_filename(&fname);
+free_pending:
kfree(new_root_item);
pending->root_item = NULL;
btrfs_free_path(path);
@@ -1912,14 +1920,6 @@ static void update_super_roots(struct btrfs_fs_info *fs_info)
super->cache_generation = 0;
if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
super->uuid_tree_generation = root_item->generation;
-
- if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
- root_item = &fs_info->block_group_root->root_item;
-
- super->block_group_root = root_item->bytenr;
- super->block_group_root_generation = root_item->generation;
- super->block_group_root_level = root_item->level;
- }
}
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
@@ -1967,6 +1967,7 @@ void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
* Wait for the current transaction commit to start and block
* subsequent transaction joins
*/
+ btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
wait_event(fs_info->transaction_blocked_wait,
cur_trans->state >= TRANS_STATE_COMMIT_START ||
TRANS_ABORTED(cur_trans));
@@ -1994,6 +1995,12 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
if (cur_trans == fs_info->running_transaction) {
cur_trans->state = TRANS_STATE_COMMIT_DOING;
spin_unlock(&fs_info->trans_lock);
+
+ /*
+ * The thread has already released the lockdep map as reader
+ * already in btrfs_commit_transaction().
+ */
+ btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers);
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
@@ -2118,12 +2125,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
ktime_t interval;
ASSERT(refcount_read(&trans->use_count) == 1);
+ btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
+
+ clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags);
/* Stop the commit early if ->aborted is set */
if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
- btrfs_end_transaction(trans);
- return ret;
+ goto lockdep_trans_commit_start_release;
}
btrfs_trans_release_metadata(trans);
@@ -2140,10 +2149,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* Any running threads may add more while we are here.
*/
ret = btrfs_run_delayed_refs(trans, 0);
- if (ret) {
- btrfs_end_transaction(trans);
- return ret;
- }
+ if (ret)
+ goto lockdep_trans_commit_start_release;
}
btrfs_create_pending_block_groups(trans);
@@ -2172,10 +2179,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (run_it) {
ret = btrfs_start_dirty_block_groups(trans);
- if (ret) {
- btrfs_end_transaction(trans);
- return ret;
- }
+ if (ret)
+ goto lockdep_trans_commit_start_release;
}
}
@@ -2190,6 +2195,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (trans->in_fsync)
want_state = TRANS_STATE_SUPER_COMMITTED;
+
+ btrfs_trans_state_lockdep_release(fs_info,
+ BTRFS_LOCKDEP_TRANS_COMMIT_START);
ret = btrfs_end_transaction(trans);
wait_for_commit(cur_trans, want_state);
@@ -2203,6 +2211,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
cur_trans->state = TRANS_STATE_COMMIT_START;
wake_up(&fs_info->transaction_blocked_wait);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
if (cur_trans->list.prev != &fs_info->trans_list) {
enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
@@ -2222,7 +2231,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
btrfs_put_transaction(prev_trans);
if (ret)
- goto cleanup_transaction;
+ goto lockdep_release;
} else {
spin_unlock(&fs_info->trans_lock);
}
@@ -2236,7 +2245,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
*/
if (BTRFS_FS_ERROR(fs_info)) {
ret = -EROFS;
- goto cleanup_transaction;
+ goto lockdep_release;
}
}
@@ -2250,19 +2259,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
ret = btrfs_start_delalloc_flush(fs_info);
if (ret)
- goto cleanup_transaction;
+ goto lockdep_release;
ret = btrfs_run_delayed_items(trans);
if (ret)
- goto cleanup_transaction;
+ goto lockdep_release;
+ /*
+ * The thread has started/joined the transaction thus it holds the
+ * lockdep map as a reader. It has to release it before acquiring the
+ * lockdep map as a writer.
+ */
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
+ btrfs_might_wait_for_event(fs_info, btrfs_trans_num_extwriters);
wait_event(cur_trans->writer_wait,
extwriter_counter_read(cur_trans) == 0);
/* some pending stuffs might be added after the previous flush. */
ret = btrfs_run_delayed_items(trans);
- if (ret)
+ if (ret) {
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
goto cleanup_transaction;
+ }
btrfs_wait_delalloc_flush(fs_info);
@@ -2271,6 +2289,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* transaction. Otherwise if this transaction commits before the ordered
* extents complete we lose logged data after a power failure.
*/
+ btrfs_might_wait_for_event(fs_info, btrfs_trans_pending_ordered);
wait_event(cur_trans->pending_wait,
atomic_read(&cur_trans->pending_ordered) == 0);
@@ -2284,10 +2303,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
add_pending_snapshot(trans);
cur_trans->state = TRANS_STATE_COMMIT_DOING;
spin_unlock(&fs_info->trans_lock);
+
+ /*
+ * The thread has started/joined the transaction thus it holds the
+ * lockdep map as a reader. It has to release it before acquiring the
+ * lockdep map as a writer.
+ */
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
+ btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers);
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
/*
+ * Make lockdep happy by acquiring the state locks after
+ * btrfs_trans_num_writers is released. If we acquired the state locks
+ * before releasing the btrfs_trans_num_writers lock then lockdep would
+ * complain because we did not follow the reverse order unlocking rule.
+ */
+ btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
+ btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+ btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+
+ /*
* We've started the commit, clear the flag in case we were triggered to
* do an async commit but somebody else started before the transaction
* kthread could do the work.
@@ -2296,6 +2333,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
goto scrub_continue;
}
/*
@@ -2344,12 +2382,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto unlock_reloc;
- /*
- * Since the transaction is done, we can apply the pending changes
- * before the next transaction.
- */
- btrfs_apply_pending_changes(fs_info);
-
/* commit_fs_roots gets rid of all the tree log roots, it is now
* safe to free the root of tree log roots
*/
@@ -2430,6 +2462,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
mutex_unlock(&fs_info->reloc_mutex);
wake_up(&fs_info->transaction_wait);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
ret = btrfs_write_and_wait_transaction(trans);
if (ret) {
@@ -2461,6 +2494,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
*/
cur_trans->state = TRANS_STATE_SUPER_COMMITTED;
wake_up(&cur_trans->commit_wait);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
btrfs_finish_extent_commit(trans);
@@ -2474,6 +2508,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
*/
cur_trans->state = TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
spin_lock(&fs_info->trans_lock);
list_del_init(&cur_trans->list);
@@ -2502,7 +2537,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
unlock_reloc:
mutex_unlock(&fs_info->reloc_mutex);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
scrub_continue:
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
btrfs_scrub_continue(fs_info);
cleanup_transaction:
btrfs_trans_release_metadata(trans);
@@ -2515,6 +2553,16 @@ cleanup_transaction:
cleanup_transaction(trans, ret);
return ret;
+
+lockdep_release:
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
+ goto cleanup_transaction;
+
+lockdep_trans_commit_start_release:
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ btrfs_end_transaction(trans);
+ return ret;
}
/*
@@ -2556,21 +2604,17 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
return (ret < 0) ? 0 : 1;
}
-void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
+int __init btrfs_transaction_init(void)
{
- unsigned long prev;
- unsigned long bit;
-
- prev = xchg(&fs_info->pending_changes, 0);
- if (!prev)
- return;
-
- bit = 1 << BTRFS_PENDING_COMMIT;
- if (prev & bit)
- btrfs_debug(fs_info, "pending commit done");
- prev &= ~bit;
+ btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
+ sizeof(struct btrfs_trans_handle), 0,
+ SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_trans_handle_cachep)
+ return -ENOMEM;
+ return 0;
+}
- if (prev)
- btrfs_warn(fs_info,
- "unknown pending changes left 0x%lx, ignoring", prev);
+void __cold btrfs_transaction_exit(void)
+{
+ kmem_cache_destroy(btrfs_trans_handle_cachep);
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 970ff316069d..97f6c39f59c8 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -10,6 +10,7 @@
#include "btrfs_inode.h"
#include "delayed-ref.h"
#include "ctree.h"
+#include "misc.h"
enum btrfs_trans_state {
TRANS_STATE_RUNNING,
@@ -98,14 +99,15 @@ struct btrfs_transaction {
struct list_head releasing_ebs;
};
-#define __TRANS_FREEZABLE (1U << 0)
-
-#define __TRANS_START (1U << 9)
-#define __TRANS_ATTACH (1U << 10)
-#define __TRANS_JOIN (1U << 11)
-#define __TRANS_JOIN_NOLOCK (1U << 12)
-#define __TRANS_DUMMY (1U << 13)
-#define __TRANS_JOIN_NOSTART (1U << 14)
+enum {
+ ENUM_BIT(__TRANS_FREEZABLE),
+ ENUM_BIT(__TRANS_START),
+ ENUM_BIT(__TRANS_ATTACH),
+ ENUM_BIT(__TRANS_JOIN),
+ ENUM_BIT(__TRANS_JOIN_NOLOCK),
+ ENUM_BIT(__TRANS_DUMMY),
+ ENUM_BIT(__TRANS_JOIN_NOSTART),
+};
#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
#define TRANS_ATTACH (__TRANS_ATTACH)
@@ -231,9 +233,11 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark);
int btrfs_transaction_blocked(struct btrfs_fs_info *info);
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
void btrfs_put_transaction(struct btrfs_transaction *transaction);
-void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info);
void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
+int __init btrfs_transaction_init(void);
+void __cold btrfs_transaction_exit(void);
+
#endif
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 43f905ab0a18..baad1ed7e111 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -18,6 +18,7 @@
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/error-injection.h>
+#include "messages.h"
#include "ctree.h"
#include "tree-checker.h"
#include "disk-io.h"
@@ -25,6 +26,9 @@
#include "volumes.h"
#include "misc.h"
#include "btrfs_inode.h"
+#include "fs.h"
+#include "accessors.h"
+#include "file-item.h"
/*
* Error message should follow the following format:
@@ -528,7 +532,7 @@ static int check_dir_item(struct extent_buffer *leaf,
}
/* dir type check */
- dir_type = btrfs_dir_type(leaf, di);
+ dir_type = btrfs_dir_ftype(leaf, di);
if (unlikely(dir_type >= BTRFS_FT_MAX)) {
dir_item_err(leaf, slot,
"invalid dir item type, have %u expect [0, %u)",
@@ -1780,10 +1784,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
/* Also check if the item pointer overlaps with btrfs item. */
if (unlikely(btrfs_item_ptr_offset(leaf, slot) <
- btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item))) {
+ btrfs_item_nr_offset(leaf, slot) + sizeof(struct btrfs_item))) {
generic_err(leaf, slot,
"slot overlaps with its data, item end %lu data start %lu",
- btrfs_item_nr_offset(slot) +
+ btrfs_item_nr_offset(leaf, slot) +
sizeof(struct btrfs_item),
btrfs_item_ptr_offset(leaf, slot));
return -EUCLEAN;
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index ece497e26558..bfb5efa4e01f 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -6,8 +6,39 @@
#ifndef BTRFS_TREE_CHECKER_H
#define BTRFS_TREE_CHECKER_H
-#include "ctree.h"
-#include "extent_io.h"
+#include <uapi/linux/btrfs_tree.h>
+
+struct extent_buffer;
+struct btrfs_chunk;
+
+/* All the extra info needed to verify the parentness of a tree block. */
+struct btrfs_tree_parent_check {
+ /*
+ * The owner check against the tree block.
+ *
+ * Can be 0 to skip the owner check.
+ */
+ u64 owner_root;
+
+ /*
+ * Expected transid, can be 0 to skip the check, but such skip
+ * should only be utlized for backref walk related code.
+ */
+ u64 transid;
+
+ /*
+ * The expected first key.
+ *
+ * This check can be skipped if @has_first_key is false, such skip
+ * can happen for case where we don't have the parent node key,
+ * e.g. reading the tree root, doing backref walk.
+ */
+ struct btrfs_key first_key;
+ bool has_first_key;
+
+ /* The expected level. Should always be set. */
+ u8 level;
+};
/*
* Comprehensive leaf checker.
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
deleted file mode 100644
index b6cf39f4e7e4..000000000000
--- a/fs/btrfs/tree-defrag.c
+++ /dev/null
@@ -1,132 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2007 Oracle. All rights reserved.
- */
-
-#include <linux/sched.h>
-#include "ctree.h"
-#include "disk-io.h"
-#include "print-tree.h"
-#include "transaction.h"
-#include "locking.h"
-
-/*
- * Defrag all the leaves in a given btree.
- * Read all the leaves and try to get key order to
- * better reflect disk order
- */
-
-int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
-{
- struct btrfs_path *path = NULL;
- struct btrfs_key key;
- int ret = 0;
- int wret;
- int level;
- int next_key_ret = 0;
- u64 last_ret = 0;
-
- if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
- goto out;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- level = btrfs_header_level(root->node);
-
- if (level == 0)
- goto out;
-
- if (root->defrag_progress.objectid == 0) {
- struct extent_buffer *root_node;
- u32 nritems;
-
- root_node = btrfs_lock_root_node(root);
- nritems = btrfs_header_nritems(root_node);
- root->defrag_max.objectid = 0;
- /* from above we know this is not a leaf */
- btrfs_node_key_to_cpu(root_node, &root->defrag_max,
- nritems - 1);
- btrfs_tree_unlock(root_node);
- free_extent_buffer(root_node);
- memset(&key, 0, sizeof(key));
- } else {
- memcpy(&key, &root->defrag_progress, sizeof(key));
- }
-
- path->keep_locks = 1;
-
- ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
- if (ret < 0)
- goto out;
- if (ret > 0) {
- ret = 0;
- goto out;
- }
- btrfs_release_path(path);
- /*
- * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
- * leafs from path->nodes[1], so set lowest_level to 1 to avoid later
- * a deadlock (attempting to write lock an already write locked leaf).
- */
- path->lowest_level = 1;
- wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
-
- if (wret < 0) {
- ret = wret;
- goto out;
- }
- if (!path->nodes[1]) {
- ret = 0;
- goto out;
- }
- /*
- * The node at level 1 must always be locked when our path has
- * keep_locks set and lowest_level is 1, regardless of the value of
- * path->slots[1].
- */
- BUG_ON(path->locks[1] == 0);
- ret = btrfs_realloc_node(trans, root,
- path->nodes[1], 0,
- &last_ret,
- &root->defrag_progress);
- if (ret) {
- WARN_ON(ret == -EAGAIN);
- goto out;
- }
- /*
- * Now that we reallocated the node we can find the next key. Note that
- * btrfs_find_next_key() can release our path and do another search
- * without COWing, this is because even with path->keep_locks = 1,
- * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a
- * node when path->slots[node_level - 1] does not point to the last
- * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
- * we search for the next key after reallocating our node.
- */
- path->slots[1] = btrfs_header_nritems(path->nodes[1]);
- next_key_ret = btrfs_find_next_key(root, path, &key, 1,
- BTRFS_OLDEST_GENERATION);
- if (next_key_ret == 0) {
- memcpy(&root->defrag_progress, &key, sizeof(key));
- ret = -EAGAIN;
- }
-out:
- btrfs_free_path(path);
- if (ret == -EAGAIN) {
- if (root->defrag_max.objectid > root->defrag_progress.objectid)
- goto done;
- if (root->defrag_max.type > root->defrag_progress.type)
- goto done;
- if (root->defrag_max.offset > root->defrag_progress.offset)
- goto done;
- ret = 0;
- }
-done:
- if (ret != -EAGAIN)
- memset(&root->defrag_progress, 0,
- sizeof(root->defrag_progress));
-
- return ret;
-}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9205c4a5ca81..a3c43f0b1c95 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -21,6 +21,17 @@
#include "space-info.h"
#include "zoned.h"
#include "inode-item.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "root-tree.h"
+#include "dir-item.h"
+#include "file-item.h"
+#include "file.h"
+#include "orphan.h"
+#include "tree-checker.h"
+
+#define MAX_CONFLICT_INODES 10
/* magic values for the inode_only field in btrfs_log_inode:
*
@@ -31,8 +42,6 @@
enum {
LOG_INODE_ALL,
LOG_INODE_EXISTS,
- LOG_OTHER_INODE,
- LOG_OTHER_INODE_ALL,
};
/*
@@ -333,7 +342,12 @@ static int process_one_buffer(struct btrfs_root *log,
* pin down any logged extents, so we have to read the block.
*/
if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
- ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
+ struct btrfs_tree_parent_check check = {
+ .level = level,
+ .transid = gen
+ };
+
+ ret = btrfs_read_extent_buffer(eb, &check);
if (ret)
return ret;
}
@@ -351,11 +365,25 @@ static int process_one_buffer(struct btrfs_root *log,
return ret;
}
-static int do_overwrite_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+/*
+ * Item overwrite used by replay and tree logging. eb, slot and key all refer
+ * to the src data we are copying out.
+ *
+ * root is the tree we are copying into, and path is a scratch
+ * path for use in this function (it should be released on entry and
+ * will be released on exit).
+ *
+ * If the key is already in the destination tree the existing item is
+ * overwritten. If the existing item isn't big enough, it is extended.
+ * If it is too large, it is truncated.
+ *
+ * If the key isn't in the destination yet, a new item is inserted.
+ */
+static int overwrite_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
{
int ret;
u32 item_size;
@@ -363,31 +391,24 @@ static int do_overwrite_item(struct btrfs_trans_handle *trans,
int save_old_i_size = 0;
unsigned long src_ptr;
unsigned long dst_ptr;
- int overwrite_root = 0;
bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
- overwrite_root = 1;
+ /*
+ * This is only used during log replay, so the root is always from a
+ * fs/subvolume tree. In case we ever need to support a log root, then
+ * we'll have to clone the leaf in the path, release the path and use
+ * the leaf before writing into the log tree. See the comments at
+ * copy_items() for more details.
+ */
+ ASSERT(root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
item_size = btrfs_item_size(eb, slot);
src_ptr = btrfs_item_ptr_offset(eb, slot);
- /* Our caller must have done a search for the key for us. */
- ASSERT(path->nodes[0] != NULL);
-
- /*
- * And the slot must point to the exact key or the slot where the key
- * should be at (the first item with a key greater than 'key')
- */
- if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
- struct btrfs_key found_key;
-
- btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
- ret = btrfs_comp_cpu_keys(&found_key, key);
- ASSERT(ret >= 0);
- } else {
- ret = 1;
- }
+ /* Look for the key in the destination tree. */
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret < 0)
+ return ret;
if (ret == 0) {
char *src_copy;
@@ -532,8 +553,7 @@ insert:
goto no_copy;
}
- if (overwrite_root &&
- S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
+ if (S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
save_old_i_size = 1;
saved_i_size = btrfs_inode_size(path->nodes[0],
@@ -565,34 +585,19 @@ no_copy:
return 0;
}
-/*
- * Item overwrite used by replay and tree logging. eb, slot and key all refer
- * to the src data we are copying out.
- *
- * root is the tree we are copying into, and path is a scratch
- * path for use in this function (it should be released on entry and
- * will be released on exit).
- *
- * If the key is already in the destination tree the existing item is
- * overwritten. If the existing item isn't big enough, it is extended.
- * If it is too large, it is truncated.
- *
- * If the key isn't in the destination yet, a new item is inserted.
- */
-static int overwrite_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
+ struct fscrypt_str *name)
{
- int ret;
+ char *buf;
- /* Look for the key in the destination tree. */
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
- if (ret < 0)
- return ret;
+ buf = kmalloc(len, GFP_NOFS);
+ if (!buf)
+ return -ENOMEM;
- return do_overwrite_item(trans, root, path, eb, slot, key);
+ read_extent_buffer(eb, buf, (unsigned long)start, len);
+ name->name = buf;
+ name->len = len;
+ return 0;
}
/*
@@ -747,8 +752,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
*/
ret = btrfs_qgroup_trace_extent(trans,
btrfs_file_extent_disk_bytenr(eb, item),
- btrfs_file_extent_disk_num_bytes(eb, item),
- GFP_NOFS);
+ btrfs_file_extent_disk_num_bytes(eb, item));
if (ret < 0)
goto out;
@@ -799,9 +803,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
btrfs_file_extent_num_bytes(eb, item);
}
- ret = btrfs_lookup_csums_range(root->log_root,
+ ret = btrfs_lookup_csums_list(root->log_root,
csum_start, csum_end - 1,
- &ordered_sums, 0);
+ &ordered_sums, 0, false);
if (ret)
goto out;
/*
@@ -901,12 +905,11 @@ out:
static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
- const char *name,
- int name_len)
+ const struct fscrypt_str *name)
{
int ret;
- ret = btrfs_unlink_inode(trans, dir, inode, name, name_len);
+ ret = btrfs_unlink_inode(trans, dir, inode, name);
if (ret)
return ret;
/*
@@ -933,8 +936,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = dir->root;
struct inode *inode;
- char *name;
- int name_len;
+ struct fscrypt_str name;
struct extent_buffer *leaf;
struct btrfs_key location;
int ret;
@@ -942,12 +944,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
btrfs_dir_item_key_to_cpu(leaf, di, &location);
- name_len = btrfs_dir_name_len(leaf, di);
- name = kmalloc(name_len, GFP_NOFS);
- if (!name)
+ ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
+ if (ret)
return -ENOMEM;
- read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
btrfs_release_path(path);
inode = read_one_inode(root, location.objectid);
@@ -960,10 +960,9 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), name,
- name_len);
+ ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name);
out:
- kfree(name);
+ kfree(name.name);
iput(inode);
return ret;
}
@@ -978,14 +977,14 @@ out:
static noinline int inode_in_dir(struct btrfs_root *root,
struct btrfs_path *path,
u64 dirid, u64 objectid, u64 index,
- const char *name, int name_len)
+ struct fscrypt_str *name)
{
struct btrfs_dir_item *di;
struct btrfs_key location;
int ret = 0;
di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
- index, name, name_len, 0);
+ index, name, 0);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
goto out;
@@ -998,7 +997,7 @@ static noinline int inode_in_dir(struct btrfs_root *root,
}
btrfs_release_path(path);
- di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
+ di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
goto out;
@@ -1025,7 +1024,7 @@ out:
static noinline int backref_in_log(struct btrfs_root *log,
struct btrfs_key *key,
u64 ref_objectid,
- const char *name, int namelen)
+ const struct fscrypt_str *name)
{
struct btrfs_path *path;
int ret;
@@ -1045,12 +1044,10 @@ static noinline int backref_in_log(struct btrfs_root *log,
if (key->type == BTRFS_INODE_EXTREF_KEY)
ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
path->slots[0],
- ref_objectid,
- name, namelen);
+ ref_objectid, name);
else
ret = !!btrfs_find_name_in_backref(path->nodes[0],
- path->slots[0],
- name, namelen);
+ path->slots[0], name);
out:
btrfs_free_path(path);
return ret;
@@ -1063,12 +1060,9 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
u64 inode_objectid, u64 parent_objectid,
- u64 ref_index, char *name, int namelen,
- int *search_done)
+ u64 ref_index, struct fscrypt_str *name)
{
int ret;
- char *victim_name;
- int victim_name_len;
struct extent_buffer *leaf;
struct btrfs_dir_item *di;
struct btrfs_key search_key;
@@ -1100,50 +1094,40 @@ again:
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
while (ptr < ptr_end) {
- victim_ref = (struct btrfs_inode_ref *)ptr;
- victim_name_len = btrfs_inode_ref_name_len(leaf,
- victim_ref);
- victim_name = kmalloc(victim_name_len, GFP_NOFS);
- if (!victim_name)
- return -ENOMEM;
+ struct fscrypt_str victim_name;
- read_extent_buffer(leaf, victim_name,
- (unsigned long)(victim_ref + 1),
- victim_name_len);
+ victim_ref = (struct btrfs_inode_ref *)ptr;
+ ret = read_alloc_one_name(leaf, (victim_ref + 1),
+ btrfs_inode_ref_name_len(leaf, victim_ref),
+ &victim_name);
+ if (ret)
+ return ret;
ret = backref_in_log(log_root, &search_key,
- parent_objectid, victim_name,
- victim_name_len);
+ parent_objectid, &victim_name);
if (ret < 0) {
- kfree(victim_name);
+ kfree(victim_name.name);
return ret;
} else if (!ret) {
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
ret = unlink_inode_for_log_replay(trans, dir, inode,
- victim_name, victim_name_len);
- kfree(victim_name);
+ &victim_name);
+ kfree(victim_name.name);
if (ret)
return ret;
- *search_done = 1;
goto again;
}
- kfree(victim_name);
+ kfree(victim_name.name);
- ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
+ ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
}
-
- /*
- * NOTE: we have searched root tree and checked the
- * corresponding ref, it does not need to check again.
- */
- *search_done = 1;
}
btrfs_release_path(path);
/* Same search but for extended refs */
- extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
+ extref = btrfs_lookup_inode_extref(NULL, root, path, name,
inode_objectid, parent_objectid, 0,
0);
if (IS_ERR(extref)) {
@@ -1160,29 +1144,28 @@ again:
base = btrfs_item_ptr_offset(leaf, path->slots[0]);
while (cur_offset < item_size) {
- extref = (struct btrfs_inode_extref *)(base + cur_offset);
+ struct fscrypt_str victim_name;
- victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
+ extref = (struct btrfs_inode_extref *)(base + cur_offset);
if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
goto next;
- victim_name = kmalloc(victim_name_len, GFP_NOFS);
- if (!victim_name)
- return -ENOMEM;
- read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
- victim_name_len);
+ ret = read_alloc_one_name(leaf, &extref->name,
+ btrfs_inode_extref_name_len(leaf, extref),
+ &victim_name);
+ if (ret)
+ return ret;
search_key.objectid = inode_objectid;
search_key.type = BTRFS_INODE_EXTREF_KEY;
search_key.offset = btrfs_extref_hash(parent_objectid,
- victim_name,
- victim_name_len);
+ victim_name.name,
+ victim_name.len);
ret = backref_in_log(log_root, &search_key,
- parent_objectid, victim_name,
- victim_name_len);
+ parent_objectid, &victim_name);
if (ret < 0) {
- kfree(victim_name);
+ kfree(victim_name.name);
return ret;
} else if (!ret) {
ret = -ENOENT;
@@ -1194,28 +1177,24 @@ again:
ret = unlink_inode_for_log_replay(trans,
BTRFS_I(victim_parent),
- inode,
- victim_name,
- victim_name_len);
+ inode, &victim_name);
}
iput(victim_parent);
- kfree(victim_name);
+ kfree(victim_name.name);
if (ret)
return ret;
- *search_done = 1;
goto again;
}
- kfree(victim_name);
+ kfree(victim_name.name);
next:
- cur_offset += victim_name_len + sizeof(*extref);
+ cur_offset += victim_name.len + sizeof(*extref);
}
- *search_done = 1;
}
btrfs_release_path(path);
/* look for a conflicting sequence number */
di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
- ref_index, name, namelen, 0);
+ ref_index, name, 0);
if (IS_ERR(di)) {
return PTR_ERR(di);
} else if (di) {
@@ -1226,8 +1205,7 @@ next:
btrfs_release_path(path);
/* look for a conflicting name */
- di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
- name, namelen, 0);
+ di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0);
if (IS_ERR(di)) {
return PTR_ERR(di);
} else if (di) {
@@ -1241,20 +1219,18 @@ next:
}
static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
- u32 *namelen, char **name, u64 *index,
+ struct fscrypt_str *name, u64 *index,
u64 *parent_objectid)
{
struct btrfs_inode_extref *extref;
+ int ret;
extref = (struct btrfs_inode_extref *)ref_ptr;
- *namelen = btrfs_inode_extref_name_len(eb, extref);
- *name = kmalloc(*namelen, GFP_NOFS);
- if (*name == NULL)
- return -ENOMEM;
-
- read_extent_buffer(eb, *name, (unsigned long)&extref->name,
- *namelen);
+ ret = read_alloc_one_name(eb, &extref->name,
+ btrfs_inode_extref_name_len(eb, extref), name);
+ if (ret)
+ return ret;
if (index)
*index = btrfs_inode_extref_index(eb, extref);
@@ -1265,18 +1241,17 @@ static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
}
static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
- u32 *namelen, char **name, u64 *index)
+ struct fscrypt_str *name, u64 *index)
{
struct btrfs_inode_ref *ref;
+ int ret;
ref = (struct btrfs_inode_ref *)ref_ptr;
- *namelen = btrfs_inode_ref_name_len(eb, ref);
- *name = kmalloc(*namelen, GFP_NOFS);
- if (*name == NULL)
- return -ENOMEM;
-
- read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
+ ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref),
+ name);
+ if (ret)
+ return ret;
if (index)
*index = btrfs_inode_ref_index(eb, ref);
@@ -1318,28 +1293,24 @@ again:
ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
while (ref_ptr < ref_end) {
- char *name = NULL;
- int namelen;
+ struct fscrypt_str name;
u64 parent_id;
if (key->type == BTRFS_INODE_EXTREF_KEY) {
- ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
+ ret = extref_get_fields(eb, ref_ptr, &name,
NULL, &parent_id);
} else {
parent_id = key->offset;
- ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
- NULL);
+ ret = ref_get_fields(eb, ref_ptr, &name, NULL);
}
if (ret)
goto out;
if (key->type == BTRFS_INODE_EXTREF_KEY)
ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
- parent_id, name,
- namelen);
+ parent_id, &name);
else
- ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
- name, namelen);
+ ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name);
if (!ret) {
struct inode *dir;
@@ -1348,20 +1319,20 @@ again:
dir = read_one_inode(root, parent_id);
if (!dir) {
ret = -ENOENT;
- kfree(name);
+ kfree(name.name);
goto out;
}
ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
- inode, name, namelen);
- kfree(name);
+ inode, &name);
+ kfree(name.name);
iput(dir);
if (ret)
goto out;
goto again;
}
- kfree(name);
- ref_ptr += namelen;
+ kfree(name.name);
+ ref_ptr += name.len;
if (key->type == BTRFS_INODE_EXTREF_KEY)
ref_ptr += sizeof(struct btrfs_inode_extref);
else
@@ -1373,103 +1344,6 @@ again:
return ret;
}
-static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
- const u8 ref_type, const char *name,
- const int namelen)
-{
- struct btrfs_key key;
- struct btrfs_path *path;
- const u64 parent_id = btrfs_ino(BTRFS_I(dir));
- int ret;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- key.objectid = btrfs_ino(BTRFS_I(inode));
- key.type = ref_type;
- if (key.type == BTRFS_INODE_REF_KEY)
- key.offset = parent_id;
- else
- key.offset = btrfs_extref_hash(parent_id, name, namelen);
-
- ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
- if (ret > 0) {
- ret = 0;
- goto out;
- }
- if (key.type == BTRFS_INODE_EXTREF_KEY)
- ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
- path->slots[0], parent_id, name, namelen);
- else
- ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
- name, namelen);
-
-out:
- btrfs_free_path(path);
- return ret;
-}
-
-static int add_link(struct btrfs_trans_handle *trans,
- struct inode *dir, struct inode *inode, const char *name,
- int namelen, u64 ref_index)
-{
- struct btrfs_root *root = BTRFS_I(dir)->root;
- struct btrfs_dir_item *dir_item;
- struct btrfs_key key;
- struct btrfs_path *path;
- struct inode *other_inode = NULL;
- int ret;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- dir_item = btrfs_lookup_dir_item(NULL, root, path,
- btrfs_ino(BTRFS_I(dir)),
- name, namelen, 0);
- if (!dir_item) {
- btrfs_release_path(path);
- goto add_link;
- } else if (IS_ERR(dir_item)) {
- ret = PTR_ERR(dir_item);
- goto out;
- }
-
- /*
- * Our inode's dentry collides with the dentry of another inode which is
- * in the log but not yet processed since it has a higher inode number.
- * So delete that other dentry.
- */
- btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
- btrfs_release_path(path);
- other_inode = read_one_inode(root, key.objectid);
- if (!other_inode) {
- ret = -ENOENT;
- goto out;
- }
- ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(other_inode),
- name, namelen);
- if (ret)
- goto out;
- /*
- * If we dropped the link count to 0, bump it so that later the iput()
- * on the inode will not free it. We will fixup the link count later.
- */
- if (other_inode->i_nlink == 0)
- set_nlink(other_inode, 1);
-add_link:
- ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
- name, namelen, 0, ref_index);
-out:
- iput(other_inode);
- btrfs_free_path(path);
-
- return ret;
-}
-
/*
* replay one inode back reference item found in the log tree.
* eb, slot and key refer to the buffer and key found in the log tree.
@@ -1487,10 +1361,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
struct inode *inode = NULL;
unsigned long ref_ptr;
unsigned long ref_end;
- char *name = NULL;
- int namelen;
+ struct fscrypt_str name;
int ret;
- int search_done = 0;
int log_ref_ver = 0;
u64 parent_objectid;
u64 inode_objectid;
@@ -1533,7 +1405,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
while (ref_ptr < ref_end) {
if (log_ref_ver) {
- ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
+ ret = extref_get_fields(eb, ref_ptr, &name,
&ref_index, &parent_objectid);
/*
* parent object can change from one array
@@ -1546,15 +1418,13 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
goto out;
}
} else {
- ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
- &ref_index);
+ ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
}
if (ret)
goto out;
ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
- btrfs_ino(BTRFS_I(inode)), ref_index,
- name, namelen);
+ btrfs_ino(BTRFS_I(inode)), ref_index, &name);
if (ret < 0) {
goto out;
} else if (ret == 0) {
@@ -1565,51 +1435,19 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* overwrite any existing back reference, and we don't
* want to create dangling pointers in the directory.
*/
-
- if (!search_done) {
- ret = __add_inode_ref(trans, root, path, log,
- BTRFS_I(dir),
- BTRFS_I(inode),
- inode_objectid,
- parent_objectid,
- ref_index, name, namelen,
- &search_done);
- if (ret) {
- if (ret == 1)
- ret = 0;
- goto out;
- }
- }
-
- /*
- * If a reference item already exists for this inode
- * with the same parent and name, but different index,
- * drop it and the corresponding directory index entries
- * from the parent before adding the new reference item
- * and dir index entries, otherwise we would fail with
- * -EEXIST returned from btrfs_add_link() below.
- */
- ret = btrfs_inode_ref_exists(inode, dir, key->type,
- name, namelen);
- if (ret > 0) {
- ret = unlink_inode_for_log_replay(trans,
- BTRFS_I(dir),
- BTRFS_I(inode),
- name, namelen);
- /*
- * If we dropped the link count to 0, bump it so
- * that later the iput() on the inode will not
- * free it. We will fixup the link count later.
- */
- if (!ret && inode->i_nlink == 0)
- set_nlink(inode, 1);
- }
- if (ret < 0)
+ ret = __add_inode_ref(trans, root, path, log,
+ BTRFS_I(dir), BTRFS_I(inode),
+ inode_objectid, parent_objectid,
+ ref_index, &name);
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
goto out;
+ }
/* insert our name */
- ret = add_link(trans, dir, inode, name, namelen,
- ref_index);
+ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
+ &name, 0, ref_index);
if (ret)
goto out;
@@ -1619,9 +1457,9 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
}
/* Else, ret == 1, we already have a perfect match, we're done. */
- ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
- kfree(name);
- name = NULL;
+ ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
+ kfree(name.name);
+ name.name = NULL;
if (log_ref_ver) {
iput(dir);
dir = NULL;
@@ -1645,7 +1483,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
ret = overwrite_item(trans, root, path, eb, slot, key);
out:
btrfs_release_path(path);
- kfree(name);
+ kfree(name.name);
iput(dir);
iput(inode);
return ret;
@@ -1917,7 +1755,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 dirid, u64 index,
- char *name, int name_len,
+ const struct fscrypt_str *name,
struct btrfs_key *location)
{
struct inode *inode;
@@ -1935,7 +1773,7 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
}
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
- name_len, 1, index);
+ 1, index);
/* FIXME, put inode into FIXUP list */
@@ -1949,7 +1787,7 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_dir_item *dst_di,
const struct btrfs_key *log_key,
- u8 log_type,
+ u8 log_flags,
bool exists)
{
struct btrfs_key found_key;
@@ -1959,7 +1797,7 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
if (found_key.objectid == log_key->objectid &&
found_key.type == log_key->type &&
found_key.offset == log_key->offset &&
- btrfs_dir_type(path->nodes[0], dst_di) == log_type)
+ btrfs_dir_flags(path->nodes[0], dst_di) == log_flags)
return 1;
/*
@@ -1995,8 +1833,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
struct btrfs_dir_item *di,
struct btrfs_key *key)
{
- char *name;
- int name_len;
+ struct fscrypt_str name;
struct btrfs_dir_item *dir_dst_di;
struct btrfs_dir_item *index_dst_di;
bool dir_dst_matches = false;
@@ -2004,7 +1841,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
struct btrfs_key log_key;
struct btrfs_key search_key;
struct inode *dir;
- u8 log_type;
+ u8 log_flags;
bool exists;
int ret;
bool update_size = true;
@@ -2014,17 +1851,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
if (!dir)
return -EIO;
- name_len = btrfs_dir_name_len(eb, di);
- name = kmalloc(name_len, GFP_NOFS);
- if (!name) {
- ret = -ENOMEM;
+ ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
+ if (ret)
goto out;
- }
-
- log_type = btrfs_dir_type(eb, di);
- read_extent_buffer(eb, name, (unsigned long)(di + 1),
- name_len);
+ log_flags = btrfs_dir_flags(eb, di);
btrfs_dir_item_key_to_cpu(eb, di, &log_key);
ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
btrfs_release_path(path);
@@ -2034,14 +1865,14 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
ret = 0;
dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
- name, name_len, 1);
+ &name, 1);
if (IS_ERR(dir_dst_di)) {
ret = PTR_ERR(dir_dst_di);
goto out;
} else if (dir_dst_di) {
ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
- dir_dst_di, &log_key, log_type,
- exists);
+ dir_dst_di, &log_key,
+ log_flags, exists);
if (ret < 0)
goto out;
dir_dst_matches = (ret == 1);
@@ -2051,14 +1882,14 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
key->objectid, key->offset,
- name, name_len, 1);
+ &name, 1);
if (IS_ERR(index_dst_di)) {
ret = PTR_ERR(index_dst_di);
goto out;
} else if (index_dst_di) {
ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
index_dst_di, &log_key,
- log_type, exists);
+ log_flags, exists);
if (ret < 0)
goto out;
index_dst_matches = (ret == 1);
@@ -2079,7 +1910,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
search_key.objectid = log_key.objectid;
search_key.type = BTRFS_INODE_REF_KEY;
search_key.offset = key->objectid;
- ret = backref_in_log(root->log_root, &search_key, 0, name, name_len);
+ ret = backref_in_log(root->log_root, &search_key, 0, &name);
if (ret < 0) {
goto out;
} else if (ret) {
@@ -2092,8 +1923,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
search_key.objectid = log_key.objectid;
search_key.type = BTRFS_INODE_EXTREF_KEY;
search_key.offset = key->objectid;
- ret = backref_in_log(root->log_root, &search_key, key->objectid, name,
- name_len);
+ ret = backref_in_log(root->log_root, &search_key, key->objectid, &name);
if (ret < 0) {
goto out;
} else if (ret) {
@@ -2104,7 +1934,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
ret = insert_one_name(trans, root, key->objectid, key->offset,
- name, name_len, &log_key);
+ &name, &log_key);
if (ret && ret != -ENOENT && ret != -EEXIST)
goto out;
if (!ret)
@@ -2114,10 +1944,10 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
out:
if (!ret && update_size) {
- btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
+ btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
}
- kfree(name);
+ kfree(name.name);
iput(dir);
if (!ret && name_added)
ret = 1;
@@ -2168,7 +1998,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
* to ever delete the parent directory has it would result in stale
* dentries that can never be deleted.
*/
- if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
+ if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) {
struct btrfs_path *fixup_path;
struct btrfs_key di_key;
@@ -2283,8 +2113,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
struct extent_buffer *eb;
int slot;
struct btrfs_dir_item *di;
- int name_len;
- char *name;
+ struct fscrypt_str name;
struct inode *inode = NULL;
struct btrfs_key location;
@@ -2299,22 +2128,16 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
eb = path->nodes[0];
slot = path->slots[0];
di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
- name_len = btrfs_dir_name_len(eb, di);
- name = kmalloc(name_len, GFP_NOFS);
- if (!name) {
- ret = -ENOMEM;
+ ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
+ if (ret)
goto out;
- }
-
- read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len);
if (log) {
struct btrfs_dir_item *log_di;
log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
dir_key->objectid,
- dir_key->offset,
- name, name_len, 0);
+ dir_key->offset, &name, 0);
if (IS_ERR(log_di)) {
ret = PTR_ERR(log_di);
goto out;
@@ -2340,7 +2163,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
inc_nlink(inode);
ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode),
- name, name_len);
+ &name);
/*
* Unlike dir item keys, dir index keys can only have one name (entry) in
* them, as there are no key collisions since each key has a unique offset
@@ -2349,7 +2172,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
out:
btrfs_release_path(path);
btrfs_release_path(log_path);
- kfree(name);
+ kfree(name.name);
iput(inode);
return ret;
}
@@ -2570,13 +2393,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
struct walk_control *wc, u64 gen, int level)
{
int nritems;
+ struct btrfs_tree_parent_check check = {
+ .transid = gen,
+ .level = level
+ };
struct btrfs_path *path;
struct btrfs_root *root = wc->replay_dest;
struct btrfs_key key;
int i;
int ret;
- ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
+ ret = btrfs_read_extent_buffer(eb, &check);
if (ret)
return ret;
@@ -2756,7 +2583,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
int ret = 0;
while (*level > 0) {
- struct btrfs_key first_key;
+ struct btrfs_tree_parent_check check = { 0 };
cur = path->nodes[*level];
@@ -2768,7 +2595,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
- btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
+ check.transid = ptr_gen;
+ check.level = *level - 1;
+ check.has_first_key = true;
+ btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]);
blocksize = fs_info->nodesize;
next = btrfs_find_create_tree_block(fs_info, bytenr,
@@ -2787,8 +2617,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
path->slots[*level]++;
if (wc->free) {
- ret = btrfs_read_extent_buffer(next, ptr_gen,
- *level - 1, &first_key);
+ ret = btrfs_read_extent_buffer(next, &check);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -2816,7 +2645,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
free_extent_buffer(next);
continue;
}
- ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key);
+ ret = btrfs_read_extent_buffer(next, &check);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -3588,7 +3417,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
u64 dir_ino,
- const char *name, int name_len,
+ const struct fscrypt_str *name,
u64 index)
{
struct btrfs_dir_item *di;
@@ -3598,7 +3427,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans,
* for dir item keys.
*/
di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
- index, name, name_len, -1);
+ index, name, -1);
if (IS_ERR(di))
return PTR_ERR(di);
else if (!di)
@@ -3635,7 +3464,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans,
*/
void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- const char *name, int name_len,
+ const struct fscrypt_str *name,
struct btrfs_inode *dir, u64 index)
{
struct btrfs_path *path;
@@ -3662,7 +3491,7 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
}
ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
- name, name_len, index);
+ name, index);
btrfs_free_path(path);
out_unlock:
mutex_unlock(&dir->log_mutex);
@@ -3674,7 +3503,7 @@ out_unlock:
/* see comments for btrfs_del_dir_entries_in_log */
void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- const char *name, int name_len,
+ const struct fscrypt_str *name,
struct btrfs_inode *inode, u64 dirid)
{
struct btrfs_root *log;
@@ -3695,7 +3524,7 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
log = root->log_root;
mutex_lock(&inode->log_mutex);
- ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
+ ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode),
dirid, &index);
mutex_unlock(&inode->log_mutex);
if (ret < 0 && ret != -ENOENT)
@@ -3834,15 +3663,29 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
u64 *last_old_dentry_offset)
{
struct btrfs_root *log = inode->root->log_root;
- struct extent_buffer *src = path->nodes[0];
- const int nritems = btrfs_header_nritems(src);
+ struct extent_buffer *src;
+ const int nritems = btrfs_header_nritems(path->nodes[0]);
const u64 ino = btrfs_ino(inode);
bool last_found = false;
int batch_start = 0;
int batch_size = 0;
int i;
- for (i = path->slots[0]; i < nritems; i++) {
+ /*
+ * We need to clone the leaf, release the read lock on it, and use the
+ * clone before modifying the log tree. See the comment at copy_items()
+ * about why we need to do this.
+ */
+ src = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!src)
+ return -ENOMEM;
+
+ i = path->slots[0];
+ btrfs_release_path(path);
+ path->nodes[0] = src;
+ path->slots[0] = i;
+
+ for (; i < nritems; i++) {
struct btrfs_dir_item *di;
struct btrfs_key key;
int ret;
@@ -3875,6 +3718,11 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
*last_old_dentry_offset = key.offset;
continue;
}
+
+ /* If we logged this dir index item before, we can skip it. */
+ if (key.offset <= inode->last_dir_index_offset)
+ continue;
+
/*
* We must make sure that when we log a directory entry, the
* corresponding inode, after log replay, has a matching link
@@ -3905,51 +3753,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
ctx->log_new_dentries = true;
}
- if (!ctx->logged_before)
- goto add_to_batch;
-
- /*
- * If we were logged before and have logged dir items, we can skip
- * checking if any item with a key offset larger than the last one
- * we logged is in the log tree, saving time and avoiding adding
- * contention on the log tree. We can only rely on the value of
- * last_dir_index_offset when we know for sure that the inode was
- * previously logged in the current transaction.
- */
- if (key.offset > inode->last_dir_index_offset)
- goto add_to_batch;
- /*
- * Check if the key was already logged before. If not we can add
- * it to a batch for bulk insertion.
- */
- ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0);
- if (ret < 0) {
- return ret;
- } else if (ret > 0) {
- btrfs_release_path(dst_path);
- goto add_to_batch;
- }
-
- /*
- * Item exists in the log. Overwrite the item in the log if it
- * has different content or do nothing if it has exactly the same
- * content. And then flush the current batch if any - do it after
- * overwriting the current item, or we would deadlock otherwise,
- * since we are holding a path for the existing item.
- */
- ret = do_overwrite_item(trans, log, dst_path, src, i, &key);
- if (ret < 0)
- return ret;
-
- if (batch_size > 0) {
- ret = flush_dir_items_batch(trans, log, src, dst_path,
- batch_start, batch_size);
- if (ret < 0)
- return ret;
- batch_size = 0;
- }
- continue;
-add_to_batch:
if (batch_size == 0)
batch_start = i;
batch_size++;
@@ -4136,6 +3939,71 @@ done:
}
/*
+ * If the inode was logged before and it was evicted, then its
+ * last_dir_index_offset is (u64)-1, so we don't the value of the last index
+ * key offset. If that's the case, search for it and update the inode. This
+ * is to avoid lookups in the log tree every time we try to insert a dir index
+ * key from a leaf changed in the current transaction, and to allow us to always
+ * do batch insertions of dir index keys.
+ */
+static int update_last_dir_index_offset(struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct btrfs_log_ctx *ctx)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_key key;
+ int ret;
+
+ lockdep_assert_held(&inode->log_mutex);
+
+ if (inode->last_dir_index_offset != (u64)-1)
+ return 0;
+
+ if (!ctx->logged_before) {
+ inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
+ return 0;
+ }
+
+ key.objectid = ino;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
+ /*
+ * An error happened or we actually have an index key with an offset
+ * value of (u64)-1. Bail out, we're done.
+ */
+ if (ret <= 0)
+ goto out;
+
+ ret = 0;
+ inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
+
+ /*
+ * No dir index items, bail out and leave last_dir_index_offset with
+ * the value right before the first valid index value.
+ */
+ if (path->slots[0] == 0)
+ goto out;
+
+ /*
+ * btrfs_search_slot() left us at one slot beyond the slot with the last
+ * index key, or beyond the last key of the directory that is not an
+ * index key. If we have an index key before, set last_dir_index_offset
+ * to its offset value, otherwise leave it with a value right before the
+ * first valid index value, as it means we have an empty directory.
+ */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY)
+ inode->last_dir_index_offset = key.offset;
+
+out:
+ btrfs_release_path(path);
+
+ return ret;
+}
+
+/*
* logging directories is very similar to logging inodes, We find all the items
* from the current transaction and write them to the log.
*
@@ -4157,6 +4025,10 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
u64 max_key;
int ret;
+ ret = update_last_dir_index_offset(inode, path, ctx);
+ if (ret)
+ return ret;
+
min_key = BTRFS_DIR_START_INDEX;
max_key = 0;
ctx->last_dir_item_offset = inode->last_dir_index_offset;
@@ -4382,8 +4254,8 @@ static int log_csums(struct btrfs_trans_handle *trans,
* file which happens to refer to the same extent as well. Such races
* can leave checksum items in the log with overlapping ranges.
*/
- ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr,
- lock_end, &cached_state);
+ ret = lock_extent(&log_root->log_csum_range, sums->bytenr, lock_end,
+ &cached_state);
if (ret)
return ret;
/*
@@ -4399,8 +4271,8 @@ static int log_csums(struct btrfs_trans_handle *trans,
if (!ret)
ret = btrfs_csum_file_blocks(trans, log_root, sums);
- unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end,
- &cached_state);
+ unlock_extent(&log_root->log_csum_range, sums->bytenr, lock_end,
+ &cached_state);
return ret;
}
@@ -4414,7 +4286,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
{
struct btrfs_root *log = inode->root->log_root;
struct btrfs_file_extent_item *extent;
- struct extent_buffer *src = src_path->nodes[0];
+ struct extent_buffer *src;
int ret = 0;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
@@ -4425,6 +4297,43 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
const u64 i_size = i_size_read(&inode->vfs_inode);
+ /*
+ * To keep lockdep happy and avoid deadlocks, clone the source leaf and
+ * use the clone. This is because otherwise we would be changing the log
+ * tree, to insert items from the subvolume tree or insert csum items,
+ * while holding a read lock on a leaf from the subvolume tree, which
+ * creates a nasty lock dependency when COWing log tree nodes/leaves:
+ *
+ * 1) Modifying the log tree triggers an extent buffer allocation while
+ * holding a write lock on a parent extent buffer from the log tree.
+ * Allocating the pages for an extent buffer, or the extent buffer
+ * struct, can trigger inode eviction and finally the inode eviction
+ * will trigger a release/remove of a delayed node, which requires
+ * taking the delayed node's mutex;
+ *
+ * 2) Allocating a metadata extent for a log tree can trigger the async
+ * reclaim thread and make us wait for it to release enough space and
+ * unblock our reservation ticket. The reclaim thread can start
+ * flushing delayed items, and that in turn results in the need to
+ * lock delayed node mutexes and in the need to write lock extent
+ * buffers of a subvolume tree - all this while holding a write lock
+ * on the parent extent buffer in the log tree.
+ *
+ * So one task in scenario 1) running in parallel with another task in
+ * scenario 2) could lead to a deadlock, one wanting to lock a delayed
+ * node mutex while having a read lock on a leaf from the subvolume,
+ * while the other is holding the delayed node's mutex and wants to
+ * write lock the same subvolume leaf for flushing delayed items.
+ */
+ src = btrfs_clone_extent_buffer(src_path->nodes[0]);
+ if (!src)
+ return -ENOMEM;
+
+ i = src_path->slots[0];
+ btrfs_release_path(src_path);
+ src_path->nodes[0] = src;
+ src_path->slots[0] = i;
+
ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
nr * sizeof(u32), GFP_NOFS);
if (!ins_data)
@@ -4511,9 +4420,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr);
disk_bytenr += extent_offset;
- ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
- disk_bytenr + extent_num_bytes - 1,
- &ordered_sums, 0);
+ ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
+ disk_bytenr + extent_num_bytes - 1,
+ &ordered_sums, 0, false);
if (ret)
goto out;
@@ -4706,10 +4615,9 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
/* block start is already adjusted for the file extent offset. */
csum_root = btrfs_csum_root(trans->fs_info, em->block_start);
- ret = btrfs_lookup_csums_range(csum_root,
- em->block_start + csum_offset,
- em->block_start + csum_offset +
- csum_len - 1, &ordered_sums, 0);
+ ret = btrfs_lookup_csums_list(csum_root, em->block_start + csum_offset,
+ em->block_start + csum_offset +
+ csum_len - 1, &ordered_sums, 0, false);
if (ret)
return ret;
@@ -5221,10 +5129,9 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
* leafs from the log root.
*/
btrfs_release_path(path);
- ret = btrfs_insert_file_extent(trans, root->log_root,
- ino, prev_extent_end, 0,
- 0, hole_len, 0, hole_len,
- 0, 0, 0);
+ ret = btrfs_insert_hole_extent(trans, root->log_root,
+ ino, prev_extent_end,
+ hole_len);
if (ret < 0)
return ret;
@@ -5253,10 +5160,8 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
- ret = btrfs_insert_file_extent(trans, root->log_root,
- ino, prev_extent_end, 0, 0,
- hole_len, 0, hole_len,
- 0, 0, 0);
+ ret = btrfs_insert_hole_extent(trans, root->log_root, ino,
+ prev_extent_end, hole_len);
if (ret < 0)
return ret;
}
@@ -5332,6 +5237,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
u32 this_len;
unsigned long name_ptr;
struct btrfs_dir_item *di;
+ struct fscrypt_str name_str;
if (key->type == BTRFS_INODE_REF_KEY) {
struct btrfs_inode_ref *iref;
@@ -5365,8 +5271,11 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
}
read_extent_buffer(eb, name, name_ptr, this_name_len);
+
+ name_str.name = name;
+ name_str.len = this_name_len;
di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
- parent, name, this_name_len, 0);
+ parent, &name_str, 0);
if (di && !IS_ERR(di)) {
struct btrfs_key di_key;
@@ -5399,111 +5308,461 @@ out:
return ret;
}
+/*
+ * Check if we need to log an inode. This is used in contexts where while
+ * logging an inode we need to log another inode (either that it exists or in
+ * full mode). This is used instead of btrfs_inode_in_log() because the later
+ * requires the inode to be in the log and have the log transaction committed,
+ * while here we do not care if the log transaction was already committed - our
+ * caller will commit the log later - and we want to avoid logging an inode
+ * multiple times when multiple tasks have joined the same log transaction.
+ */
+static bool need_log_inode(const struct btrfs_trans_handle *trans,
+ const struct btrfs_inode *inode)
+{
+ /*
+ * If a directory was not modified, no dentries added or removed, we can
+ * and should avoid logging it.
+ */
+ if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
+ return false;
+
+ /*
+ * If this inode does not have new/updated/deleted xattrs since the last
+ * time it was logged and is flagged as logged in the current transaction,
+ * we can skip logging it. As for new/deleted names, those are updated in
+ * the log by link/unlink/rename operations.
+ * In case the inode was logged and then evicted and reloaded, its
+ * logged_trans will be 0, in which case we have to fully log it since
+ * logged_trans is a transient field, not persisted.
+ */
+ if (inode->logged_trans == trans->transid &&
+ !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
+ return false;
+
+ return true;
+}
+
+struct btrfs_dir_list {
+ u64 ino;
+ struct list_head list;
+};
+
+/*
+ * Log the inodes of the new dentries of a directory.
+ * See process_dir_items_leaf() for details about why it is needed.
+ * This is a recursive operation - if an existing dentry corresponds to a
+ * directory, that directory's new entries are logged too (same behaviour as
+ * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
+ * the dentries point to we do not acquire their VFS lock, otherwise lockdep
+ * complains about the following circular lock dependency / possible deadlock:
+ *
+ * CPU0 CPU1
+ * ---- ----
+ * lock(&type->i_mutex_dir_key#3/2);
+ * lock(sb_internal#2);
+ * lock(&type->i_mutex_dir_key#3/2);
+ * lock(&sb->s_type->i_mutex_key#14);
+ *
+ * Where sb_internal is the lock (a counter that works as a lock) acquired by
+ * sb_start_intwrite() in btrfs_start_transaction().
+ * Not acquiring the VFS lock of the inodes is still safe because:
+ *
+ * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
+ * that while logging the inode new references (names) are added or removed
+ * from the inode, leaving the logged inode item with a link count that does
+ * not match the number of logged inode reference items. This is fine because
+ * at log replay time we compute the real number of links and correct the
+ * link count in the inode item (see replay_one_buffer() and
+ * link_to_fixup_dir());
+ *
+ * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
+ * while logging the inode's items new index items (key type
+ * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
+ * has a size that doesn't match the sum of the lengths of all the logged
+ * names - this is ok, not a problem, because at log replay time we set the
+ * directory's i_size to the correct value (see replay_one_name() and
+ * overwrite_item()).
+ */
+static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *start_inode,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *root = start_inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_path *path;
+ LIST_HEAD(dir_list);
+ struct btrfs_dir_list *dir_elem;
+ u64 ino = btrfs_ino(start_inode);
+ int ret = 0;
+
+ /*
+ * If we are logging a new name, as part of a link or rename operation,
+ * don't bother logging new dentries, as we just want to log the names
+ * of an inode and that any new parents exist.
+ */
+ if (ctx->logging_new_name)
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (true) {
+ struct extent_buffer *leaf;
+ struct btrfs_key min_key;
+ bool continue_curr_inode = true;
+ int nritems;
+ int i;
+
+ min_key.objectid = ino;
+ min_key.type = BTRFS_DIR_INDEX_KEY;
+ min_key.offset = 0;
+again:
+ btrfs_release_path(path);
+ ret = btrfs_search_forward(root, &min_key, path, trans->transid);
+ if (ret < 0) {
+ break;
+ } else if (ret > 0) {
+ ret = 0;
+ goto next;
+ }
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_dir_item *di;
+ struct btrfs_key di_key;
+ struct inode *di_inode;
+ int log_mode = LOG_INODE_EXISTS;
+ int type;
+
+ btrfs_item_key_to_cpu(leaf, &min_key, i);
+ if (min_key.objectid != ino ||
+ min_key.type != BTRFS_DIR_INDEX_KEY) {
+ continue_curr_inode = false;
+ break;
+ }
+
+ di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
+ type = btrfs_dir_ftype(leaf, di);
+ if (btrfs_dir_transid(leaf, di) < trans->transid)
+ continue;
+ btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
+ if (di_key.type == BTRFS_ROOT_ITEM_KEY)
+ continue;
+
+ btrfs_release_path(path);
+ di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
+ if (IS_ERR(di_inode)) {
+ ret = PTR_ERR(di_inode);
+ goto out;
+ }
+
+ if (!need_log_inode(trans, BTRFS_I(di_inode))) {
+ btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ break;
+ }
+
+ ctx->log_new_dentries = false;
+ if (type == BTRFS_FT_DIR)
+ log_mode = LOG_INODE_ALL;
+ ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
+ log_mode, ctx);
+ btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ if (ret)
+ goto out;
+ if (ctx->log_new_dentries) {
+ dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
+ if (!dir_elem) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ dir_elem->ino = di_key.objectid;
+ list_add_tail(&dir_elem->list, &dir_list);
+ }
+ break;
+ }
+
+ if (continue_curr_inode && min_key.offset < (u64)-1) {
+ min_key.offset++;
+ goto again;
+ }
+
+next:
+ if (list_empty(&dir_list))
+ break;
+
+ dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list);
+ ino = dir_elem->ino;
+ list_del(&dir_elem->list);
+ kfree(dir_elem);
+ }
+out:
+ btrfs_free_path(path);
+ if (ret) {
+ struct btrfs_dir_list *next;
+
+ list_for_each_entry_safe(dir_elem, next, &dir_list, list)
+ kfree(dir_elem);
+ }
+
+ return ret;
+}
+
struct btrfs_ino_list {
u64 ino;
u64 parent;
struct list_head list;
};
-static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_log_ctx *ctx,
- u64 ino, u64 parent)
+static void free_conflicting_inodes(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ino_list *curr;
+ struct btrfs_ino_list *next;
+
+ list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) {
+ list_del(&curr->list);
+ kfree(curr);
+ }
+}
+
+static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
+ struct btrfs_path *path)
+{
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (WARN_ON_ONCE(ret > 0)) {
+ /*
+ * We have previously found the inode through the commit root
+ * so this should not happen. If it does, just error out and
+ * fallback to a transaction commit.
+ */
+ ret = -ENOENT;
+ } else if (ret == 0) {
+ struct btrfs_inode_item *item;
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item)))
+ ret = 1;
+ }
+
+ btrfs_release_path(path);
+ path->search_commit_root = 0;
+ path->skip_locking = 0;
+
+ return ret;
+}
+
+static int add_conflicting_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 ino, u64 parent,
+ struct btrfs_log_ctx *ctx)
{
struct btrfs_ino_list *ino_elem;
- LIST_HEAD(inode_list);
- int ret = 0;
+ struct inode *inode;
+
+ /*
+ * It's rare to have a lot of conflicting inodes, in practice it is not
+ * common to have more than 1 or 2. We don't want to collect too many,
+ * as we could end up logging too many inodes (even if only in
+ * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
+ * commits.
+ */
+ if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
+ return BTRFS_LOG_FORCE_COMMIT;
+
+ inode = btrfs_iget(root->fs_info->sb, ino, root);
+ /*
+ * If the other inode that had a conflicting dir entry was deleted in
+ * the current transaction then we either:
+ *
+ * 1) Log the parent directory (later after adding it to the list) if
+ * the inode is a directory. This is because it may be a deleted
+ * subvolume/snapshot or it may be a regular directory that had
+ * deleted subvolumes/snapshots (or subdirectories that had them),
+ * and at the moment we can't deal with dropping subvolumes/snapshots
+ * during log replay. So we just log the parent, which will result in
+ * a fallback to a transaction commit if we are dealing with those
+ * cases (last_unlink_trans will match the current transaction);
+ *
+ * 2) Do nothing if it's not a directory. During log replay we simply
+ * unlink the conflicting dentry from the parent directory and then
+ * add the dentry for our inode. Like this we can avoid logging the
+ * parent directory (and maybe fallback to a transaction commit in
+ * case it has a last_unlink_trans == trans->transid, due to moving
+ * some inode from it to some other directory).
+ */
+ if (IS_ERR(inode)) {
+ int ret = PTR_ERR(inode);
+
+ if (ret != -ENOENT)
+ return ret;
+
+ ret = conflicting_inode_is_dir(root, ino, path);
+ /* Not a directory or we got an error. */
+ if (ret <= 0)
+ return ret;
+
+ /* Conflicting inode is a directory, so we'll log its parent. */
+ ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+ if (!ino_elem)
+ return -ENOMEM;
+ ino_elem->ino = ino;
+ ino_elem->parent = parent;
+ list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
+ ctx->num_conflict_inodes++;
+
+ return 0;
+ }
+
+ /*
+ * If the inode was already logged skip it - otherwise we can hit an
+ * infinite loop. Example:
+ *
+ * From the commit root (previous transaction) we have the following
+ * inodes:
+ *
+ * inode 257 a directory
+ * inode 258 with references "zz" and "zz_link" on inode 257
+ * inode 259 with reference "a" on inode 257
+ *
+ * And in the current (uncommitted) transaction we have:
+ *
+ * inode 257 a directory, unchanged
+ * inode 258 with references "a" and "a2" on inode 257
+ * inode 259 with reference "zz_link" on inode 257
+ * inode 261 with reference "zz" on inode 257
+ *
+ * When logging inode 261 the following infinite loop could
+ * happen if we don't skip already logged inodes:
+ *
+ * - we detect inode 258 as a conflicting inode, with inode 261
+ * on reference "zz", and log it;
+ *
+ * - we detect inode 259 as a conflicting inode, with inode 258
+ * on reference "a", and log it;
+ *
+ * - we detect inode 258 as a conflicting inode, with inode 259
+ * on reference "zz_link", and log it - again! After this we
+ * repeat the above steps forever.
+ *
+ * Here we can use need_log_inode() because we only need to log the
+ * inode in LOG_INODE_EXISTS mode and rename operations update the log,
+ * so that the log ends up with the new name and without the old name.
+ */
+ if (!need_log_inode(trans, BTRFS_I(inode))) {
+ btrfs_add_delayed_iput(BTRFS_I(inode));
+ return 0;
+ }
+
+ btrfs_add_delayed_iput(BTRFS_I(inode));
ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
if (!ino_elem)
return -ENOMEM;
ino_elem->ino = ino;
ino_elem->parent = parent;
- list_add_tail(&ino_elem->list, &inode_list);
+ list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
+ ctx->num_conflict_inodes++;
- while (!list_empty(&inode_list)) {
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_key key;
- struct inode *inode;
+ return 0;
+}
- ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
- list);
- ino = ino_elem->ino;
- parent = ino_elem->parent;
- list_del(&ino_elem->list);
- kfree(ino_elem);
- if (ret)
- continue;
+static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret = 0;
- btrfs_release_path(path);
+ /*
+ * Conflicting inodes are logged by the first call to btrfs_log_inode(),
+ * otherwise we could have unbounded recursion of btrfs_log_inode()
+ * calls. This check guarantees we can have only 1 level of recursion.
+ */
+ if (ctx->logging_conflict_inodes)
+ return 0;
+
+ ctx->logging_conflict_inodes = true;
+
+ /*
+ * New conflicting inodes may be found and added to the list while we
+ * are logging a conflicting inode, so keep iterating while the list is
+ * not empty.
+ */
+ while (!list_empty(&ctx->conflict_inodes)) {
+ struct btrfs_ino_list *curr;
+ struct inode *inode;
+ u64 ino;
+ u64 parent;
+
+ curr = list_first_entry(&ctx->conflict_inodes,
+ struct btrfs_ino_list, list);
+ ino = curr->ino;
+ parent = curr->parent;
+ list_del(&curr->list);
+ kfree(curr);
inode = btrfs_iget(fs_info->sb, ino, root);
/*
* If the other inode that had a conflicting dir entry was
* deleted in the current transaction, we need to log its parent
- * directory.
+ * directory. See the comment at add_conflicting_inode().
*/
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
- if (ret == -ENOENT) {
- inode = btrfs_iget(fs_info->sb, parent, root);
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
- } else {
- ret = btrfs_log_inode(trans,
- BTRFS_I(inode),
- LOG_OTHER_INODE_ALL,
- ctx);
- btrfs_add_delayed_iput(inode);
- }
+ if (ret != -ENOENT)
+ break;
+
+ inode = btrfs_iget(fs_info->sb, parent, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ break;
}
+
+ /*
+ * Always log the directory, we cannot make this
+ * conditional on need_log_inode() because the directory
+ * might have been logged in LOG_INODE_EXISTS mode or
+ * the dir index of the conflicting inode is not in a
+ * dir index key range logged for the directory. So we
+ * must make sure the deletion is recorded.
+ */
+ ret = btrfs_log_inode(trans, BTRFS_I(inode),
+ LOG_INODE_ALL, ctx);
+ btrfs_add_delayed_iput(BTRFS_I(inode));
+ if (ret)
+ break;
continue;
}
+
/*
- * If the inode was already logged skip it - otherwise we can
- * hit an infinite loop. Example:
- *
- * From the commit root (previous transaction) we have the
- * following inodes:
- *
- * inode 257 a directory
- * inode 258 with references "zz" and "zz_link" on inode 257
- * inode 259 with reference "a" on inode 257
- *
- * And in the current (uncommitted) transaction we have:
- *
- * inode 257 a directory, unchanged
- * inode 258 with references "a" and "a2" on inode 257
- * inode 259 with reference "zz_link" on inode 257
- * inode 261 with reference "zz" on inode 257
- *
- * When logging inode 261 the following infinite loop could
- * happen if we don't skip already logged inodes:
- *
- * - we detect inode 258 as a conflicting inode, with inode 261
- * on reference "zz", and log it;
+ * Here we can use need_log_inode() because we only need to log
+ * the inode in LOG_INODE_EXISTS mode and rename operations
+ * update the log, so that the log ends up with the new name and
+ * without the old name.
*
- * - we detect inode 259 as a conflicting inode, with inode 258
- * on reference "a", and log it;
- *
- * - we detect inode 258 as a conflicting inode, with inode 259
- * on reference "zz_link", and log it - again! After this we
- * repeat the above steps forever.
- */
- spin_lock(&BTRFS_I(inode)->lock);
- /*
- * Check the inode's logged_trans only instead of
- * btrfs_inode_in_log(). This is because the last_log_commit of
- * the inode is not updated when we only log that it exists (see
- * btrfs_log_inode()).
+ * We did this check at add_conflicting_inode(), but here we do
+ * it again because if some other task logged the inode after
+ * that, we can avoid doing it again.
*/
- if (BTRFS_I(inode)->logged_trans == trans->transid) {
- spin_unlock(&BTRFS_I(inode)->lock);
- btrfs_add_delayed_iput(inode);
+ if (!need_log_inode(trans, BTRFS_I(inode))) {
+ btrfs_add_delayed_iput(BTRFS_I(inode));
continue;
}
- spin_unlock(&BTRFS_I(inode)->lock);
+
/*
* We are safe logging the other inode without acquiring its
* lock as long as we log with the LOG_INODE_EXISTS mode. We
@@ -5511,67 +5770,16 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* well because during a rename we pin the log and update the
* log with the new name before we unpin it.
*/
- ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx);
- if (ret) {
- btrfs_add_delayed_iput(inode);
- continue;
- }
-
- key.objectid = ino;
- key.type = BTRFS_INODE_REF_KEY;
- key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0) {
- btrfs_add_delayed_iput(inode);
- continue;
- }
-
- while (true) {
- struct extent_buffer *leaf = path->nodes[0];
- int slot = path->slots[0];
- u64 other_ino = 0;
- u64 other_parent = 0;
-
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- break;
- } else if (ret > 0) {
- ret = 0;
- break;
- }
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &key, slot);
- if (key.objectid != ino ||
- (key.type != BTRFS_INODE_REF_KEY &&
- key.type != BTRFS_INODE_EXTREF_KEY)) {
- ret = 0;
- break;
- }
-
- ret = btrfs_check_ref_name_override(leaf, slot, &key,
- BTRFS_I(inode), &other_ino,
- &other_parent);
- if (ret < 0)
- break;
- if (ret > 0) {
- ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
- if (!ino_elem) {
- ret = -ENOMEM;
- break;
- }
- ino_elem->ino = other_ino;
- ino_elem->parent = other_parent;
- list_add_tail(&ino_elem->list, &inode_list);
- ret = 0;
- }
- path->slots[0]++;
- }
- btrfs_add_delayed_iput(inode);
+ ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx);
+ btrfs_add_delayed_iput(BTRFS_I(inode));
+ if (ret)
+ break;
}
+ ctx->logging_conflict_inodes = false;
+ if (ret)
+ free_conflicting_inodes(ctx);
+
return ret;
}
@@ -5582,7 +5790,6 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_path *dst_path,
const u64 logged_isize,
- const bool recursive_logging,
const int inode_only,
struct btrfs_log_ctx *ctx,
bool *need_log_inode_item)
@@ -5621,8 +5828,8 @@ again:
break;
} else if ((min_key->type == BTRFS_INODE_REF_KEY ||
min_key->type == BTRFS_INODE_EXTREF_KEY) &&
- inode->generation == trans->transid &&
- !recursive_logging) {
+ (inode->generation == trans->transid ||
+ ctx->logging_conflict_inodes)) {
u64 other_ino = 0;
u64 other_parent = 0;
@@ -5646,11 +5853,12 @@ again:
return ret;
ins_nr = 0;
- ret = log_conflicting_inodes(trans, root, path,
- ctx, other_ino, other_parent);
+ btrfs_release_path(path);
+ ret = add_conflicting_inode(trans, root, path,
+ other_ino,
+ other_parent, ctx);
if (ret)
return ret;
- btrfs_release_path(path);
goto next_key;
}
} else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
@@ -5733,6 +5941,371 @@ next_key:
return ret;
}
+static int insert_delayed_items_batch(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ const struct btrfs_item_batch *batch,
+ const struct btrfs_delayed_item *first_item)
+{
+ const struct btrfs_delayed_item *curr = first_item;
+ int ret;
+
+ ret = btrfs_insert_empty_items(trans, log, path, batch);
+ if (ret)
+ return ret;
+
+ for (int i = 0; i < batch->nr; i++) {
+ char *data_ptr;
+
+ data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
+ write_extent_buffer(path->nodes[0], &curr->data,
+ (unsigned long)data_ptr, curr->data_len);
+ curr = list_next_entry(curr, log_list);
+ path->slots[0]++;
+ }
+
+ btrfs_release_path(path);
+
+ return 0;
+}
+
+static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_ins_list,
+ struct btrfs_log_ctx *ctx)
+{
+ /* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */
+ const int max_batch_size = 195;
+ const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info);
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *log = inode->root->log_root;
+ struct btrfs_item_batch batch = {
+ .nr = 0,
+ .total_data_size = 0,
+ };
+ const struct btrfs_delayed_item *first = NULL;
+ const struct btrfs_delayed_item *curr;
+ char *ins_data;
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+ u64 curr_batch_size = 0;
+ int batch_idx = 0;
+ int ret;
+
+ /* We are adding dir index items to the log tree. */
+ lockdep_assert_held(&inode->log_mutex);
+
+ /*
+ * We collect delayed items before copying index keys from the subvolume
+ * to the log tree. However just after we collected them, they may have
+ * been flushed (all of them or just some of them), and therefore we
+ * could have copied them from the subvolume tree to the log tree.
+ * So find the first delayed item that was not yet logged (they are
+ * sorted by index number).
+ */
+ list_for_each_entry(curr, delayed_ins_list, log_list) {
+ if (curr->index > inode->last_dir_index_offset) {
+ first = curr;
+ break;
+ }
+ }
+
+ /* Empty list or all delayed items were already logged. */
+ if (!first)
+ return 0;
+
+ ins_data = kmalloc(max_batch_size * sizeof(u32) +
+ max_batch_size * sizeof(struct btrfs_key), GFP_NOFS);
+ if (!ins_data)
+ return -ENOMEM;
+ ins_sizes = (u32 *)ins_data;
+ batch.data_sizes = ins_sizes;
+ ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32));
+ batch.keys = ins_keys;
+
+ curr = first;
+ while (!list_entry_is_head(curr, delayed_ins_list, log_list)) {
+ const u32 curr_size = curr->data_len + sizeof(struct btrfs_item);
+
+ if (curr_batch_size + curr_size > leaf_data_size ||
+ batch.nr == max_batch_size) {
+ ret = insert_delayed_items_batch(trans, log, path,
+ &batch, first);
+ if (ret)
+ goto out;
+ batch_idx = 0;
+ batch.nr = 0;
+ batch.total_data_size = 0;
+ curr_batch_size = 0;
+ first = curr;
+ }
+
+ ins_sizes[batch_idx] = curr->data_len;
+ ins_keys[batch_idx].objectid = ino;
+ ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY;
+ ins_keys[batch_idx].offset = curr->index;
+ curr_batch_size += curr_size;
+ batch.total_data_size += curr->data_len;
+ batch.nr++;
+ batch_idx++;
+ curr = list_next_entry(curr, log_list);
+ }
+
+ ASSERT(batch.nr >= 1);
+ ret = insert_delayed_items_batch(trans, log, path, &batch, first);
+
+ curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item,
+ log_list);
+ inode->last_dir_index_offset = curr->index;
+out:
+ kfree(ins_data);
+
+ return ret;
+}
+
+static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_del_list,
+ struct btrfs_log_ctx *ctx)
+{
+ const u64 ino = btrfs_ino(inode);
+ const struct btrfs_delayed_item *curr;
+
+ curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
+ log_list);
+
+ while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
+ u64 first_dir_index = curr->index;
+ u64 last_dir_index;
+ const struct btrfs_delayed_item *next;
+ int ret;
+
+ /*
+ * Find a range of consecutive dir index items to delete. Like
+ * this we log a single dir range item spanning several contiguous
+ * dir items instead of logging one range item per dir index item.
+ */
+ next = list_next_entry(curr, log_list);
+ while (!list_entry_is_head(next, delayed_del_list, log_list)) {
+ if (next->index != curr->index + 1)
+ break;
+ curr = next;
+ next = list_next_entry(next, log_list);
+ }
+
+ last_dir_index = curr->index;
+ ASSERT(last_dir_index >= first_dir_index);
+
+ ret = insert_dir_log_key(trans, inode->root->log_root, path,
+ ino, first_dir_index, last_dir_index);
+ if (ret)
+ return ret;
+ curr = list_next_entry(curr, log_list);
+ }
+
+ return 0;
+}
+
+static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_log_ctx *ctx,
+ const struct list_head *delayed_del_list,
+ const struct btrfs_delayed_item *first,
+ const struct btrfs_delayed_item **last_ret)
+{
+ const struct btrfs_delayed_item *next;
+ struct extent_buffer *leaf = path->nodes[0];
+ const int last_slot = btrfs_header_nritems(leaf) - 1;
+ int slot = path->slots[0] + 1;
+ const u64 ino = btrfs_ino(inode);
+
+ next = list_next_entry(first, log_list);
+
+ while (slot < last_slot &&
+ !list_entry_is_head(next, delayed_del_list, log_list)) {
+ struct btrfs_key key;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != ino ||
+ key.type != BTRFS_DIR_INDEX_KEY ||
+ key.offset != next->index)
+ break;
+
+ slot++;
+ *last_ret = next;
+ next = list_next_entry(next, log_list);
+ }
+
+ return btrfs_del_items(trans, inode->root->log_root, path,
+ path->slots[0], slot - path->slots[0]);
+}
+
+static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_del_list,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *log = inode->root->log_root;
+ const struct btrfs_delayed_item *curr;
+ u64 last_range_start;
+ u64 last_range_end = 0;
+ struct btrfs_key key;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_DIR_INDEX_KEY;
+ curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
+ log_list);
+
+ while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
+ const struct btrfs_delayed_item *last = curr;
+ u64 first_dir_index = curr->index;
+ u64 last_dir_index;
+ bool deleted_items = false;
+ int ret;
+
+ key.offset = curr->index;
+ ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
+ if (ret < 0) {
+ return ret;
+ } else if (ret == 0) {
+ ret = batch_delete_dir_index_items(trans, inode, path, ctx,
+ delayed_del_list, curr,
+ &last);
+ if (ret)
+ return ret;
+ deleted_items = true;
+ }
+
+ btrfs_release_path(path);
+
+ /*
+ * If we deleted items from the leaf, it means we have a range
+ * item logging their range, so no need to add one or update an
+ * existing one. Otherwise we have to log a dir range item.
+ */
+ if (deleted_items)
+ goto next_batch;
+
+ last_dir_index = last->index;
+ ASSERT(last_dir_index >= first_dir_index);
+ /*
+ * If this range starts right after where the previous one ends,
+ * then we want to reuse the previous range item and change its
+ * end offset to the end of this range. This is just to minimize
+ * leaf space usage, by avoiding adding a new range item.
+ */
+ if (last_range_end != 0 && first_dir_index == last_range_end + 1)
+ first_dir_index = last_range_start;
+
+ ret = insert_dir_log_key(trans, log, path, key.objectid,
+ first_dir_index, last_dir_index);
+ if (ret)
+ return ret;
+
+ last_range_start = first_dir_index;
+ last_range_end = last_dir_index;
+next_batch:
+ curr = list_next_entry(last, log_list);
+ }
+
+ return 0;
+}
+
+static int log_delayed_deletion_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_del_list,
+ struct btrfs_log_ctx *ctx)
+{
+ /*
+ * We are deleting dir index items from the log tree or adding range
+ * items to it.
+ */
+ lockdep_assert_held(&inode->log_mutex);
+
+ if (list_empty(delayed_del_list))
+ return 0;
+
+ if (ctx->logged_before)
+ return log_delayed_deletions_incremental(trans, inode, path,
+ delayed_del_list, ctx);
+
+ return log_delayed_deletions_full(trans, inode, path, delayed_del_list,
+ ctx);
+}
+
+/*
+ * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed
+ * items instead of the subvolume tree.
+ */
+static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ const struct list_head *delayed_ins_list,
+ struct btrfs_log_ctx *ctx)
+{
+ const bool orig_log_new_dentries = ctx->log_new_dentries;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_item *item;
+ int ret = 0;
+
+ /*
+ * No need for the log mutex, plus to avoid potential deadlocks or
+ * lockdep annotations due to nesting of delayed inode mutexes and log
+ * mutexes.
+ */
+ lockdep_assert_not_held(&inode->log_mutex);
+
+ ASSERT(!ctx->logging_new_delayed_dentries);
+ ctx->logging_new_delayed_dentries = true;
+
+ list_for_each_entry(item, delayed_ins_list, log_list) {
+ struct btrfs_dir_item *dir_item;
+ struct inode *di_inode;
+ struct btrfs_key key;
+ int log_mode = LOG_INODE_EXISTS;
+
+ dir_item = (struct btrfs_dir_item *)item->data;
+ btrfs_disk_key_to_cpu(&key, &dir_item->location);
+
+ if (key.type == BTRFS_ROOT_ITEM_KEY)
+ continue;
+
+ di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root);
+ if (IS_ERR(di_inode)) {
+ ret = PTR_ERR(di_inode);
+ break;
+ }
+
+ if (!need_log_inode(trans, BTRFS_I(di_inode))) {
+ btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ continue;
+ }
+
+ if (btrfs_stack_dir_ftype(dir_item) == BTRFS_FT_DIR)
+ log_mode = LOG_INODE_ALL;
+
+ ctx->log_new_dentries = false;
+ ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx);
+
+ if (!ret && ctx->log_new_dentries)
+ ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx);
+
+ btrfs_add_delayed_iput(BTRFS_I(di_inode));
+
+ if (ret)
+ break;
+ }
+
+ ctx->log_new_dentries = orig_log_new_dentries;
+ ctx->logging_new_delayed_dentries = false;
+
+ return ret;
+}
+
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@@ -5764,9 +6337,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
u64 logged_isize = 0;
bool need_log_inode_item = true;
bool xattrs_logged = false;
- bool recursive_logging = false;
bool inode_item_dropped = true;
- const bool orig_logged_before = ctx->logged_before;
+ bool full_dir_logging = false;
+ LIST_HEAD(delayed_ins_list);
+ LIST_HEAD(delayed_del_list);
path = btrfs_alloc_path();
if (!path)
@@ -5794,27 +6368,46 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
max_key.type = (u8)-1;
max_key.offset = (u64)-1;
+ if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL)
+ full_dir_logging = true;
+
/*
- * Only run delayed items if we are a directory. We want to make sure
- * all directory indexes hit the fs/subvolume tree so we can find them
- * and figure out which index ranges have to be logged.
+ * If we are logging a directory while we are logging dentries of the
+ * delayed items of some other inode, then we need to flush the delayed
+ * items of this directory and not log the delayed items directly. This
+ * is to prevent more than one level of recursion into btrfs_log_inode()
+ * by having something like this:
+ *
+ * $ mkdir -p a/b/c/d/e/f/g/h/...
+ * $ xfs_io -c "fsync" a
+ *
+ * Where all directories in the path did not exist before and are
+ * created in the current transaction.
+ * So in such a case we directly log the delayed items of the main
+ * directory ("a") without flushing them first, while for each of its
+ * subdirectories we flush their delayed items before logging them.
+ * This prevents a potential unbounded recursion like this:
+ *
+ * btrfs_log_inode()
+ * log_new_delayed_dentries()
+ * btrfs_log_inode()
+ * log_new_delayed_dentries()
+ * btrfs_log_inode()
+ * log_new_delayed_dentries()
+ * (...)
+ *
+ * We have thresholds for the maximum number of delayed items to have in
+ * memory, and once they are hit, the items are flushed asynchronously.
+ * However the limit is quite high, so lets prevent deep levels of
+ * recursion to happen by limiting the maximum depth to be 1.
*/
- if (S_ISDIR(inode->vfs_inode.i_mode)) {
+ if (full_dir_logging && ctx->logging_new_delayed_dentries) {
ret = btrfs_commit_inode_delayed_items(trans, inode);
if (ret)
goto out;
}
- if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
- recursive_logging = true;
- if (inode_only == LOG_OTHER_INODE)
- inode_only = LOG_INODE_EXISTS;
- else
- inode_only = LOG_INODE_ALL;
- mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
- } else {
- mutex_lock(&inode->log_mutex);
- }
+ mutex_lock(&inode->log_mutex);
/*
* For symlinks, we must always log their content, which is stored in an
@@ -5846,9 +6439,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
* to known the file was moved from A to B, so logging just A would
* result in losing the file after a log replay.
*/
- if (S_ISDIR(inode->vfs_inode.i_mode) &&
- inode_only == LOG_INODE_ALL &&
- inode->last_unlink_trans >= trans->transid) {
+ if (full_dir_logging && inode->last_unlink_trans >= trans->transid) {
btrfs_set_log_full_commit(trans);
ret = BTRFS_LOG_FORCE_COMMIT;
goto out_unlock;
@@ -5859,14 +6450,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
* copies of everything.
*/
if (S_ISDIR(inode->vfs_inode.i_mode)) {
- int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
-
clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
- if (inode_only == LOG_INODE_EXISTS)
- max_key_type = BTRFS_XATTR_ITEM_KEY;
if (ctx->logged_before)
ret = drop_inode_items(trans, log, path, inode,
- max_key_type);
+ BTRFS_XATTR_ITEM_KEY);
} else {
if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
/*
@@ -5922,9 +6509,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (ret)
goto out_unlock;
+ /*
+ * If we are logging a directory in full mode, collect the delayed items
+ * before iterating the subvolume tree, so that we don't miss any new
+ * dir index items in case they get flushed while or right after we are
+ * iterating the subvolume tree.
+ */
+ if (full_dir_logging && !ctx->logging_new_delayed_dentries)
+ btrfs_log_get_delayed_items(inode, &delayed_ins_list,
+ &delayed_del_list);
+
ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
path, dst_path, logged_isize,
- recursive_logging, inode_only, ctx,
+ inode_only, ctx,
&need_log_inode_item);
if (ret)
goto out_unlock;
@@ -5977,10 +6574,18 @@ log_extents:
write_unlock(&em_tree->lock);
}
- if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
+ if (full_dir_logging) {
ret = log_directory_changes(trans, inode, path, dst_path, ctx);
if (ret)
goto out_unlock;
+ ret = log_delayed_insertion_items(trans, inode, path,
+ &delayed_ins_list, ctx);
+ if (ret)
+ goto out_unlock;
+ ret = log_delayed_deletion_items(trans, inode, path,
+ &delayed_del_list, ctx);
+ if (ret)
+ goto out_unlock;
}
spin_lock(&inode->lock);
@@ -6033,208 +6638,20 @@ out:
btrfs_free_path(path);
btrfs_free_path(dst_path);
- if (recursive_logging)
- ctx->logged_before = orig_logged_before;
-
- return ret;
-}
-
-/*
- * Check if we need to log an inode. This is used in contexts where while
- * logging an inode we need to log another inode (either that it exists or in
- * full mode). This is used instead of btrfs_inode_in_log() because the later
- * requires the inode to be in the log and have the log transaction committed,
- * while here we do not care if the log transaction was already committed - our
- * caller will commit the log later - and we want to avoid logging an inode
- * multiple times when multiple tasks have joined the same log transaction.
- */
-static bool need_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode)
-{
- /*
- * If a directory was not modified, no dentries added or removed, we can
- * and should avoid logging it.
- */
- if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
- return false;
-
- /*
- * If this inode does not have new/updated/deleted xattrs since the last
- * time it was logged and is flagged as logged in the current transaction,
- * we can skip logging it. As for new/deleted names, those are updated in
- * the log by link/unlink/rename operations.
- * In case the inode was logged and then evicted and reloaded, its
- * logged_trans will be 0, in which case we have to fully log it since
- * logged_trans is a transient field, not persisted.
- */
- if (inode->logged_trans == trans->transid &&
- !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
- return false;
-
- return true;
-}
-
-struct btrfs_dir_list {
- u64 ino;
- struct list_head list;
-};
-
-/*
- * Log the inodes of the new dentries of a directory. See log_dir_items() for
- * details about the why it is needed.
- * This is a recursive operation - if an existing dentry corresponds to a
- * directory, that directory's new entries are logged too (same behaviour as
- * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
- * the dentries point to we do not lock their i_mutex, otherwise lockdep
- * complains about the following circular lock dependency / possible deadlock:
- *
- * CPU0 CPU1
- * ---- ----
- * lock(&type->i_mutex_dir_key#3/2);
- * lock(sb_internal#2);
- * lock(&type->i_mutex_dir_key#3/2);
- * lock(&sb->s_type->i_mutex_key#14);
- *
- * Where sb_internal is the lock (a counter that works as a lock) acquired by
- * sb_start_intwrite() in btrfs_start_transaction().
- * Not locking i_mutex of the inodes is still safe because:
- *
- * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
- * that while logging the inode new references (names) are added or removed
- * from the inode, leaving the logged inode item with a link count that does
- * not match the number of logged inode reference items. This is fine because
- * at log replay time we compute the real number of links and correct the
- * link count in the inode item (see replay_one_buffer() and
- * link_to_fixup_dir());
- *
- * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
- * while logging the inode's items new index items (key type
- * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
- * has a size that doesn't match the sum of the lengths of all the logged
- * names - this is ok, not a problem, because at log replay time we set the
- * directory's i_size to the correct value (see replay_one_name() and
- * do_overwrite_item()).
- */
-static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_inode *start_inode,
- struct btrfs_log_ctx *ctx)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
- LIST_HEAD(dir_list);
- struct btrfs_dir_list *dir_elem;
- int ret = 0;
-
- /*
- * If we are logging a new name, as part of a link or rename operation,
- * don't bother logging new dentries, as we just want to log the names
- * of an inode and that any new parents exist.
- */
- if (ctx->logging_new_name)
- return 0;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
- if (!dir_elem) {
- btrfs_free_path(path);
- return -ENOMEM;
- }
- dir_elem->ino = btrfs_ino(start_inode);
- list_add_tail(&dir_elem->list, &dir_list);
-
- while (!list_empty(&dir_list)) {
- struct extent_buffer *leaf;
- struct btrfs_key min_key;
- int nritems;
- int i;
-
- dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
- list);
- if (ret)
- goto next_dir_inode;
-
- min_key.objectid = dir_elem->ino;
- min_key.type = BTRFS_DIR_INDEX_KEY;
- min_key.offset = 0;
-again:
- btrfs_release_path(path);
- ret = btrfs_search_forward(root, &min_key, path, trans->transid);
- if (ret < 0) {
- goto next_dir_inode;
- } else if (ret > 0) {
- ret = 0;
- goto next_dir_inode;
- }
-
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- for (i = path->slots[0]; i < nritems; i++) {
- struct btrfs_dir_item *di;
- struct btrfs_key di_key;
- struct inode *di_inode;
- struct btrfs_dir_list *new_dir_elem;
- int log_mode = LOG_INODE_EXISTS;
- int type;
-
- btrfs_item_key_to_cpu(leaf, &min_key, i);
- if (min_key.objectid != dir_elem->ino ||
- min_key.type != BTRFS_DIR_INDEX_KEY)
- goto next_dir_inode;
-
- di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
- type = btrfs_dir_type(leaf, di);
- if (btrfs_dir_transid(leaf, di) < trans->transid)
- continue;
- btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
- if (di_key.type == BTRFS_ROOT_ITEM_KEY)
- continue;
-
- btrfs_release_path(path);
- di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
- if (IS_ERR(di_inode)) {
- ret = PTR_ERR(di_inode);
- goto next_dir_inode;
- }
+ if (ret)
+ free_conflicting_inodes(ctx);
+ else
+ ret = log_conflicting_inodes(trans, inode->root, ctx);
- if (!need_log_inode(trans, BTRFS_I(di_inode))) {
- btrfs_add_delayed_iput(di_inode);
- break;
- }
+ if (full_dir_logging && !ctx->logging_new_delayed_dentries) {
+ if (!ret)
+ ret = log_new_delayed_dentries(trans, inode,
+ &delayed_ins_list, ctx);
- ctx->log_new_dentries = false;
- if (type == BTRFS_FT_DIR)
- log_mode = LOG_INODE_ALL;
- ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
- log_mode, ctx);
- btrfs_add_delayed_iput(di_inode);
- if (ret)
- goto next_dir_inode;
- if (ctx->log_new_dentries) {
- new_dir_elem = kmalloc(sizeof(*new_dir_elem),
- GFP_NOFS);
- if (!new_dir_elem) {
- ret = -ENOMEM;
- goto next_dir_inode;
- }
- new_dir_elem->ino = di_key.objectid;
- list_add_tail(&new_dir_elem->list, &dir_list);
- }
- break;
- }
- if (min_key.offset < (u64)-1) {
- min_key.offset++;
- goto again;
- }
-next_dir_inode:
- list_del(&dir_elem->list);
- kfree(dir_elem);
+ btrfs_log_put_delayed_items(inode, &delayed_ins_list,
+ &delayed_del_list);
}
- btrfs_free_path(path);
return ret;
}
@@ -6338,7 +6755,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
}
if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
- btrfs_add_delayed_iput(dir_inode);
+ btrfs_add_delayed_iput(BTRFS_I(dir_inode));
continue;
}
@@ -6346,9 +6763,9 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
LOG_INODE_ALL, ctx);
if (!ret && ctx->log_new_dentries)
- ret = log_new_dir_dentries(trans, root,
+ ret = log_new_dir_dentries(trans,
BTRFS_I(dir_inode), ctx);
- btrfs_add_delayed_iput(dir_inode);
+ btrfs_add_delayed_iput(BTRFS_I(dir_inode));
if (ret)
goto out;
}
@@ -6393,7 +6810,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
need_log_inode(trans, BTRFS_I(inode)))
ret = btrfs_log_inode(trans, BTRFS_I(inode),
LOG_INODE_EXISTS, ctx);
- btrfs_add_delayed_iput(inode);
+ btrfs_add_delayed_iput(BTRFS_I(inode));
if (ret)
return ret;
@@ -6661,7 +7078,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_trans;
if (log_dentries)
- ret = log_new_dir_dentries(trans, root, inode, ctx);
+ ret = log_new_dir_dentries(trans, inode, ctx);
else
ret = 0;
end_trans:
@@ -6955,7 +7372,7 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
mutex_unlock(&dir->log_mutex);
}
-/**
+/*
* Update the log after adding a new name for an inode.
*
* @trans: Transaction handle.
@@ -7022,9 +7439,14 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
if (old_dir && old_dir->logged_trans == trans->transid) {
struct btrfs_root *log = old_dir->root->log_root;
struct btrfs_path *path;
+ struct fscrypt_name fname;
ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX);
+ ret = fscrypt_setup_filename(&old_dir->vfs_inode,
+ &old_dentry->d_name, 0, &fname);
+ if (ret)
+ goto out;
/*
* We have two inodes to update in the log, the old directory and
* the inode that got renamed, so we must pin the log to prevent
@@ -7044,6 +7466,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
+ fscrypt_free_filename(&fname);
goto out;
}
@@ -7059,8 +7482,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
*/
mutex_lock(&old_dir->log_mutex);
ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
- old_dentry->d_name.name,
- old_dentry->d_name.len, old_dir_index);
+ &fname.disk_name, old_dir_index);
if (ret > 0) {
/*
* The dentry does not exist in the log, so record its
@@ -7074,6 +7496,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
mutex_unlock(&old_dir->log_mutex);
btrfs_free_path(path);
+ fscrypt_free_filename(&fname);
if (ret < 0)
goto out;
}
@@ -7088,6 +7511,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* inconsistent state after a rename operation.
*/
btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
+ ASSERT(list_empty(&ctx.conflict_inodes));
out:
/*
* If an error happened mark the log for a full commit because it's not
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 57ab5f3b8dc7..85b43075ac58 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -6,6 +6,7 @@
#ifndef BTRFS_TREE_LOG_H
#define BTRFS_TREE_LOG_H
+#include "messages.h"
#include "ctree.h"
#include "transaction.h"
@@ -20,6 +21,7 @@ struct btrfs_log_ctx {
int log_transid;
bool log_new_dentries;
bool logging_new_name;
+ bool logging_new_delayed_dentries;
/* Indicate if the inode being logged was logged before. */
bool logged_before;
/* Tracks the last logged dir item/index key offset. */
@@ -28,6 +30,9 @@ struct btrfs_log_ctx {
struct list_head list;
/* Only used for fast fsyncs. */
struct list_head ordered_extents;
+ struct list_head conflict_inodes;
+ int num_conflict_inodes;
+ bool logging_conflict_inodes;
};
static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
@@ -37,10 +42,14 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
ctx->log_transid = 0;
ctx->log_new_dentries = false;
ctx->logging_new_name = false;
+ ctx->logging_new_delayed_dentries = false;
ctx->logged_before = false;
ctx->inode = inode;
INIT_LIST_HEAD(&ctx->list);
INIT_LIST_HEAD(&ctx->ordered_extents);
+ INIT_LIST_HEAD(&ctx->conflict_inodes);
+ ctx->num_conflict_inodes = 0;
+ ctx->logging_conflict_inodes = false;
}
static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
@@ -78,11 +87,11 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx);
void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- const char *name, int name_len,
+ const struct fscrypt_str *name,
struct btrfs_inode *dir, u64 index);
void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- const char *name, int name_len,
+ const struct fscrypt_str *name,
struct btrfs_inode *inode, u64 dirid);
void btrfs_end_log_trans(struct btrfs_root *root);
void btrfs_pin_log_trans(struct btrfs_root *root);
diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c
index 8a3a14686d3e..a555baa0143a 100644
--- a/fs/btrfs/tree-mod-log.c
+++ b/fs/btrfs/tree-mod-log.c
@@ -1,7 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
+#include "messages.h"
#include "tree-mod-log.h"
#include "disk-io.h"
+#include "fs.h"
+#include "accessors.h"
+#include "tree-checker.h"
struct tree_mod_root {
u64 logical;
@@ -197,12 +201,11 @@ static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
int slot,
- enum btrfs_mod_log_op op,
- gfp_t flags)
+ enum btrfs_mod_log_op op)
{
struct tree_mod_elem *tm;
- tm = kzalloc(sizeof(*tm), flags);
+ tm = kzalloc(sizeof(*tm), GFP_NOFS);
if (!tm)
return NULL;
@@ -220,7 +223,7 @@ static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
}
int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
- enum btrfs_mod_log_op op, gfp_t flags)
+ enum btrfs_mod_log_op op)
{
struct tree_mod_elem *tm;
int ret;
@@ -228,7 +231,7 @@ int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
if (!tree_mod_need_log(eb->fs_info, eb))
return 0;
- tm = alloc_tree_mod_elem(eb, slot, op, flags);
+ tm = alloc_tree_mod_elem(eb, slot, op);
if (!tm)
return -ENOMEM;
@@ -276,7 +279,7 @@ int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
- BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING);
if (!tm_list[i]) {
ret = -ENOMEM;
goto free_tms;
@@ -364,7 +367,7 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
}
for (i = 0; i < nritems; i++) {
tm_list[i] = alloc_tree_mod_elem(old_root, i,
- BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
if (!tm_list[i]) {
ret = -ENOMEM;
goto free_tms;
@@ -502,14 +505,14 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
tm_list_rem = tm_list + nr_items;
for (i = 0; i < nr_items; i++) {
tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
- BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
+ BTRFS_MOD_LOG_KEY_REMOVE);
if (!tm_list_rem[i]) {
ret = -ENOMEM;
goto free_tms;
}
tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
- BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
+ BTRFS_MOD_LOG_KEY_ADD);
if (!tm_list_add[i]) {
ret = -ENOMEM;
goto free_tms;
@@ -564,7 +567,7 @@ int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
for (i = 0; i < nritems; i++) {
tm_list[i] = alloc_tree_mod_elem(eb, i,
- BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
if (!tm_list[i]) {
ret = -ENOMEM;
goto free_tms;
@@ -694,8 +697,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
n--;
break;
case BTRFS_MOD_LOG_MOVE_KEYS:
- o_dst = btrfs_node_key_ptr_offset(tm->slot);
- o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
+ o_dst = btrfs_node_key_ptr_offset(eb, tm->slot);
+ o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot);
memmove_extent_buffer(eb, o_dst, o_src,
tm->move.nr_items * p_size);
break;
@@ -819,10 +822,15 @@ struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
tm = tree_mod_log_search(fs_info, logical, time_seq);
if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+ struct btrfs_tree_parent_check check = { 0 };
+
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
- old = read_tree_block(fs_info, logical, root->root_key.objectid,
- 0, level, NULL);
+
+ check.level = level;
+ check.owner_root = root->root_key.objectid;
+
+ old = read_tree_block(fs_info, logical, &check);
if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
if (!IS_ERR(old))
free_extent_buffer(old);
diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h
index 12605d19621b..94f10afeee97 100644
--- a/fs/btrfs/tree-mod-log.h
+++ b/fs/btrfs/tree-mod-log.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_TREE_MOD_LOG_H
#define BTRFS_TREE_MOD_LOG_H
@@ -32,7 +32,7 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
struct extent_buffer *new_root,
bool log_removal);
int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
- enum btrfs_mod_log_op op, gfp_t flags);
+ enum btrfs_mod_log_op op);
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb);
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 3374c9e9be67..33606025513d 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -5,6 +5,7 @@
*/
#include <linux/slab.h>
+#include "messages.h"
#include "ulist.h"
#include "ctree.h"
@@ -37,8 +38,9 @@
* loop would be similar to the above.
*/
-/**
- * ulist_init - freshly initialize a ulist
+/*
+ * Freshly initialize a ulist.
+ *
* @ulist: the ulist to initialize
*
* Note: don't use this function to init an already used ulist, use
@@ -51,8 +53,9 @@ void ulist_init(struct ulist *ulist)
ulist->nnodes = 0;
}
-/**
- * ulist_release - free up additionally allocated memory for the ulist
+/*
+ * Free up additionally allocated memory for the ulist.
+ *
* @ulist: the ulist from which to free the additional memory
*
* This is useful in cases where the base 'struct ulist' has been statically
@@ -70,8 +73,9 @@ void ulist_release(struct ulist *ulist)
INIT_LIST_HEAD(&ulist->nodes);
}
-/**
- * ulist_reinit - prepare a ulist for reuse
+/*
+ * Prepare a ulist for reuse.
+ *
* @ulist: ulist to be reused
*
* Free up all additional memory allocated for the list elements and reinit
@@ -83,8 +87,9 @@ void ulist_reinit(struct ulist *ulist)
ulist_init(ulist);
}
-/**
- * ulist_alloc - dynamically allocate a ulist
+/*
+ * Dynamically allocate a ulist.
+ *
* @gfp_mask: allocation flags to for base allocation
*
* The allocated ulist will be returned in an initialized state.
@@ -101,8 +106,9 @@ struct ulist *ulist_alloc(gfp_t gfp_mask)
return ulist;
}
-/**
- * ulist_free - free dynamically allocated ulist
+/*
+ * Free dynamically allocated ulist.
+ *
* @ulist: ulist to free
*
* It is not necessary to call ulist_release before.
@@ -163,8 +169,9 @@ static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins)
return 0;
}
-/**
- * ulist_add - add an element to the ulist
+/*
+ * Add an element to the ulist.
+ *
* @ulist: ulist to add the element to
* @val: value to add to ulist
* @aux: auxiliary value to store along with val
@@ -242,8 +249,9 @@ int ulist_del(struct ulist *ulist, u64 val, u64 aux)
return 0;
}
-/**
- * ulist_next - iterate ulist
+/*
+ * Iterate ulist.
+ *
* @ulist: ulist to iterate
* @uiter: iterator variable, initialized with ULIST_ITER_INIT(&iterator)
*
@@ -258,7 +266,7 @@ int ulist_del(struct ulist *ulist, u64 val, u64 aux)
* It is allowed to call ulist_add during an enumeration. Newly added items
* are guaranteed to show up in the running enumeration.
*/
-struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
+struct ulist_node *ulist_next(const struct ulist *ulist, struct ulist_iterator *uiter)
{
struct ulist_node *node;
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 02fda0a2d4ce..b2cef187ea8e 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -66,7 +66,7 @@ static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux,
#endif
}
-struct ulist_node *ulist_next(struct ulist *ulist,
+struct ulist_node *ulist_next(const struct ulist *ulist,
struct ulist_iterator *uiter);
#define ULIST_ITER_INIT(uiter) ((uiter)->cur_list = NULL)
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index b458452a1aaf..7c7001f42b14 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -5,11 +5,14 @@
#include <linux/uuid.h>
#include <asm/unaligned.h>
+#include "messages.h"
#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "print-tree.h"
-
+#include "fs.h"
+#include "accessors.h"
+#include "uuid-tree.h"
static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key)
{
diff --git a/fs/btrfs/uuid-tree.h b/fs/btrfs/uuid-tree.h
new file mode 100644
index 000000000000..5350c87fe2ca
--- /dev/null
+++ b/fs/btrfs/uuid-tree.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_UUID_TREE_H
+#define BTRFS_UUID_TREE_H
+
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
+ u64 subid);
+int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
+ u64 subid);
+int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info);
+
+#endif
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index 90eb5c2830a9..bf9eb693a6a7 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -10,11 +10,17 @@
#include <linux/iversion.h>
#include <linux/fsverity.h>
#include <linux/sched/mm.h>
+#include "messages.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "transaction.h"
#include "disk-io.h"
#include "locking.h"
+#include "fs.h"
+#include "accessors.h"
+#include "ioctl.h"
+#include "verity.h"
+#include "orphan.h"
/*
* Implementation of the interface defined in struct fsverity_operations.
@@ -659,8 +665,7 @@ rollback:
*
* Returns the size on success or a negative error code on failure.
*/
-static int btrfs_get_verity_descriptor(struct inode *inode, void *buf,
- size_t buf_size)
+int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size)
{
u64 true_size;
int ret = 0;
diff --git a/fs/btrfs/verity.h b/fs/btrfs/verity.h
new file mode 100644
index 000000000000..91c10f7d0a46
--- /dev/null
+++ b/fs/btrfs/verity.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_VERITY_H
+#define BTRFS_VERITY_H
+
+#ifdef CONFIG_FS_VERITY
+
+extern const struct fsverity_operations btrfs_verityops;
+
+int btrfs_drop_verity_items(struct btrfs_inode *inode);
+int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size);
+
+#else
+
+static inline int btrfs_drop_verity_items(struct btrfs_inode *inode)
+{
+ return 0;
+}
+
+static inline int btrfs_get_verity_descriptor(struct inode *inode, void *buf,
+ size_t buf_size)
+{
+ return -EPERM;
+}
+
+#endif
+
+#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f63ff91e2883..aa25fa335d3e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5,12 +5,9 @@
#include <linux/sched.h>
#include <linux/sched/mm.h>
-#include <linux/bio.h>
#include <linux/slab.h>
-#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
-#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
@@ -23,8 +20,6 @@
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
-#include "async-thread.h"
-#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
@@ -33,6 +28,13 @@
#include "block-group.h"
#include "discard.h"
#include "zoned.h"
+#include "fs.h"
+#include "accessors.h"
+#include "uuid-tree.h"
+#include "ioctl.h"
+#include "relocation.h"
+#include "scrub.h"
+#include "super.h"
#define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
BTRFS_BLOCK_GROUP_RAID10 | \
@@ -246,11 +248,6 @@ out_overflow:;
static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
-static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
- enum btrfs_map_op op,
- u64 logical, u64 *length,
- struct btrfs_io_context **bioc_ret,
- int mirror_num, int need_raid_map);
/*
* Device locking
@@ -527,14 +524,14 @@ error:
return ret;
}
-/**
- * Search and remove all stale devices (which are not mounted).
- * When both inputs are NULL, it will search and release all stale devices.
+/*
+ * Search and remove all stale devices (which are not mounted). When both
+ * inputs are NULL, it will search and release all stale devices.
*
- * @devt: Optional. When provided will it release all unmounted devices
- * matching this devt only.
+ * @devt: Optional. When provided will it release all unmounted devices
+ * matching this devt only.
* @skip_device: Optional. Will skip this device when searching for the stale
- * devices.
+ * devices.
*
* Return: 0 for success or if @devt is 0.
* -EBUSY if @devt is a mounted device.
@@ -639,6 +636,9 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
if (!bdev_nonrot(bdev))
fs_devices->rotating = true;
+ if (bdev_max_discard_sectors(bdev))
+ fs_devices->discardable = true;
+
device->bdev = bdev;
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
device->mode = flags;
@@ -833,26 +833,23 @@ static noinline struct btrfs_device *device_list_add(const char *path,
}
if (!device) {
+ unsigned int nofs_flag;
+
if (fs_devices->opened) {
mutex_unlock(&fs_devices->device_list_mutex);
return ERR_PTR(-EBUSY);
}
+ nofs_flag = memalloc_nofs_save();
device = btrfs_alloc_device(NULL, &devid,
- disk_super->dev_item.uuid);
+ disk_super->dev_item.uuid, path);
+ memalloc_nofs_restore(nofs_flag);
if (IS_ERR(device)) {
mutex_unlock(&fs_devices->device_list_mutex);
/* we can safely leave the fs_devices entry around */
return device;
}
- name = rcu_string_strdup(path, GFP_NOFS);
- if (!name) {
- btrfs_free_device(device);
- mutex_unlock(&fs_devices->device_list_mutex);
- return ERR_PTR(-ENOMEM);
- }
- rcu_assign_pointer(device->name, name);
device->devt = path_devt;
list_add_rcu(&device->dev_list, &fs_devices->devices);
@@ -932,7 +929,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
}
btrfs_info_in_rcu(NULL,
"devid %llu device path %s changed to %s scanned by %s (%d)",
- devid, rcu_str_deref(device->name),
+ devid, btrfs_dev_name(device),
path, current->comm,
task_pid_nr(current));
}
@@ -985,28 +982,32 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
fs_devices->total_devices = orig->total_devices;
list_for_each_entry(orig_dev, &orig->devices, dev_list) {
- struct rcu_string *name;
+ const char *dev_path = NULL;
+
+ /*
+ * This is ok to do without RCU read locked because we hold the
+ * uuid mutex so nothing we touch in here is going to disappear.
+ */
+ if (orig_dev->name)
+ dev_path = orig_dev->name->str;
device = btrfs_alloc_device(NULL, &orig_dev->devid,
- orig_dev->uuid);
+ orig_dev->uuid, dev_path);
if (IS_ERR(device)) {
ret = PTR_ERR(device);
goto error;
}
- /*
- * This is ok to do without rcu read locked because we hold the
- * uuid mutex so nothing we touch in here is going to disappear.
- */
- if (orig_dev->name) {
- name = rcu_string_strdup(orig_dev->name->str,
- GFP_KERNEL);
- if (!name) {
+ if (orig_dev->zone_info) {
+ struct btrfs_zoned_device_info *zone_info;
+
+ zone_info = btrfs_clone_dev_zone_info(orig_dev);
+ if (!zone_info) {
btrfs_free_device(device);
ret = -ENOMEM;
goto error;
}
- rcu_assign_pointer(device->name, name);
+ device->zone_info = zone_info;
}
list_add(&device->dev_list, &fs_devices->devices);
@@ -1459,8 +1460,9 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
return changed;
}
-/**
- * dev_extent_hole_check - check if specified hole is suitable for allocation
+/*
+ * Check if specified hole is suitable for allocation.
+ *
* @device: the device which we have the hole
* @hole_start: starting position of the hole
* @hole_size: the size of the hole
@@ -1514,7 +1516,8 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
}
/*
- * find_free_dev_extent_start - find free space in the specified device
+ * Find free space in the specified device.
+ *
* @device: the device which we search the free space in
* @num_bytes: the size of the free space that we need
* @search_start: the position from which to begin the search
@@ -1522,9 +1525,8 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
* @len: the size of the free space. that we find, or the size
* of the max free space if we don't find suitable free space
*
- * this uses a pretty simple search, the expectation is that it is
- * called very infrequently and that a given device has a small number
- * of extents
+ * This does a pretty simple search, the expectation is that it is called very
+ * infrequently and that a given device has a small number of extents.
*
* @start is used to store the start of the free space if we find. But if we
* don't find suitable free space, it will be used to store the start position
@@ -2017,7 +2019,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
struct page *page;
int ret;
- disk_super = btrfs_read_dev_one_super(bdev, copy_num);
+ disk_super = btrfs_read_dev_one_super(bdev, copy_num, false);
if (IS_ERR(disk_super))
continue;
@@ -2087,7 +2089,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
if (btrfs_pinned_by_swapfile(fs_info, device)) {
btrfs_warn_in_rcu(fs_info,
"cannot remove device %s (devid %llu) due to active swapfile",
- rcu_str_deref(device->name), device->devid);
+ btrfs_dev_name(device), device->devid);
return -ETXTBSY;
}
@@ -2303,8 +2305,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
btrfs_free_device(tgtdev);
}
-/**
- * Populate args from device at path
+/*
+ * Populate args from device at path.
*
* @fs_info: the filesystem
* @args: the args to populate
@@ -2579,7 +2581,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
struct btrfs_device *device;
struct block_device *bdev;
struct super_block *sb = fs_info->sb;
- struct rcu_string *name;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_fs_devices *seed_devices;
u64 orig_super_total_bytes;
@@ -2620,20 +2621,13 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
}
rcu_read_unlock();
- device = btrfs_alloc_device(fs_info, NULL, NULL);
+ device = btrfs_alloc_device(fs_info, NULL, NULL, device_path);
if (IS_ERR(device)) {
/* we can safely leave the fs_devices entry around */
ret = PTR_ERR(device);
goto error;
}
- name = rcu_string_strdup(device_path, GFP_KERNEL);
- if (!name) {
- ret = -ENOMEM;
- goto error_free_device;
- }
- rcu_assign_pointer(device->name, name);
-
device->fs_info = fs_info;
device->bdev = bdev;
ret = lookup_bdev(device_path, &device->devt);
@@ -3589,16 +3583,14 @@ static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_off
if (bargs->usage_min == 0)
user_thresh_min = 0;
else
- user_thresh_min = div_factor_fine(cache->length,
- bargs->usage_min);
+ user_thresh_min = mult_perc(cache->length, bargs->usage_min);
if (bargs->usage_max == 0)
user_thresh_max = 1;
else if (bargs->usage_max > 100)
user_thresh_max = cache->length;
else
- user_thresh_max = div_factor_fine(cache->length,
- bargs->usage_max);
+ user_thresh_max = mult_perc(cache->length, bargs->usage_max);
if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
ret = 0;
@@ -3622,7 +3614,7 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
else if (bargs->usage > 100)
user_thresh = cache->length;
else
- user_thresh = div_factor_fine(cache->length, bargs->usage);
+ user_thresh = mult_perc(cache->length, bargs->usage);
if (chunk_used < user_thresh)
ret = 0;
@@ -4012,10 +4004,11 @@ error:
return ret;
}
-/**
- * alloc_profile_is_valid - see if a given profile is valid and reduced
- * @flags: profile to validate
- * @extended: if true @flags is treated as an extended profile
+/*
+ * See if a given profile is valid and reduced.
+ *
+ * @flags: profile to validate
+ * @extended: if true @flags is treated as an extended profile
*/
static int alloc_profile_is_valid(u64 flags, int extended)
{
@@ -5087,7 +5080,7 @@ static void init_alloc_chunk_ctl_policy_regular(
ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
/* We don't want a chunk larger than 10% of writable space */
- ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
+ ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10),
ctl->max_chunk_size);
ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
}
@@ -5118,7 +5111,7 @@ static void init_alloc_chunk_ctl_policy_zoned(
}
/* We don't want a chunk larger than 10% of writable space */
- limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
+ limit = max(round_down(mult_perc(fs_devices->total_rw_bytes, 10),
zone_size),
min_chunk_size);
ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
@@ -5595,7 +5588,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- bg->chunk_item_inserted = 1;
+ set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags);
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
@@ -5894,9 +5887,11 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
* and the stripes.
*/
sizeof(u64) * (total_stripes),
- GFP_NOFS|__GFP_NOFAIL);
+ GFP_NOFS);
+
+ if (!bioc)
+ return NULL;
- atomic_set(&bioc->error, 0);
refcount_set(&bioc->refs, 1);
bioc->fs_info = fs_info;
@@ -6092,7 +6087,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
int ret = 0;
ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &length, &bioc, 0, 0);
+ logical, &length, &bioc, NULL, NULL, 0);
if (ret) {
ASSERT(bioc == NULL);
return ret;
@@ -6153,9 +6148,7 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
cache = btrfs_lookup_block_group(fs_info, logical);
- spin_lock(&cache->lock);
- ret = cache->to_copy;
- spin_unlock(&cache->lock);
+ ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
btrfs_put_block_group(cache);
return ret;
@@ -6351,11 +6344,19 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
return 0;
}
-static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
- enum btrfs_map_op op,
- u64 logical, u64 *length,
- struct btrfs_io_context **bioc_ret,
- int mirror_num, int need_raid_map)
+static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map,
+ u32 stripe_index, u64 stripe_offset, u64 stripe_nr)
+{
+ dst->dev = map->stripes[stripe_index].dev;
+ dst->physical = map->stripes[stripe_index].physical +
+ stripe_offset + stripe_nr * map->stripe_len;
+}
+
+int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
+ u64 logical, u64 *length,
+ struct btrfs_io_context **bioc_ret,
+ struct btrfs_io_stripe *smap, int *mirror_num_ret,
+ int need_raid_map)
{
struct extent_map *em;
struct map_lookup *map;
@@ -6366,6 +6367,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
int data_stripes;
int i;
int ret = 0;
+ int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
int num_stripes;
int max_errors = 0;
int tgtdev_indexes = 0;
@@ -6526,6 +6528,29 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
tgtdev_indexes = num_stripes;
}
+ /*
+ * If this I/O maps to a single device, try to return the device and
+ * physical block information on the stack instead of allocating an
+ * I/O context structure.
+ */
+ if (smap && num_alloc_stripes == 1 &&
+ !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) &&
+ (!need_full_stripe(op) || !dev_replace_is_ongoing ||
+ !dev_replace->tgtdev)) {
+ if (patch_the_first_stripe_for_dev_replace) {
+ smap->dev = dev_replace->tgtdev;
+ smap->physical = physical_to_patch_in_first_stripe;
+ *mirror_num_ret = map->num_stripes + 1;
+ } else {
+ set_io_stripe(smap, map, stripe_index, stripe_offset,
+ stripe_nr);
+ *mirror_num_ret = mirror_num;
+ }
+ *bioc_ret = NULL;
+ ret = 0;
+ goto out;
+ }
+
bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes);
if (!bioc) {
ret = -ENOMEM;
@@ -6533,9 +6558,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
}
for (i = 0; i < num_stripes; i++) {
- bioc->stripes[i].physical = map->stripes[stripe_index].physical +
- stripe_offset + stripe_nr * map->stripe_len;
- bioc->stripes[i].dev = map->stripes[stripe_index].dev;
+ set_io_stripe(&bioc->stripes[i], map, stripe_index, stripe_offset,
+ stripe_nr);
stripe_index++;
}
@@ -6603,7 +6627,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
struct btrfs_io_context **bioc_ret, int mirror_num)
{
return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
- mirror_num, 0);
+ NULL, &mirror_num, 0);
}
/* For Scrub/replace */
@@ -6611,185 +6635,8 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_io_context **bioc_ret)
{
- return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
-}
-
-static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_io_context *bioc)
-{
- if (bioc->orig_bio->bi_opf & REQ_META)
- return bioc->fs_info->endio_meta_workers;
- return bioc->fs_info->endio_workers;
-}
-
-static void btrfs_end_bio_work(struct work_struct *work)
-{
- struct btrfs_bio *bbio =
- container_of(work, struct btrfs_bio, end_io_work);
-
- bio_endio(&bbio->bio);
-}
-
-static void btrfs_end_bioc(struct btrfs_io_context *bioc, bool async)
-{
- struct bio *orig_bio = bioc->orig_bio;
- struct btrfs_bio *bbio = btrfs_bio(orig_bio);
-
- bbio->mirror_num = bioc->mirror_num;
- orig_bio->bi_private = bioc->private;
- orig_bio->bi_end_io = bioc->end_io;
-
- /*
- * Only send an error to the higher layers if it is beyond the tolerance
- * threshold.
- */
- if (atomic_read(&bioc->error) > bioc->max_errors)
- orig_bio->bi_status = BLK_STS_IOERR;
- else
- orig_bio->bi_status = BLK_STS_OK;
-
- if (btrfs_op(orig_bio) == BTRFS_MAP_READ && async) {
- INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
- queue_work(btrfs_end_io_wq(bioc), &bbio->end_io_work);
- } else {
- bio_endio(orig_bio);
- }
-
- btrfs_put_bioc(bioc);
-}
-
-static void btrfs_end_bio(struct bio *bio)
-{
- struct btrfs_io_stripe *stripe = bio->bi_private;
- struct btrfs_io_context *bioc = stripe->bioc;
-
- if (bio->bi_status) {
- atomic_inc(&bioc->error);
- if (bio->bi_status == BLK_STS_IOERR ||
- bio->bi_status == BLK_STS_TARGET) {
- if (btrfs_op(bio) == BTRFS_MAP_WRITE)
- btrfs_dev_stat_inc_and_print(stripe->dev,
- BTRFS_DEV_STAT_WRITE_ERRS);
- else if (!(bio->bi_opf & REQ_RAHEAD))
- btrfs_dev_stat_inc_and_print(stripe->dev,
- BTRFS_DEV_STAT_READ_ERRS);
- if (bio->bi_opf & REQ_PREFLUSH)
- btrfs_dev_stat_inc_and_print(stripe->dev,
- BTRFS_DEV_STAT_FLUSH_ERRS);
- }
- }
-
- if (bio != bioc->orig_bio)
- bio_put(bio);
-
- btrfs_bio_counter_dec(bioc->fs_info);
- if (atomic_dec_and_test(&bioc->stripes_pending))
- btrfs_end_bioc(bioc, true);
-}
-
-static void submit_stripe_bio(struct btrfs_io_context *bioc,
- struct bio *orig_bio, int dev_nr, bool clone)
-{
- struct btrfs_fs_info *fs_info = bioc->fs_info;
- struct btrfs_device *dev = bioc->stripes[dev_nr].dev;
- u64 physical = bioc->stripes[dev_nr].physical;
- struct bio *bio;
-
- if (!dev || !dev->bdev ||
- test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
- (btrfs_op(orig_bio) == BTRFS_MAP_WRITE &&
- !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
- atomic_inc(&bioc->error);
- if (atomic_dec_and_test(&bioc->stripes_pending))
- btrfs_end_bioc(bioc, false);
- return;
- }
-
- if (clone) {
- bio = bio_alloc_clone(dev->bdev, orig_bio, GFP_NOFS, &fs_bio_set);
- } else {
- bio = orig_bio;
- bio_set_dev(bio, dev->bdev);
- btrfs_bio(bio)->device = dev;
- }
-
- bioc->stripes[dev_nr].bioc = bioc;
- bio->bi_private = &bioc->stripes[dev_nr];
- bio->bi_end_io = btrfs_end_bio;
- bio->bi_iter.bi_sector = physical >> 9;
- /*
- * For zone append writing, bi_sector must point the beginning of the
- * zone
- */
- if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- if (btrfs_dev_is_sequential(dev, physical)) {
- u64 zone_start = round_down(physical, fs_info->zone_size);
-
- bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
- } else {
- bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
- bio->bi_opf |= REQ_OP_WRITE;
- }
- }
- btrfs_debug_in_rcu(fs_info,
- "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
- __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
- (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
- dev->devid, bio->bi_iter.bi_size);
-
- btrfs_bio_counter_inc_noblocked(fs_info);
-
- btrfsic_check_bio(bio);
- submit_bio(bio);
-}
-
-void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num)
-{
- u64 logical = bio->bi_iter.bi_sector << 9;
- u64 length = bio->bi_iter.bi_size;
- u64 map_length = length;
- int ret;
- int dev_nr;
- int total_devs;
- struct btrfs_io_context *bioc = NULL;
-
- btrfs_bio_counter_inc_blocked(fs_info);
- ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
- &map_length, &bioc, mirror_num, 1);
- if (ret) {
- btrfs_bio_counter_dec(fs_info);
- bio->bi_status = errno_to_blk_status(ret);
- bio_endio(bio);
- return;
- }
-
- total_devs = bioc->num_stripes;
- bioc->orig_bio = bio;
- bioc->private = bio->bi_private;
- bioc->end_io = bio->bi_end_io;
- atomic_set(&bioc->stripes_pending, total_devs);
-
- if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
- ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
- if (btrfs_op(bio) == BTRFS_MAP_WRITE)
- raid56_parity_write(bio, bioc);
- else
- raid56_parity_recover(bio, bioc, mirror_num, true);
- return;
- }
-
- if (map_length < length) {
- btrfs_crit(fs_info,
- "mapping failed logical %llu bio len %llu len %llu",
- logical, length, map_length);
- BUG();
- }
-
- for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
- const bool should_clone = (dev_nr < total_devs - 1);
-
- submit_stripe_bio(bioc, bio, dev_nr, should_clone);
- }
- btrfs_bio_counter_dec(fs_info);
+ return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
+ NULL, NULL, 1);
}
static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
@@ -6805,18 +6652,18 @@ static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
const struct btrfs_device *device)
{
- ASSERT((args->devid != (u64)-1) || args->missing);
+ if (args->missing) {
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
+ !device->bdev)
+ return true;
+ return false;
+ }
- if ((args->devid != (u64)-1) && device->devid != args->devid)
+ if (device->devid != args->devid)
return false;
if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
return false;
- if (!args->missing)
- return true;
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
- !device->bdev)
- return true;
- return false;
+ return true;
}
/*
@@ -6863,8 +6710,9 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
* always do NOFS because we use it in a lot of other GFP_KERNEL safe
* places.
*/
+
nofs_flag = memalloc_nofs_save();
- device = btrfs_alloc_device(NULL, &devid, dev_uuid);
+ device = btrfs_alloc_device(NULL, &devid, dev_uuid, NULL);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(device))
return device;
@@ -6879,22 +6727,24 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
return device;
}
-/**
- * btrfs_alloc_device - allocate struct btrfs_device
+/*
+ * Allocate new device struct, set up devid and UUID.
+ *
* @fs_info: used only for generating a new devid, can be NULL if
* devid is provided (i.e. @devid != NULL).
* @devid: a pointer to devid for this device. If NULL a new devid
* is generated.
* @uuid: a pointer to UUID for this device. If NULL a new UUID
* is generated.
+ * @path: a pointer to device path if available, NULL otherwise.
*
* Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
* on error. Returned struct is not linked onto any lists and must be
* destroyed with btrfs_free_device.
*/
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
- const u64 *devid,
- const u8 *uuid)
+ const u64 *devid, const u8 *uuid,
+ const char *path)
{
struct btrfs_device *dev;
u64 tmp;
@@ -6912,8 +6762,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
atomic_set(&dev->dev_stats_ccnt, 0);
btrfs_device_data_ordered_init(dev);
- extent_io_tree_init(fs_info, &dev->alloc_state,
- IO_TREE_DEVICE_ALLOC_STATE, NULL);
+ extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE);
if (devid)
tmp = *devid;
@@ -6933,6 +6782,17 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
else
generate_random_uuid(dev->uuid);
+ if (path) {
+ struct rcu_string *name;
+
+ name = rcu_string_strdup(path, GFP_KERNEL);
+ if (!name) {
+ btrfs_free_device(dev);
+ return ERR_PTR(-ENOMEM);
+ }
+ rcu_assign_pointer(dev->name, name);
+ }
+
return dev;
}
@@ -7029,6 +6889,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
u64 devid;
u64 type;
u8 uuid[BTRFS_UUID_SIZE];
+ int index;
int num_stripes;
int ret;
int i;
@@ -7036,6 +6897,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
logical = key->offset;
length = btrfs_chunk_length(leaf, chunk);
type = btrfs_chunk_type(leaf, chunk);
+ index = btrfs_bg_flags_to_raid_index(type);
num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
#if BITS_PER_LONG == 32
@@ -7089,7 +6951,15 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
map->io_align = btrfs_chunk_io_align(leaf, chunk);
map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
map->type = type;
- map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+ /*
+ * We can't use the sub_stripes value, as for profiles other than
+ * RAID10, they may have 0 as sub_stripes for filesystems created by
+ * older mkfs (<v5.4).
+ * In that case, it can cause divide-by-zero errors later.
+ * Since currently sub_stripes is fixed for each profile, let's
+ * use the trusted value instead.
+ */
+ map->sub_stripes = btrfs_raid_array[index].sub_stripes;
map->verified_stripes = 0;
em->orig_block_len = btrfs_calc_stripe_length(em);
for (i = 0; i < num_stripes; i++) {
@@ -7106,8 +6976,9 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
map->stripes[i].dev = handle_missing_device(fs_info,
devid, uuid);
if (IS_ERR(map->stripes[i].dev)) {
+ ret = PTR_ERR(map->stripes[i].dev);
free_extent_map(em);
- return PTR_ERR(map->stripes[i].dev);
+ return ret;
}
}
@@ -7621,10 +7492,11 @@ error:
return ret;
}
-void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
+int btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
struct btrfs_device *device;
+ int ret = 0;
fs_devices->fs_info = fs_info;
@@ -7633,12 +7505,18 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
device->fs_info = fs_info;
list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
- list_for_each_entry(device, &seed_devs->devices, dev_list)
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
device->fs_info = fs_info;
+ ret = btrfs_get_dev_zone_info(device, false);
+ if (ret)
+ break;
+ }
seed_devs->fs_info = fs_info;
}
mutex_unlock(&fs_devices->device_list_mutex);
+
+ return ret;
}
static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
@@ -7762,7 +7640,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
if (ret < 0) {
btrfs_warn_in_rcu(fs_info,
"error %d while searching for dev_stats item for device %s",
- ret, rcu_str_deref(device->name));
+ ret, btrfs_dev_name(device));
goto out;
}
@@ -7773,7 +7651,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
if (ret != 0) {
btrfs_warn_in_rcu(fs_info,
"delete too small dev_stats item for device %s failed %d",
- rcu_str_deref(device->name), ret);
+ btrfs_dev_name(device), ret);
goto out;
}
ret = 1;
@@ -7787,7 +7665,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
if (ret < 0) {
btrfs_warn_in_rcu(fs_info,
"insert dev_stats item for device %s failed %d",
- rcu_str_deref(device->name), ret);
+ btrfs_dev_name(device), ret);
goto out;
}
}
@@ -7852,7 +7730,7 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
return;
btrfs_err_rl_in_rcu(dev->fs_info,
"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
- rcu_str_deref(dev->name),
+ btrfs_dev_name(dev),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
@@ -7872,7 +7750,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
btrfs_info_in_rcu(dev->fs_info,
"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
- rcu_str_deref(dev->name),
+ btrfs_dev_name(dev),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
@@ -8244,7 +8122,7 @@ static int relocating_repair_kthread(void *data)
if (!cache)
goto out;
- if (!cache->relocating_repair)
+ if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags))
goto out;
ret = btrfs_may_alloc_data_chunk(fs_info, target);
@@ -8281,14 +8159,10 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
if (!cache)
return true;
- spin_lock(&cache->lock);
- if (cache->relocating_repair) {
- spin_unlock(&cache->lock);
+ if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) {
btrfs_put_block_group(cache);
return true;
}
- cache->relocating_repair = 1;
- spin_unlock(&cache->lock);
kthread_run(relocating_repair_kthread, cache,
"btrfs-relocating-repair");
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5639961b3626..6b7a05f6cf82 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -6,10 +6,12 @@
#ifndef BTRFS_VOLUMES_H
#define BTRFS_VOLUMES_H
-#include <linux/bio.h>
#include <linux/sort.h>
#include <linux/btrfs.h>
#include "async-thread.h"
+#include "messages.h"
+#include "tree-checker.h"
+#include "rcu-string.h"
#define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G)
@@ -181,6 +183,31 @@ struct btrfs_device {
};
/*
+ * Block group or device which contains an active swapfile. Used for preventing
+ * unsafe operations while a swapfile is active.
+ *
+ * These are sorted on (ptr, inode) (note that a block group or device can
+ * contain more than one swapfile). We compare the pointer values because we
+ * don't actually care what the object is, we just need a quick check whether
+ * the object exists in the rbtree.
+ */
+struct btrfs_swapfile_pin {
+ struct rb_node node;
+ void *ptr;
+ struct inode *inode;
+ /*
+ * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr
+ * points to a struct btrfs_device.
+ */
+ bool is_block_group;
+ /*
+ * Only used when 'is_block_group' is true and it is the number of
+ * extents used by a swapfile for this block group ('ptr' field).
+ */
+ int bg_extent_count;
+};
+
+/*
* If we read those variants at the context of their own lock, we needn't
* use the following helpers, reading them directly is safe.
*/
@@ -329,6 +356,8 @@ struct btrfs_fs_devices {
* nonrot flag set
*/
bool rotating;
+ /* Devices support TRIM/discard commands */
+ bool discardable;
struct btrfs_fs_info *fs_info;
/* sysfs kobjects */
@@ -343,8 +372,6 @@ struct btrfs_fs_devices {
enum btrfs_read_policy read_policy;
};
-#define BTRFS_BIO_INLINE_CSUM_SIZE 64
-
#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \
- sizeof(struct btrfs_chunk)) \
/ sizeof(struct btrfs_stripe) + 1)
@@ -354,69 +381,6 @@ struct btrfs_fs_devices {
- 2 * sizeof(struct btrfs_chunk)) \
/ sizeof(struct btrfs_stripe) + 1)
-/*
- * Maximum number of sectors for a single bio to limit the size of the
- * checksum array. This matches the number of bio_vecs per bio and thus the
- * I/O size for buffered I/O.
- */
-#define BTRFS_MAX_BIO_SECTORS (256)
-
-/*
- * Additional info to pass along bio.
- *
- * Mostly for btrfs specific features like csum and mirror_num.
- */
-struct btrfs_bio {
- unsigned int mirror_num;
-
- /* for direct I/O */
- u64 file_offset;
-
- /* @device is for stripe IO submission. */
- struct btrfs_device *device;
- u8 *csum;
- u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
- struct bvec_iter iter;
-
- /* For read end I/O handling */
- struct work_struct end_io_work;
-
- /*
- * This member must come last, bio_alloc_bioset will allocate enough
- * bytes for entire btrfs_bio but relies on bio being last.
- */
- struct bio bio;
-};
-
-static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
-{
- return container_of(bio, struct btrfs_bio, bio);
-}
-
-static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
-{
- if (bbio->csum != bbio->csum_inline) {
- kfree(bbio->csum);
- bbio->csum = NULL;
- }
-}
-
-/*
- * Iterate through a btrfs_bio (@bbio) on a per-sector basis.
- *
- * bvl - struct bio_vec
- * bbio - struct btrfs_bio
- * iters - struct bvec_iter
- * bio_offset - unsigned int
- */
-#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \
- for ((iter) = (bbio)->iter, (bio_offset) = 0; \
- (iter).bi_size && \
- (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \
- (bio_offset) += fs_info->sectorsize, \
- bio_advance_iter_single(&(bbio)->bio, &(iter), \
- (fs_info)->sectorsize))
-
struct btrfs_io_stripe {
struct btrfs_device *dev;
union {
@@ -451,12 +415,9 @@ struct btrfs_discard_stripe {
*/
struct btrfs_io_context {
refcount_t refs;
- atomic_t stripes_pending;
struct btrfs_fs_info *fs_info;
u64 map_type; /* get from map_lookup->type */
- bio_end_io_t *end_io;
struct bio *orig_bio;
- void *private;
atomic_t error;
int max_errors;
int num_stripes;
@@ -561,6 +522,13 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio)
}
}
+static inline unsigned long btrfs_chunk_item_size(int num_stripes)
+{
+ ASSERT(num_stripes);
+ return sizeof(struct btrfs_chunk) +
+ sizeof(struct btrfs_stripe) * (num_stripes - 1);
+}
+
void btrfs_get_bioc(struct btrfs_io_context *bioc);
void btrfs_put_bioc(struct btrfs_io_context *bioc);
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
@@ -569,6 +537,11 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_io_context **bioc_ret);
+int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
+ u64 logical, u64 *length,
+ struct btrfs_io_context **bioc_ret,
+ struct btrfs_io_stripe *smap, int *mirror_num_ret,
+ int need_raid_map);
struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
u64 logical, u64 *length_ret,
u32 *num_stripes);
@@ -580,7 +553,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
u64 type);
void btrfs_mapping_tree_free(struct extent_map_tree *tree);
-void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder);
struct btrfs_device *btrfs_scan_one_device(const char *path,
@@ -597,8 +569,8 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
const char *path);
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
- const u64 *devid,
- const u8 *uuid);
+ const u64 *devid, const u8 *uuid,
+ const char *path);
void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args);
void btrfs_free_device(struct btrfs_device *device);
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
@@ -629,7 +601,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_get_dev_stats *stats);
-void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
+int btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans);
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev);
@@ -699,6 +671,14 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
atomic_inc(&dev->dev_stats_ccnt);
}
+static inline const char *btrfs_dev_name(const struct btrfs_device *device)
+{
+ if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+ return "<missing disk>";
+ else
+ return rcu_str_deref(device->name);
+}
+
void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void);
@@ -714,4 +694,6 @@ const char *btrfs_bg_type_to_raid_name(u64 flags);
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
+bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
+
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 5bb8d8c86311..0ed4b119a7ca 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -13,12 +13,16 @@
#include <linux/iversion.h>
#include <linux/sched/mm.h>
#include "ctree.h"
+#include "fs.h"
+#include "messages.h"
#include "btrfs_inode.h"
#include "transaction.h"
#include "xattr.h"
#include "disk-io.h"
#include "props.h"
#include "locking.h"
+#include "accessors.h"
+#include "dir-item.h"
int btrfs_getxattr(struct inode *inode, const char *name,
void *buffer, size_t size)
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index b4f44662cda7..01a13de11832 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -155,8 +155,8 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
in_page = find_get_page(mapping,
start >> PAGE_SHIFT);
data_in = kmap_local_page(in_page);
- memcpy(workspace->buf + i * PAGE_SIZE,
- data_in, PAGE_SIZE);
+ copy_page(workspace->buf + i * PAGE_SIZE,
+ data_in);
start += PAGE_SIZE;
}
workspace->strm.next_in = workspace->buf;
@@ -355,7 +355,7 @@ done:
return ret;
}
-int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+int zlib_decompress(struct list_head *ws, const u8 *data_in,
struct page *dest_page, unsigned long start_byte, size_t srclen,
size_t destlen)
{
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 73c6929f7be6..a759668477bb 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -15,6 +15,8 @@
#include "transaction.h"
#include "dev-replace.h"
#include "space-info.h"
+#include "fs.h"
+#include "accessors.h"
/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES 4096
@@ -134,7 +136,8 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
super[i] = page_address(page[i]);
}
- if (super[0]->generation > super[1]->generation)
+ if (btrfs_super_generation(super[0]) >
+ btrfs_super_generation(super[1]))
sector = zones[1].start;
else
sector = zones[0].start;
@@ -391,8 +394,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
zone_sectors = bdev_zone_sectors(bdev);
}
- /* Check if it's power of 2 (see is_power_of_2) */
- ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
+ ASSERT(is_power_of_two_u64(zone_sectors));
zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
/* We reject devices with a zone size larger than 8GB */
@@ -466,7 +468,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
goto out;
}
- zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
+ zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
if (!zones) {
ret = -ENOMEM;
goto out;
@@ -585,7 +587,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
}
- kfree(zones);
+ kvfree(zones);
switch (bdev_zoned_model(bdev)) {
case BLK_ZONED_HM:
@@ -617,7 +619,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
return 0;
out:
- kfree(zones);
+ kvfree(zones);
out_free_zone_info:
btrfs_destroy_dev_zone_info(device);
@@ -639,6 +641,46 @@ void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
device->zone_info = NULL;
}
+struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev)
+{
+ struct btrfs_zoned_device_info *zone_info;
+
+ zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL);
+ if (!zone_info)
+ return NULL;
+
+ zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->seq_zones)
+ goto out;
+
+ bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones,
+ zone_info->nr_zones);
+
+ zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->empty_zones)
+ goto out;
+
+ bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones,
+ zone_info->nr_zones);
+
+ zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->active_zones)
+ goto out;
+
+ bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones,
+ zone_info->nr_zones);
+ zone_info->zone_cache = NULL;
+
+ return zone_info;
+
+out:
+ bitmap_free(zone_info->seq_zones);
+ bitmap_free(zone_info->empty_zones);
+ bitmap_free(zone_info->active_zones);
+ kfree(zone_info);
+ return NULL;
+}
+
int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
{
@@ -652,80 +694,55 @@ int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
return 0;
}
+static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_device *device;
+
+ list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+ if (device->bdev &&
+ bdev_zoned_model(device->bdev) == BLK_ZONED_HM) {
+ btrfs_err(fs_info,
+ "zoned: mode not enabled but zoned device found: %pg",
+ device->bdev);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
- u64 zoned_devices = 0;
- u64 nr_devices = 0;
u64 zone_size = 0;
u64 max_zone_append_size = 0;
- const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
- int ret = 0;
+ int ret;
- /* Count zoned devices */
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
- enum blk_zoned_model model;
+ /*
+ * Host-Managed devices can't be used without the ZONED flag. With the
+ * ZONED all devices can be used, using zone emulation if required.
+ */
+ if (!btrfs_fs_incompat(fs_info, ZONED))
+ return btrfs_check_for_zoned_device(fs_info);
+
+ list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
if (!device->bdev)
continue;
- model = bdev_zoned_model(device->bdev);
- /*
- * A Host-Managed zoned device must be used as a zoned device.
- * A Host-Aware zoned device and a non-zoned devices can be
- * treated as a zoned device, if ZONED flag is enabled in the
- * superblock.
- */
- if (model == BLK_ZONED_HM ||
- (model == BLK_ZONED_HA && incompat_zoned) ||
- (model == BLK_ZONED_NONE && incompat_zoned)) {
- struct btrfs_zoned_device_info *zone_info;
-
- zone_info = device->zone_info;
- zoned_devices++;
- if (!zone_size) {
- zone_size = zone_info->zone_size;
- } else if (zone_info->zone_size != zone_size) {
- btrfs_err(fs_info,
+ if (!zone_size) {
+ zone_size = zone_info->zone_size;
+ } else if (zone_info->zone_size != zone_size) {
+ btrfs_err(fs_info,
"zoned: unequal block device zone sizes: have %llu found %llu",
- device->zone_info->zone_size,
- zone_size);
- ret = -EINVAL;
- goto out;
- }
- if (!max_zone_append_size ||
- (zone_info->max_zone_append_size &&
- zone_info->max_zone_append_size < max_zone_append_size))
- max_zone_append_size =
- zone_info->max_zone_append_size;
+ zone_info->zone_size, zone_size);
+ return -EINVAL;
}
- nr_devices++;
- }
-
- if (!zoned_devices && !incompat_zoned)
- goto out;
-
- if (!zoned_devices && incompat_zoned) {
- /* No zoned block device found on ZONED filesystem */
- btrfs_err(fs_info,
- "zoned: no zoned devices found on a zoned filesystem");
- ret = -EINVAL;
- goto out;
- }
-
- if (zoned_devices && !incompat_zoned) {
- btrfs_err(fs_info,
- "zoned: mode not enabled but zoned device found");
- ret = -EINVAL;
- goto out;
- }
-
- if (zoned_devices != nr_devices) {
- btrfs_err(fs_info,
- "zoned: cannot mix zoned and regular devices");
- ret = -EINVAL;
- goto out;
+ if (!max_zone_append_size ||
+ (zone_info->max_zone_append_size &&
+ zone_info->max_zone_append_size < max_zone_append_size))
+ max_zone_append_size = zone_info->max_zone_append_size;
}
/*
@@ -737,14 +754,12 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
btrfs_err(fs_info,
"zoned: zone size %llu not aligned to stripe %u",
zone_size, BTRFS_STRIPE_LEN);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
btrfs_err(fs_info, "zoned: mixed block groups not supported");
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
fs_info->zone_size = zone_size;
@@ -760,11 +775,10 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
*/
ret = btrfs_check_mountopts_zoned(fs_info);
if (ret)
- goto out;
+ return ret;
btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
-out:
- return ret;
+ return 0;
}
int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
@@ -1005,8 +1019,8 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
}
-/**
- * btrfs_find_allocatable_zones - find allocatable zones within a given region
+/*
+ * Find allocatable zones within a given region.
*
* @device: the device to allocate a region on
* @hole_start: the position of the hole to allocate the region
@@ -1423,7 +1437,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
if (num_sequential > 0)
- cache->seq_zone = true;
+ set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
if (num_conventional > 0) {
/* Zone capacity is always zone size in emulation */
@@ -1436,7 +1450,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
} else if (map->num_stripes == num_conventional) {
cache->alloc_offset = last_alloc;
- cache->zone_is_active = 1;
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
goto out;
}
}
@@ -1452,7 +1466,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
cache->alloc_offset = alloc_offsets[0];
cache->zone_capacity = caps[0];
- cache->zone_is_active = test_bit(0, active);
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
break;
case BTRFS_BLOCK_GROUP_DUP:
if (map->type & BTRFS_BLOCK_GROUP_DATA) {
@@ -1486,7 +1501,9 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
} else {
- cache->zone_is_active = test_bit(0, active);
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &cache->runtime_flags);
}
cache->alloc_offset = alloc_offsets[0];
cache->zone_capacity = min(caps[0], caps[1]);
@@ -1530,7 +1547,7 @@ out:
if (!ret) {
cache->meta_write_pointer = cache->alloc_offset + cache->start;
- if (cache->zone_is_active) {
+ if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) {
btrfs_get_block_group(cache);
spin_lock(&fs_info->zone_active_bgs_lock);
list_add_tail(&cache->active_bg_list,
@@ -1563,7 +1580,6 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
free = cache->zone_capacity - cache->alloc_offset;
/* We only need ->free_space in ALLOC_SEQ block groups */
- cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
cache->free_space_ctl->free_space = free;
cache->zone_unusable = unusable;
@@ -1633,7 +1649,7 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
if (!cache)
return false;
- ret = cache->seq_zone;
+ ret = !!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
btrfs_put_block_group(cache);
return ret;
@@ -1847,7 +1863,7 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
return device;
}
-/**
+/*
* Activate block group and underlying device zones
*
* @block_group: the block group to activate
@@ -1871,7 +1887,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
- if (block_group->zone_is_active) {
+ if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
ret = true;
goto out_unlock;
}
@@ -1897,7 +1913,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
}
/* Successfully activated all the zones */
- block_group->zone_is_active = 1;
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
space_info->active_total_bytes += block_group->length;
spin_unlock(&block_group->lock);
btrfs_try_granting_tickets(fs_info, space_info);
@@ -1960,7 +1976,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
int i;
spin_lock(&block_group->lock);
- if (!block_group->zone_is_active) {
+ if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
return 0;
}
@@ -2001,7 +2017,8 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
* Bail out if someone already deactivated the block group, or
* allocated space is left in the block group.
*/
- if (!block_group->zone_is_active) {
+ if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
btrfs_dec_block_group_ro(block_group);
return 0;
@@ -2014,7 +2031,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
}
}
- block_group->zone_is_active = 0;
+ clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
block_group->alloc_offset = block_group->zone_capacity;
block_group->free_space_ctl->free_space = 0;
btrfs_clear_treelog_bg(block_group);
@@ -2137,7 +2154,8 @@ static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
struct extent_buffer *eb)
{
- if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
+ if (!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &bg->runtime_flags) ||
+ eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
return;
if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
@@ -2222,13 +2240,14 @@ void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logica
ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));
spin_lock(&block_group->lock);
- if (!block_group->zoned_data_reloc_ongoing)
+ if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))
goto out;
/* All relocation extents are written. */
if (block_group->start + block_group->alloc_offset == logical + length) {
/* Now, release this block group for further allocations. */
- block_group->zoned_data_reloc_ongoing = 0;
+ clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
+ &block_group->runtime_flags);
}
out:
@@ -2300,7 +2319,9 @@ int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
list) {
if (!spin_trylock(&bg->lock))
continue;
- if (btrfs_zoned_bg_is_full(bg) || bg->zone_is_active) {
+ if (btrfs_zoned_bg_is_full(bg) ||
+ test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &bg->runtime_flags)) {
spin_unlock(&bg->lock);
continue;
}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index e17462db3a84..f43990985d80 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -5,6 +5,7 @@
#include <linux/types.h>
#include <linux/blkdev.h>
+#include "messages.h"
#include "volumes.h"
#include "disk-io.h"
#include "block-group.h"
@@ -36,6 +37,7 @@ int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info);
int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache);
void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
+struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev);
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info);
int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
@@ -103,6 +105,16 @@ static inline int btrfs_get_dev_zone_info(struct btrfs_device *device,
static inline void btrfs_destroy_dev_zone_info(struct btrfs_device *device) { }
+/*
+ * In case the kernel is compiled without CONFIG_BLK_DEV_ZONED we'll never call
+ * into btrfs_clone_dev_zone_info() so it's safe to return NULL here.
+ */
+static inline struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(
+ struct btrfs_device *orig_dev)
+{
+ return NULL;
+}
+
static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info)
{
if (!btrfs_is_zoned(fs_info))
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 35a0224d4eb7..e34f1ab99d56 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -94,7 +94,7 @@ static inline struct workspace *list_to_workspace(struct list_head *list)
void zstd_free_workspace(struct list_head *ws);
struct list_head *zstd_alloc_workspace(unsigned int level);
-/**
+/*
* Timer callback to free unused workspaces.
*
* @t: timer
@@ -616,7 +616,7 @@ done:
return ret;
}
-int zstd_decompress(struct list_head *ws, unsigned char *data_in,
+int zstd_decompress(struct list_head *ws, const u8 *data_in,
struct page *dest_page, unsigned long start_byte, size_t srclen,
size_t destlen)
{
diff --git a/fs/buffer.c b/fs/buffer.c
index 55e762a58eb6..d9c6d1fbb6dd 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -52,8 +52,8 @@
#include "internal.h"
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
-static int submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
- struct writeback_control *wbc);
+static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
+ struct writeback_control *wbc);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
@@ -152,7 +152,7 @@ static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
/*
* Default synchronous end-of-IO handler.. Just mark it up-to-date and
- * unlock the buffer. This is what ll_rw_block uses too.
+ * unlock the buffer.
*/
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
@@ -491,8 +491,8 @@ int inode_has_buffers(struct inode *inode)
* all already-submitted IO to complete, but does not queue any new
* writes to the disk.
*
- * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
- * you dirty the buffers, and then use osync_inode_buffers to wait for
+ * To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer
+ * as you dirty the buffers, and then use osync_inode_buffers to wait for
* completion. Any other dirty buffers which are not yet queued for
* write will not be flushed to disk by the osync.
*/
@@ -562,7 +562,7 @@ void write_boundary_block(struct block_device *bdev,
struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
if (bh) {
if (buffer_dirty(bh))
- ll_rw_block(REQ_OP_WRITE, 1, &bh);
+ write_dirty_buffer(bh, 0);
put_bh(bh);
}
}
@@ -1342,23 +1342,12 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
{
struct buffer_head *bh = __getblk(bdev, block, size);
if (likely(bh)) {
- ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, &bh);
+ bh_readahead(bh, REQ_RAHEAD);
brelse(bh);
}
}
EXPORT_SYMBOL(__breadahead);
-void __breadahead_gfp(struct block_device *bdev, sector_t block, unsigned size,
- gfp_t gfp)
-{
- struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
- if (likely(bh)) {
- ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, &bh);
- brelse(bh);
- }
-}
-EXPORT_SYMBOL(__breadahead_gfp);
-
/**
* __bread_gfp() - reads a specified block and returns the bh
* @bdev: the block_device to read from
@@ -1464,19 +1453,15 @@ EXPORT_SYMBOL(set_bh_page);
static void discard_buffer(struct buffer_head * bh)
{
- unsigned long b_state, b_state_old;
+ unsigned long b_state;
lock_buffer(bh);
clear_buffer_dirty(bh);
bh->b_bdev = NULL;
- b_state = bh->b_state;
- for (;;) {
- b_state_old = cmpxchg(&bh->b_state, b_state,
- (b_state & ~BUFFER_FLAGS_DISCARD));
- if (b_state_old == b_state)
- break;
- b_state = b_state_old;
- }
+ b_state = READ_ONCE(bh->b_state);
+ do {
+ } while (!try_cmpxchg(&bh->b_state, &b_state,
+ b_state & ~BUFFER_FLAGS_DISCARD));
unlock_buffer(bh);
}
@@ -1817,7 +1802,7 @@ done:
/*
* The page was marked dirty, but the buffers were
* clean. Someone wrote them back by hand with
- * ll_rw_block/submit_bh. A rare case.
+ * write_dirty_buffer/submit_bh. A rare case.
*/
end_page_writeback(page);
@@ -2033,7 +2018,7 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
!buffer_unwritten(bh) &&
(block_start < from || block_end > to)) {
- ll_rw_block(REQ_OP_READ, 1, &bh);
+ bh_read_nowait(bh, 0);
*wait_bh++=bh;
}
}
@@ -2352,7 +2337,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size)
struct address_space *mapping = inode->i_mapping;
const struct address_space_operations *aops = mapping->a_ops;
struct page *page;
- void *fsdata;
+ void *fsdata = NULL;
int err;
err = inode_newsize_ok(inode, size);
@@ -2378,7 +2363,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
const struct address_space_operations *aops = mapping->a_ops;
unsigned int blocksize = i_blocksize(inode);
struct page *page;
- void *fsdata;
+ void *fsdata = NULL;
pgoff_t index, curidx;
loff_t curpos;
unsigned zerofrom, offset, len;
@@ -2593,11 +2578,9 @@ int block_truncate_page(struct address_space *mapping,
set_buffer_uptodate(bh);
if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
- err = -EIO;
- ll_rw_block(REQ_OP_READ, 1, &bh);
- wait_on_buffer(bh);
+ err = bh_read(bh, 0);
/* Uhhuh. Read error. Complain and punt. */
- if (!buffer_uptodate(bh))
+ if (err < 0)
goto unlock;
}
@@ -2673,8 +2656,8 @@ static void end_bio_bh_io_sync(struct bio *bio)
bio_put(bio);
}
-static int submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
- struct writeback_control *wbc)
+static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
+ struct writeback_control *wbc)
{
const enum req_op op = opf & REQ_OP_MASK;
struct bio *bio;
@@ -2717,70 +2700,14 @@ static int submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
}
submit_bio(bio);
- return 0;
}
-int submit_bh(blk_opf_t opf, struct buffer_head *bh)
+void submit_bh(blk_opf_t opf, struct buffer_head *bh)
{
- return submit_bh_wbc(opf, bh, NULL);
+ submit_bh_wbc(opf, bh, NULL);
}
EXPORT_SYMBOL(submit_bh);
-/**
- * ll_rw_block: low-level access to block devices (DEPRECATED)
- * @opf: block layer request operation and flags.
- * @nr: number of &struct buffer_heads in the array
- * @bhs: array of pointers to &struct buffer_head
- *
- * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
- * requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE.
- * @opf contains flags modifying the detailed I/O behavior, most notably
- * %REQ_RAHEAD.
- *
- * This function drops any buffer that it cannot get a lock on (with the
- * BH_Lock state bit), any buffer that appears to be clean when doing a write
- * request, and any buffer that appears to be up-to-date when doing read
- * request. Further it marks as clean buffers that are processed for
- * writing (the buffer cache won't assume that they are actually clean
- * until the buffer gets unlocked).
- *
- * ll_rw_block sets b_end_io to simple completion handler that marks
- * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
- * any waiters.
- *
- * All of the buffers must be for the same device, and must also be a
- * multiple of the current approved size for the device.
- */
-void ll_rw_block(const blk_opf_t opf, int nr, struct buffer_head *bhs[])
-{
- const enum req_op op = opf & REQ_OP_MASK;
- int i;
-
- for (i = 0; i < nr; i++) {
- struct buffer_head *bh = bhs[i];
-
- if (!trylock_buffer(bh))
- continue;
- if (op == REQ_OP_WRITE) {
- if (test_clear_buffer_dirty(bh)) {
- bh->b_end_io = end_buffer_write_sync;
- get_bh(bh);
- submit_bh(opf, bh);
- continue;
- }
- } else {
- if (!buffer_uptodate(bh)) {
- bh->b_end_io = end_buffer_read_sync;
- get_bh(bh);
- submit_bh(opf, bh);
- continue;
- }
- }
- unlock_buffer(bh);
- }
-}
-EXPORT_SYMBOL(ll_rw_block);
-
void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
{
lock_buffer(bh);
@@ -2801,8 +2728,6 @@ EXPORT_SYMBOL(write_dirty_buffer);
*/
int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
{
- int ret = 0;
-
WARN_ON(atomic_read(&bh->b_count) < 1);
lock_buffer(bh);
if (test_clear_buffer_dirty(bh)) {
@@ -2817,14 +2742,14 @@ int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
- ret = submit_bh(REQ_OP_WRITE | op_flags, bh);
+ submit_bh(REQ_OP_WRITE | op_flags, bh);
wait_on_buffer(bh);
- if (!ret && !buffer_uptodate(bh))
- ret = -EIO;
+ if (!buffer_uptodate(bh))
+ return -EIO;
} else {
unlock_buffer(bh);
}
- return ret;
+ return 0;
}
EXPORT_SYMBOL(__sync_dirty_buffer);
@@ -3029,29 +2954,69 @@ int bh_uptodate_or_lock(struct buffer_head *bh)
EXPORT_SYMBOL(bh_uptodate_or_lock);
/**
- * bh_submit_read - Submit a locked buffer for reading
+ * __bh_read - Submit read for a locked buffer
* @bh: struct buffer_head
+ * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
+ * @wait: wait until reading finish
*
- * Returns zero on success and -EIO on error.
+ * Returns zero on success or don't wait, and -EIO on error.
*/
-int bh_submit_read(struct buffer_head *bh)
+int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
{
- BUG_ON(!buffer_locked(bh));
+ int ret = 0;
- if (buffer_uptodate(bh)) {
- unlock_buffer(bh);
- return 0;
- }
+ BUG_ON(!buffer_locked(bh));
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ, bh);
- wait_on_buffer(bh);
- if (buffer_uptodate(bh))
- return 0;
- return -EIO;
+ submit_bh(REQ_OP_READ | op_flags, bh);
+ if (wait) {
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh))
+ ret = -EIO;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__bh_read);
+
+/**
+ * __bh_read_batch - Submit read for a batch of unlocked buffers
+ * @nr: entry number of the buffer batch
+ * @bhs: a batch of struct buffer_head
+ * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
+ * @force_lock: force to get a lock on the buffer if set, otherwise drops any
+ * buffer that cannot lock.
+ *
+ * Returns zero on success or don't wait, and -EIO on error.
+ */
+void __bh_read_batch(int nr, struct buffer_head *bhs[],
+ blk_opf_t op_flags, bool force_lock)
+{
+ int i;
+
+ for (i = 0; i < nr; i++) {
+ struct buffer_head *bh = bhs[i];
+
+ if (buffer_uptodate(bh))
+ continue;
+
+ if (force_lock)
+ lock_buffer(bh);
+ else
+ if (!trylock_buffer(bh))
+ continue;
+
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ continue;
+ }
+
+ bh->b_end_io = end_buffer_read_sync;
+ get_bh(bh);
+ submit_bh(REQ_OP_READ | op_flags, bh);
+ }
}
-EXPORT_SYMBOL(bh_submit_read);
+EXPORT_SYMBOL(__bh_read_batch);
void __init buffer_init(void)
{
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 000a28f46e59..175a25fcade8 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -385,38 +385,35 @@ static int cachefiles_write(struct netfs_cache_resources *cres,
term_func, term_func_priv);
}
-/*
- * Prepare a read operation, shortening it to a cached/uncached
- * boundary as appropriate.
- */
-static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq,
- loff_t i_size)
+static inline enum netfs_io_source
+cachefiles_do_prepare_read(struct netfs_cache_resources *cres,
+ loff_t start, size_t *_len, loff_t i_size,
+ unsigned long *_flags, ino_t netfs_ino)
{
enum cachefiles_prepare_read_trace why;
- struct netfs_io_request *rreq = subreq->rreq;
- struct netfs_cache_resources *cres = &rreq->cache_resources;
- struct cachefiles_object *object;
+ struct cachefiles_object *object = NULL;
struct cachefiles_cache *cache;
struct fscache_cookie *cookie = fscache_cres_cookie(cres);
const struct cred *saved_cred;
struct file *file = cachefiles_cres_file(cres);
enum netfs_io_source ret = NETFS_DOWNLOAD_FROM_SERVER;
+ size_t len = *_len;
loff_t off, to;
ino_t ino = file ? file_inode(file)->i_ino : 0;
int rc;
- _enter("%zx @%llx/%llx", subreq->len, subreq->start, i_size);
+ _enter("%zx @%llx/%llx", len, start, i_size);
- if (subreq->start >= i_size) {
+ if (start >= i_size) {
ret = NETFS_FILL_WITH_ZEROES;
why = cachefiles_trace_read_after_eof;
goto out_no_object;
}
if (test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) {
- __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
+ __set_bit(NETFS_SREQ_COPY_TO_CACHE, _flags);
why = cachefiles_trace_read_no_data;
- if (!test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags))
+ if (!test_bit(NETFS_SREQ_ONDEMAND, _flags))
goto out_no_object;
}
@@ -437,7 +434,7 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *
retry:
off = cachefiles_inject_read_error();
if (off == 0)
- off = vfs_llseek(file, subreq->start, SEEK_DATA);
+ off = vfs_llseek(file, start, SEEK_DATA);
if (off < 0 && off >= (loff_t)-MAX_ERRNO) {
if (off == (loff_t)-ENXIO) {
why = cachefiles_trace_read_seek_nxio;
@@ -449,21 +446,22 @@ retry:
goto out;
}
- if (off >= subreq->start + subreq->len) {
+ if (off >= start + len) {
why = cachefiles_trace_read_found_hole;
goto download_and_store;
}
- if (off > subreq->start) {
+ if (off > start) {
off = round_up(off, cache->bsize);
- subreq->len = off - subreq->start;
+ len = off - start;
+ *_len = len;
why = cachefiles_trace_read_found_part;
goto download_and_store;
}
to = cachefiles_inject_read_error();
if (to == 0)
- to = vfs_llseek(file, subreq->start, SEEK_HOLE);
+ to = vfs_llseek(file, start, SEEK_HOLE);
if (to < 0 && to >= (loff_t)-MAX_ERRNO) {
trace_cachefiles_io_error(object, file_inode(file), to,
cachefiles_trace_seek_error);
@@ -471,12 +469,13 @@ retry:
goto out;
}
- if (to < subreq->start + subreq->len) {
- if (subreq->start + subreq->len >= i_size)
+ if (to < start + len) {
+ if (start + len >= i_size)
to = round_up(to, cache->bsize);
else
to = round_down(to, cache->bsize);
- subreq->len = to - subreq->start;
+ len = to - start;
+ *_len = len;
}
why = cachefiles_trace_read_have_data;
@@ -484,12 +483,11 @@ retry:
goto out;
download_and_store:
- __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
- if (test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags)) {
- rc = cachefiles_ondemand_read(object, subreq->start,
- subreq->len);
+ __set_bit(NETFS_SREQ_COPY_TO_CACHE, _flags);
+ if (test_bit(NETFS_SREQ_ONDEMAND, _flags)) {
+ rc = cachefiles_ondemand_read(object, start, len);
if (!rc) {
- __clear_bit(NETFS_SREQ_ONDEMAND, &subreq->flags);
+ __clear_bit(NETFS_SREQ_ONDEMAND, _flags);
goto retry;
}
ret = NETFS_INVALID_READ;
@@ -497,11 +495,35 @@ download_and_store:
out:
cachefiles_end_secure(cache, saved_cred);
out_no_object:
- trace_cachefiles_prep_read(subreq, ret, why, ino);
+ trace_cachefiles_prep_read(object, start, len, *_flags, ret, why, ino, netfs_ino);
return ret;
}
/*
+ * Prepare a read operation, shortening it to a cached/uncached
+ * boundary as appropriate.
+ */
+static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq,
+ loff_t i_size)
+{
+ return cachefiles_do_prepare_read(&subreq->rreq->cache_resources,
+ subreq->start, &subreq->len, i_size,
+ &subreq->flags, subreq->rreq->inode->i_ino);
+}
+
+/*
+ * Prepare an on-demand read operation, shortening it to a cached/uncached
+ * boundary as appropriate.
+ */
+static enum netfs_io_source
+cachefiles_prepare_ondemand_read(struct netfs_cache_resources *cres,
+ loff_t start, size_t *_len, loff_t i_size,
+ unsigned long *_flags, ino_t ino)
+{
+ return cachefiles_do_prepare_read(cres, start, _len, i_size, _flags, ino);
+}
+
+/*
* Prepare for a write to occur.
*/
int __cachefiles_prepare_write(struct cachefiles_object *object,
@@ -621,6 +643,7 @@ static const struct netfs_cache_ops cachefiles_netfs_cache_ops = {
.write = cachefiles_write,
.prepare_read = cachefiles_prepare_read,
.prepare_write = cachefiles_prepare_write,
+ .prepare_ondemand_read = cachefiles_prepare_ondemand_read,
.query_occupancy = cachefiles_query_occupancy,
};
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index facf2ebe464b..03ca8f2f657a 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -15,9 +15,8 @@
* file or directory. The caller must hold the inode lock.
*/
static bool __cachefiles_mark_inode_in_use(struct cachefiles_object *object,
- struct dentry *dentry)
+ struct inode *inode)
{
- struct inode *inode = d_backing_inode(dentry);
bool can_use = false;
if (!(inode->i_flags & S_KERNEL_FILE)) {
@@ -26,21 +25,18 @@ static bool __cachefiles_mark_inode_in_use(struct cachefiles_object *object,
can_use = true;
} else {
trace_cachefiles_mark_failed(object, inode);
- pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n",
- dentry, inode->i_ino);
}
return can_use;
}
static bool cachefiles_mark_inode_in_use(struct cachefiles_object *object,
- struct dentry *dentry)
+ struct inode *inode)
{
- struct inode *inode = d_backing_inode(dentry);
bool can_use;
inode_lock(inode);
- can_use = __cachefiles_mark_inode_in_use(object, dentry);
+ can_use = __cachefiles_mark_inode_in_use(object, inode);
inode_unlock(inode);
return can_use;
}
@@ -49,21 +45,17 @@ static bool cachefiles_mark_inode_in_use(struct cachefiles_object *object,
* Unmark a backing inode. The caller must hold the inode lock.
*/
static void __cachefiles_unmark_inode_in_use(struct cachefiles_object *object,
- struct dentry *dentry)
+ struct inode *inode)
{
- struct inode *inode = d_backing_inode(dentry);
-
inode->i_flags &= ~S_KERNEL_FILE;
trace_cachefiles_mark_inactive(object, inode);
}
static void cachefiles_do_unmark_inode_in_use(struct cachefiles_object *object,
- struct dentry *dentry)
+ struct inode *inode)
{
- struct inode *inode = d_backing_inode(dentry);
-
inode_lock(inode);
- __cachefiles_unmark_inode_in_use(object, dentry);
+ __cachefiles_unmark_inode_in_use(object, inode);
inode_unlock(inode);
}
@@ -77,14 +69,12 @@ void cachefiles_unmark_inode_in_use(struct cachefiles_object *object,
struct cachefiles_cache *cache = object->volume->cache;
struct inode *inode = file_inode(file);
- if (inode) {
- cachefiles_do_unmark_inode_in_use(object, file->f_path.dentry);
+ cachefiles_do_unmark_inode_in_use(object, inode);
- if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) {
- atomic_long_add(inode->i_blocks, &cache->b_released);
- if (atomic_inc_return(&cache->f_released))
- cachefiles_state_changed(cache);
- }
+ if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) {
+ atomic_long_add(inode->i_blocks, &cache->b_released);
+ if (atomic_inc_return(&cache->f_released))
+ cachefiles_state_changed(cache);
}
}
@@ -164,8 +154,11 @@ retry:
inode_lock(d_inode(subdir));
inode_unlock(d_inode(dir));
- if (!__cachefiles_mark_inode_in_use(NULL, subdir))
+ if (!__cachefiles_mark_inode_in_use(NULL, d_inode(subdir))) {
+ pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n",
+ subdir, d_inode(subdir)->i_ino);
goto mark_error;
+ }
inode_unlock(d_inode(subdir));
@@ -224,9 +217,7 @@ nomem_d_alloc:
void cachefiles_put_directory(struct dentry *dir)
{
if (dir) {
- inode_lock(dir->d_inode);
- __cachefiles_unmark_inode_in_use(NULL, dir);
- inode_unlock(dir->d_inode);
+ cachefiles_do_unmark_inode_in_use(NULL, d_inode(dir));
dput(dir);
}
}
@@ -410,7 +401,7 @@ try_again:
"Rename failed with error %d", ret);
}
- __cachefiles_unmark_inode_in_use(object, rep);
+ __cachefiles_unmark_inode_in_use(object, d_inode(rep));
unlock_rename(cache->graveyard, dir);
dput(grave);
_leave(" = 0");
@@ -451,84 +442,72 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object)
const struct cred *saved_cred;
struct dentry *fan = volume->fanout[(u8)object->cookie->key_hash];
struct file *file;
- struct path path;
+ const struct path parentpath = { .mnt = cache->mnt, .dentry = fan };
uint64_t ni_size;
long ret;
cachefiles_begin_secure(cache, &saved_cred);
- path.mnt = cache->mnt;
ret = cachefiles_inject_write_error();
- if (ret == 0)
- path.dentry = vfs_tmpfile(&init_user_ns, fan, S_IFREG, O_RDWR);
- else
- path.dentry = ERR_PTR(ret);
- if (IS_ERR(path.dentry)) {
- trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(path.dentry),
+ if (ret == 0) {
+ file = vfs_tmpfile_open(&init_user_ns, &parentpath, S_IFREG,
+ O_RDWR | O_LARGEFILE | O_DIRECT,
+ cache->cache_cred);
+ ret = PTR_ERR_OR_ZERO(file);
+ }
+ if (ret) {
+ trace_cachefiles_vfs_error(object, d_inode(fan), ret,
cachefiles_trace_tmpfile_error);
- if (PTR_ERR(path.dentry) == -EIO)
+ if (ret == -EIO)
cachefiles_io_error_obj(object, "Failed to create tmpfile");
- file = ERR_CAST(path.dentry);
- goto out;
+ goto err;
}
- trace_cachefiles_tmpfile(object, d_backing_inode(path.dentry));
+ trace_cachefiles_tmpfile(object, file_inode(file));
- if (!cachefiles_mark_inode_in_use(object, path.dentry)) {
- file = ERR_PTR(-EBUSY);
- goto out_dput;
- }
+ /* This is a newly created file with no other possible user */
+ if (!cachefiles_mark_inode_in_use(object, file_inode(file)))
+ WARN_ON(1);
ret = cachefiles_ondemand_init_object(object);
- if (ret < 0) {
- file = ERR_PTR(ret);
- goto out_unuse;
- }
+ if (ret < 0)
+ goto err_unuse;
ni_size = object->cookie->object_size;
ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE);
if (ni_size > 0) {
- trace_cachefiles_trunc(object, d_backing_inode(path.dentry), 0, ni_size,
+ trace_cachefiles_trunc(object, file_inode(file), 0, ni_size,
cachefiles_trunc_expand_tmpfile);
ret = cachefiles_inject_write_error();
if (ret == 0)
- ret = vfs_truncate(&path, ni_size);
+ ret = vfs_truncate(&file->f_path, ni_size);
if (ret < 0) {
trace_cachefiles_vfs_error(
- object, d_backing_inode(path.dentry), ret,
+ object, file_inode(file), ret,
cachefiles_trace_trunc_error);
- file = ERR_PTR(ret);
- goto out_unuse;
+ goto err_unuse;
}
}
- file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT,
- d_backing_inode(path.dentry), cache->cache_cred);
- if (IS_ERR(file)) {
- trace_cachefiles_vfs_error(object, d_backing_inode(path.dentry),
- PTR_ERR(file),
- cachefiles_trace_open_error);
- goto out_unuse;
- }
+ ret = -EINVAL;
if (unlikely(!file->f_op->read_iter) ||
unlikely(!file->f_op->write_iter)) {
fput(file);
pr_notice("Cache does not support read_iter and write_iter\n");
- file = ERR_PTR(-EINVAL);
- goto out_unuse;
+ goto err_unuse;
}
-
- goto out_dput;
-
-out_unuse:
- cachefiles_do_unmark_inode_in_use(object, path.dentry);
-out_dput:
- dput(path.dentry);
out:
cachefiles_end_secure(cache, saved_cred);
return file;
+
+err_unuse:
+ cachefiles_do_unmark_inode_in_use(object, file_inode(file));
+ fput(file);
+err:
+ file = ERR_PTR(ret);
+ goto out;
}
/*
@@ -569,8 +548,11 @@ static bool cachefiles_open_file(struct cachefiles_object *object,
_enter("%pd", dentry);
- if (!cachefiles_mark_inode_in_use(object, dentry))
+ if (!cachefiles_mark_inode_in_use(object, d_inode(dentry))) {
+ pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n",
+ dentry, d_inode(dentry)->i_ino);
return false;
+ }
/* We need to open a file interface onto a data file now as we can't do
* it on demand because writeback called from do_exit() sees
@@ -624,7 +606,7 @@ check_failed:
error_fput:
fput(file);
error:
- cachefiles_do_unmark_inode_in_use(object, dentry);
+ cachefiles_do_unmark_inode_in_use(object, d_inode(dentry));
dput(dentry);
return false;
}
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index f4fc8e0b847c..c7e8dd5b58d4 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -85,13 +85,14 @@ retry:
return acl;
}
-int ceph_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int ceph_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int ret = 0, size = 0;
const char *name = NULL;
char *value = NULL;
struct iattr newattrs;
+ struct inode *inode = d_inode(dentry);
struct timespec64 old_ctime = inode->i_ctime;
umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index dcf701b05cc1..8c74871e37c9 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -288,7 +288,7 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
}
len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);
- iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
+ iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);
err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter);
if (err == 0)
err = -EFAULT;
@@ -327,7 +327,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
}
dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len);
- iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
+ iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);
err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);
if (err < 0) {
dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err);
@@ -1367,7 +1367,7 @@ out:
folio_put(folio);
if (check_cap)
- ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
+ ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY);
return copied;
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 53cfe026b3ea..4b159f97fe7b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -754,6 +754,7 @@ void ceph_add_cap(struct inode *inode,
cap->issue_seq = seq;
cap->mseq = mseq;
cap->cap_gen = gen;
+ wake_up_all(&ci->i_cap_wq);
}
/*
@@ -1897,8 +1898,7 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
* CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
* further delay.
*/
-void ceph_check_caps(struct ceph_inode_info *ci, int flags,
- struct ceph_mds_session *session)
+void ceph_check_caps(struct ceph_inode_info *ci, int flags)
{
struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
@@ -1912,15 +1912,14 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
bool queue_invalidate = false;
bool tried_invalidate = false;
bool queue_writeback = false;
-
- if (session)
- ceph_get_mds_session(session);
+ struct ceph_mds_session *session = NULL;
spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+ ci->i_ceph_flags |= CEPH_I_ASYNC_CHECK_CAPS;
+
/* Don't send messages until we get async create reply */
spin_unlock(&ci->i_ceph_lock);
- ceph_put_mds_session(session);
return;
}
@@ -2247,7 +2246,6 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req1 = NULL, *req2 = NULL;
- unsigned int max_sessions;
int ret, err = 0;
spin_lock(&ci->i_unsafe_lock);
@@ -2266,27 +2264,23 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
spin_unlock(&ci->i_unsafe_lock);
/*
- * The mdsc->max_sessions is unlikely to be changed
- * mostly, here we will retry it by reallocating the
- * sessions array memory to get rid of the mdsc->mutex
- * lock.
- */
-retry:
- max_sessions = mdsc->max_sessions;
-
- /*
* Trigger to flush the journal logs in all the relevant MDSes
* manually, or in the worst case we must wait at most 5 seconds
* to wait the journal logs to be flushed by the MDSes periodically.
*/
- if ((req1 || req2) && likely(max_sessions)) {
- struct ceph_mds_session **sessions = NULL;
- struct ceph_mds_session *s;
+ if (req1 || req2) {
struct ceph_mds_request *req;
+ struct ceph_mds_session **sessions;
+ struct ceph_mds_session *s;
+ unsigned int max_sessions;
int i;
- sessions = kzalloc(max_sessions * sizeof(s), GFP_KERNEL);
+ mutex_lock(&mdsc->mutex);
+ max_sessions = mdsc->max_sessions;
+
+ sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL);
if (!sessions) {
+ mutex_unlock(&mdsc->mutex);
err = -ENOMEM;
goto out;
}
@@ -2298,16 +2292,6 @@ retry:
s = req->r_session;
if (!s)
continue;
- if (unlikely(s->s_mds >= max_sessions)) {
- spin_unlock(&ci->i_unsafe_lock);
- for (i = 0; i < max_sessions; i++) {
- s = sessions[i];
- if (s)
- ceph_put_mds_session(s);
- }
- kfree(sessions);
- goto retry;
- }
if (!sessions[s->s_mds]) {
s = ceph_get_mds_session(s);
sessions[s->s_mds] = s;
@@ -2320,16 +2304,6 @@ retry:
s = req->r_session;
if (!s)
continue;
- if (unlikely(s->s_mds >= max_sessions)) {
- spin_unlock(&ci->i_unsafe_lock);
- for (i = 0; i < max_sessions; i++) {
- s = sessions[i];
- if (s)
- ceph_put_mds_session(s);
- }
- kfree(sessions);
- goto retry;
- }
if (!sessions[s->s_mds]) {
s = ceph_get_mds_session(s);
sessions[s->s_mds] = s;
@@ -2341,11 +2315,12 @@ retry:
/* the auth MDS */
spin_lock(&ci->i_ceph_lock);
if (ci->i_auth_cap) {
- s = ci->i_auth_cap->session;
- if (!sessions[s->s_mds])
- sessions[s->s_mds] = ceph_get_mds_session(s);
+ s = ci->i_auth_cap->session;
+ if (!sessions[s->s_mds])
+ sessions[s->s_mds] = ceph_get_mds_session(s);
}
spin_unlock(&ci->i_ceph_lock);
+ mutex_unlock(&mdsc->mutex);
/* send flush mdlog request to MDSes */
for (i = 0; i < max_sessions; i++) {
@@ -2759,13 +2734,17 @@ again:
* on transition from wanted -> needed caps. This is needed
* for WRBUFFER|WR -> WR to avoid a new WR sync write from
* going before a prior buffered writeback happens.
+ *
+ * For RDCACHE|RD -> RD, there is not need to wait and we can
+ * just exclude the revoking caps and force to sync read.
*/
int not = want & ~(have & need);
int revoking = implemented & ~have;
+ int exclude = revoking & not;
dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
inode, ceph_cap_string(have), ceph_cap_string(not),
ceph_cap_string(revoking));
- if ((revoking & not) == 0) {
+ if (!exclude || !(exclude & CEPH_CAP_FILE_BUFFER)) {
if (!snap_rwsem_locked &&
!ci->i_head_snapc &&
(need & CEPH_CAP_FILE_WR)) {
@@ -2787,7 +2766,7 @@ again:
snap_rwsem_locked = true;
}
if ((have & want) == want)
- *got = need | want;
+ *got = need | (want & ~exclude);
else
*got = need;
ceph_take_cap_refs(ci, *got, true);
@@ -2870,7 +2849,7 @@ static void check_max_size(struct inode *inode, loff_t endoff)
check = 1;
spin_unlock(&ci->i_ceph_lock);
if (check)
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY);
}
static inline int get_used_fmode(int caps)
@@ -3159,7 +3138,7 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
switch (mode) {
case PUT_CAP_REFS_SYNC:
if (last)
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
else if (flushsnaps)
ceph_flush_snaps(ci, NULL);
break;
@@ -3274,7 +3253,7 @@ unlock:
spin_unlock(&ci->i_ceph_lock);
if (last) {
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
} else if (flush_snaps) {
ceph_flush_snaps(ci, NULL);
}
@@ -3550,6 +3529,9 @@ static void handle_cap_grant(struct inode *inode,
check_caps = 1; /* check auth cap only */
else
check_caps = 2; /* check all caps */
+ /* If there is new caps, try to wake up the waiters */
+ if (~cap->issued & newcaps)
+ wake = true;
cap->issued = newcaps;
cap->implemented |= newcaps;
} else if (cap->issued == newcaps) {
@@ -3620,10 +3602,9 @@ static void handle_cap_grant(struct inode *inode,
mutex_unlock(&session->s_mutex);
if (check_caps == 1)
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL,
- session);
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL);
else if (check_caps == 2)
- ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session);
+ ceph_check_caps(ci, CHECK_CAPS_NOINVAL);
}
/*
@@ -4349,7 +4330,7 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
if (inode) {
spin_unlock(&mdsc->cap_delay_lock);
dout("check_delayed_caps on %p\n", inode);
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
iput(inode);
spin_lock(&mdsc->cap_delay_lock);
}
@@ -4378,7 +4359,7 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s)
dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode));
spin_unlock(&mdsc->cap_dirty_lock);
ceph_wait_on_async_create(inode);
- ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH);
iput(inode);
spin_lock(&mdsc->cap_dirty_lock);
}
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index e7e2ebac330d..6c7026cc8988 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -2033,7 +2033,7 @@ const struct inode_operations ceph_dir_iops = {
.getattr = ceph_getattr,
.setattr = ceph_setattr,
.listxattr = ceph_listxattr,
- .get_acl = ceph_get_acl,
+ .get_inode_acl = ceph_get_acl,
.set_acl = ceph_set_acl,
.mknod = ceph_mknod,
.symlink = ceph_symlink,
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index e0fa66ac8b9f..f780e4e0d062 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -181,6 +181,7 @@ struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino)
static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
{
struct inode *inode = __lookup_inode(sb, ino);
+ struct ceph_inode_info *ci = ceph_inode(inode);
int err;
if (IS_ERR(inode))
@@ -192,7 +193,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
return ERR_PTR(err);
}
/* -ESTALE if inode as been unlinked and no file is open */
- if ((inode->i_nlink == 0) && (atomic_read(&inode->i_count) == 1)) {
+ if ((inode->i_nlink == 0) && !__ceph_is_file_opened(ci)) {
iput(inode);
return ERR_PTR(-ESTALE);
}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 04fd34557de8..764598e1efd9 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -313,7 +313,7 @@ int ceph_renew_caps(struct inode *inode, int fmode)
spin_unlock(&ci->i_ceph_lock);
dout("renew caps %p want %s issued %s updating mds_wanted\n",
inode, ceph_cap_string(wanted), ceph_cap_string(issued));
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
return 0;
}
spin_unlock(&ci->i_ceph_lock);
@@ -408,7 +408,7 @@ int ceph_open(struct inode *inode, struct file *file)
if ((issued & wanted) != wanted &&
(mds_wanted & wanted) != wanted &&
ceph_snap(inode) != CEPH_SNAPDIR)
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
return ceph_init_file(inode, file, fmode);
} else if (ceph_snap(inode) != CEPH_NOSNAP &&
@@ -534,14 +534,23 @@ static void wake_async_create_waiters(struct inode *inode,
struct ceph_mds_session *session)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ bool check_cap = false;
spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);
+
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CHECK_CAPS) {
+ ci->i_ceph_flags &= ~CEPH_I_ASYNC_CHECK_CAPS;
+ check_cap = true;
+ }
}
ceph_kick_flushing_inode_caps(session, ci);
spin_unlock(&ci->i_ceph_lock);
+
+ if (check_cap)
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH);
}
static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
@@ -1092,7 +1101,7 @@ static void ceph_aio_complete(struct inode *inode,
loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len;
if (endoff > i_size_read(inode)) {
if (ceph_inode_set_size(inode, endoff))
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY);
}
spin_lock(&ci->i_ceph_lock);
@@ -1161,7 +1170,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
aio_req->total_len = rc + zlen;
}
- iov_iter_bvec(&i, READ, osd_data->bvec_pos.bvecs,
+ iov_iter_bvec(&i, ITER_DEST, osd_data->bvec_pos.bvecs,
osd_data->num_bvecs, len);
iov_iter_advance(&i, rc);
iov_iter_zero(zlen, &i);
@@ -1400,7 +1409,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
int zlen = min_t(size_t, len - ret,
size - pos - ret);
- iov_iter_bvec(&i, READ, bvecs, num_pages, len);
+ iov_iter_bvec(&i, ITER_DEST, bvecs, num_pages, len);
iov_iter_advance(&i, ret);
iov_iter_zero(zlen, &i);
ret += zlen;
@@ -1421,8 +1430,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
if (write && pos > size) {
if (ceph_inode_set_size(inode, pos))
ceph_check_caps(ceph_inode(inode),
- CHECK_CAPS_AUTHONLY,
- NULL);
+ CHECK_CAPS_AUTHONLY);
}
}
@@ -1577,8 +1585,7 @@ out:
check_caps = ceph_inode_set_size(inode, pos);
if (check_caps)
ceph_check_caps(ceph_inode(inode),
- CHECK_CAPS_AUTHONLY,
- NULL);
+ CHECK_CAPS_AUTHONLY);
}
}
@@ -1906,7 +1913,7 @@ retry_snap:
if (dirty)
__mark_inode_dirty(inode, dirty);
if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
- ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH);
}
dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
@@ -2521,8 +2528,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
/* Let the MDS know about dst file size change */
if (ceph_inode_set_size(dst_inode, dst_off) ||
ceph_quota_is_max_bytes_approaching(dst_inode, dst_off))
- ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH,
- NULL);
+ ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH);
}
/* Mark Fw dirty */
spin_lock(&dst_ci->i_ceph_lock);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 42351d7a0dd6..23d05ec87fcc 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -126,7 +126,7 @@ const struct inode_operations ceph_file_iops = {
.setattr = ceph_setattr,
.getattr = ceph_getattr,
.listxattr = ceph_listxattr,
- .get_acl = ceph_get_acl,
+ .get_inode_acl = ceph_get_acl,
.set_acl = ceph_set_acl,
};
@@ -362,7 +362,7 @@ static int ceph_fill_fragtree(struct inode *inode,
if (nsplits != ci->i_fragtree_nsplits) {
update = true;
} else if (nsplits) {
- i = prandom_u32() % nsplits;
+ i = get_random_u32_below(nsplits);
id = le32_to_cpu(fragtree->splits[i].frag);
if (!__ceph_find_frag(ci, id))
update = true;
@@ -1909,7 +1909,7 @@ static void ceph_do_invalidate_pages(struct inode *inode)
mutex_unlock(&ci->i_truncate_mutex);
out:
if (check)
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
}
/*
@@ -1969,7 +1969,7 @@ retry:
mutex_unlock(&ci->i_truncate_mutex);
if (wrbuffer_refs == 0)
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
wake_up_all(&ci->i_cap_wq);
}
@@ -1991,7 +1991,7 @@ static void ceph_inode_work(struct work_struct *work)
__ceph_do_pending_vmtruncate(inode);
if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, &ci->i_work_mask))
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, &ci->i_work_mask))
ceph_flush_snaps(ci, NULL);
@@ -2192,6 +2192,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
&prealloc_cf);
inode->i_ctime = attr->ia_ctime;
+ inode_inc_iversion_raw(inode);
}
release &= issued;
@@ -2254,7 +2255,7 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
err = __ceph_setattr(inode, attr);
if (err >= 0 && (attr->ia_valid & ATTR_MODE))
- err = posix_acl_chmod(&init_user_ns, inode, attr->ia_mode);
+ err = posix_acl_chmod(&init_user_ns, dentry, attr->ia_mode);
return err;
}
@@ -2356,6 +2357,7 @@ int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
goto out;
}
+ req->r_feature_needed = CEPHFS_FEATURE_OP_GETVXATTR;
req->r_path2 = kstrdup(name, GFP_NOFS);
if (!req->r_path2) {
err = -ENOMEM;
@@ -2447,6 +2449,7 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct kstat *stat, u32 request_mask, unsigned int flags)
{
struct inode *inode = d_inode(path->dentry);
+ struct super_block *sb = inode->i_sb;
struct ceph_inode_info *ci = ceph_inode(inode);
u32 valid_mask = STATX_BASIC_STATS;
int err = 0;
@@ -2476,16 +2479,34 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
}
if (ceph_snap(inode) == CEPH_NOSNAP)
- stat->dev = inode->i_sb->s_dev;
+ stat->dev = sb->s_dev;
else
stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
if (S_ISDIR(inode->i_mode)) {
- if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
- RBYTES))
+ if (ceph_test_mount_opt(ceph_sb_to_client(sb), RBYTES)) {
stat->size = ci->i_rbytes;
- else
+ } else if (ceph_snap(inode) == CEPH_SNAPDIR) {
+ struct ceph_inode_info *pci;
+ struct ceph_snap_realm *realm;
+ struct inode *parent;
+
+ parent = ceph_lookup_inode(sb, ceph_ino(inode));
+ if (IS_ERR(parent))
+ return PTR_ERR(parent);
+
+ pci = ceph_inode(parent);
+ spin_lock(&pci->i_ceph_lock);
+ realm = pci->i_snap_realm;
+ if (realm)
+ stat->size = realm->num_snaps;
+ else
+ stat->size = 0;
+ spin_unlock(&pci->i_ceph_lock);
+ iput(parent);
+ } else {
stat->size = ci->i_files + ci->i_subdirs;
+ }
stat->blocks = 0;
stat->blksize = 65536;
/*
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 6e061bf62ad4..deac817647eb 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -253,7 +253,7 @@ static long ceph_ioctl_lazyio(struct file *file)
spin_unlock(&ci->i_ceph_lock);
dout("ioctl_layzio: file %p marked lazy\n", file);
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
} else {
dout("ioctl_layzio: file %p already lazy\n", file);
}
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 3e2843e86e27..f3b461c708a8 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -364,7 +364,7 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
*fcntl_count = 0;
*flock_count = 0;
- ctx = inode->i_flctx;
+ ctx = locks_inode_context(inode);
if (ctx) {
spin_lock(&ctx->flc_lock);
list_for_each_entry(lock, &ctx->flc_posix, fl_list)
@@ -418,7 +418,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
int num_fcntl_locks, int num_flock_locks)
{
struct file_lock *lock;
- struct file_lock_context *ctx = inode->i_flctx;
+ struct file_lock_context *ctx = locks_inode_context(inode);
int err = 0;
int seen_fcntl = 0;
int seen_flock = 0;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 80f8b9ec1a31..26a0a8b9975e 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2318,6 +2318,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
INIT_LIST_HEAD(&req->r_unsafe_dir_item);
INIT_LIST_HEAD(&req->r_unsafe_target_item);
req->r_fmode = -1;
+ req->r_feature_needed = -1;
kref_init(&req->r_kref);
RB_CLEAR_NODE(&req->r_node);
INIT_LIST_HEAD(&req->r_wait);
@@ -2916,6 +2917,16 @@ static void __do_request(struct ceph_mds_client *mdsc,
dout("do_request mds%d session %p state %s\n", mds, session,
ceph_session_state_name(session->s_state));
+
+ /*
+ * The old ceph will crash the MDSs when see unknown OPs
+ */
+ if (req->r_feature_needed > 0 &&
+ !test_bit(req->r_feature_needed, &session->s_features)) {
+ err = -EOPNOTSUPP;
+ goto out_session;
+ }
+
if (session->s_state != CEPH_MDS_SESSION_OPEN &&
session->s_state != CEPH_MDS_SESSION_HUNG) {
/*
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 256e3eada6c1..0598faa50e2e 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -31,8 +31,9 @@ enum ceph_feature_type {
CEPHFS_FEATURE_METRIC_COLLECT,
CEPHFS_FEATURE_ALTERNATE_NAME,
CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
+ CEPHFS_FEATURE_OP_GETVXATTR,
- CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
+ CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR,
};
#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
@@ -44,6 +45,7 @@ enum ceph_feature_type {
CEPHFS_FEATURE_DELEG_INO, \
CEPHFS_FEATURE_METRIC_COLLECT, \
CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \
+ CEPHFS_FEATURE_OP_GETVXATTR, \
}
/*
@@ -336,6 +338,8 @@ struct ceph_mds_request {
long long r_dir_ordered_cnt;
int r_readdir_cache_idx;
+ int r_feature_needed;
+
struct ceph_cap_reservation r_caps_reservation;
};
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 8d0a6d2c2da4..7dac21ee6ce7 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -29,7 +29,7 @@ static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy)
return -1;
/* pick */
- n = prandom_u32() % n;
+ n = get_random_u32_below(n);
for (j = 0, i = 0; i < m->possible_max_rank; i++) {
if (CEPH_MDS_IS_READY(i, ignore_laggy))
j++;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 864cdaa0d2bd..e4151852184e 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -763,7 +763,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
struct ceph_mds_snap_realm *ri; /* encoded */
__le64 *snaps; /* encoded */
__le64 *prior_parent_snaps; /* encoded */
- struct ceph_snap_realm *realm = NULL;
+ struct ceph_snap_realm *realm;
struct ceph_snap_realm *first_realm = NULL;
struct ceph_snap_realm *realm_to_rebuild = NULL;
int rebuild_snapcs;
@@ -774,6 +774,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
dout("%s deletion=%d\n", __func__, deletion);
more:
+ realm = NULL;
rebuild_snapcs = 0;
ceph_decode_need(&p, e, sizeof(*ri), bad);
ri = p;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 40630e6f691c..30bdb391a0dc 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -593,6 +593,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
#define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */
#define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT)
#define CEPH_I_SHUTDOWN (1 << 13) /* inode is no longer usable */
+#define CEPH_I_ASYNC_CHECK_CAPS (1 << 14) /* check caps immediately after async
+ creating finishes */
/*
* Masks of ceph inode work.
@@ -1117,7 +1119,7 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx);
struct posix_acl *ceph_get_acl(struct inode *, int, bool);
int ceph_set_acl(struct user_namespace *mnt_userns,
- struct inode *inode, struct posix_acl *acl, int type);
+ struct dentry *dentry, struct posix_acl *acl, int type);
int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
struct ceph_acl_sec_ctx *as_ctx);
void ceph_init_inode_acls(struct inode *inode,
@@ -1200,8 +1202,7 @@ extern void ceph_remove_capsnap(struct inode *inode,
extern void ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_session **psession);
extern bool __ceph_should_report_size(struct ceph_inode_info *ci);
-extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
- struct ceph_mds_session *session);
+extern void ceph_check_caps(struct ceph_inode_info *ci, int flags);
extern unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
extern int ceph_drop_caps_for_unlink(struct inode *inode);
diff --git a/fs/char_dev.c b/fs/char_dev.c
index ba0ded7842a7..13deb45f1ec6 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -483,17 +483,24 @@ int cdev_add(struct cdev *p, dev_t dev, unsigned count)
p->dev = dev;
p->count = count;
- if (WARN_ON(dev == WHITEOUT_DEV))
- return -EBUSY;
+ if (WARN_ON(dev == WHITEOUT_DEV)) {
+ error = -EBUSY;
+ goto err;
+ }
error = kobj_map(cdev_map, dev, count, NULL,
exact_match, exact_lock, p);
if (error)
- return error;
+ goto err;
kobject_get(p->kobj.parent);
return 0;
+
+err:
+ kfree_const(p->kobj.name);
+ p->kobj.name = NULL;
+ return error;
}
/**
@@ -547,7 +554,7 @@ int cdev_device_add(struct cdev *cdev, struct device *dev)
}
rc = device_add(dev);
- if (rc)
+ if (rc && dev->devt)
cdev_del(cdev);
return rc;
diff --git a/fs/cifs/cached_dir.c b/fs/cifs/cached_dir.c
index b401339f6e73..60399081046a 100644
--- a/fs/cifs/cached_dir.c
+++ b/fs/cifs/cached_dir.c
@@ -5,12 +5,99 @@
* Copyright (c) 2022, Ronnie Sahlberg <lsahlber@redhat.com>
*/
+#include <linux/namei.h>
#include "cifsglob.h"
#include "cifsproto.h"
#include "cifs_debug.h"
#include "smb2proto.h"
#include "cached_dir.h"
+static struct cached_fid *init_cached_dir(const char *path);
+static void free_cached_dir(struct cached_fid *cfid);
+
+static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids,
+ const char *path,
+ bool lookup_only)
+{
+ struct cached_fid *cfid;
+
+ spin_lock(&cfids->cfid_list_lock);
+ list_for_each_entry(cfid, &cfids->entries, entry) {
+ if (!strcmp(cfid->path, path)) {
+ /*
+ * If it doesn't have a lease it is either not yet
+ * fully cached or it may be in the process of
+ * being deleted due to a lease break.
+ */
+ if (!cfid->has_lease) {
+ spin_unlock(&cfids->cfid_list_lock);
+ return NULL;
+ }
+ kref_get(&cfid->refcount);
+ spin_unlock(&cfids->cfid_list_lock);
+ return cfid;
+ }
+ }
+ if (lookup_only) {
+ spin_unlock(&cfids->cfid_list_lock);
+ return NULL;
+ }
+ if (cfids->num_entries >= MAX_CACHED_FIDS) {
+ spin_unlock(&cfids->cfid_list_lock);
+ return NULL;
+ }
+ cfid = init_cached_dir(path);
+ if (cfid == NULL) {
+ spin_unlock(&cfids->cfid_list_lock);
+ return NULL;
+ }
+ cfid->cfids = cfids;
+ cfids->num_entries++;
+ list_add(&cfid->entry, &cfids->entries);
+ cfid->on_list = true;
+ kref_get(&cfid->refcount);
+ spin_unlock(&cfids->cfid_list_lock);
+ return cfid;
+}
+
+static struct dentry *
+path_to_dentry(struct cifs_sb_info *cifs_sb, const char *path)
+{
+ struct dentry *dentry;
+ const char *s, *p;
+ char sep;
+
+ sep = CIFS_DIR_SEP(cifs_sb);
+ dentry = dget(cifs_sb->root);
+ s = path;
+
+ do {
+ struct inode *dir = d_inode(dentry);
+ struct dentry *child;
+
+ if (!S_ISDIR(dir->i_mode)) {
+ dput(dentry);
+ dentry = ERR_PTR(-ENOTDIR);
+ break;
+ }
+
+ /* skip separators */
+ while (*s == sep)
+ s++;
+ if (!*s)
+ break;
+ p = s++;
+ /* next separator */
+ while (*s && *s != sep)
+ s++;
+
+ child = lookup_positive_unlocked(p, dentry, s - p);
+ dput(dentry);
+ dentry = child;
+ } while (!IS_ERR(dentry));
+ return dentry;
+}
+
/*
* Open the and cache a directory handle.
* If error then *cfid is not initialized.
@@ -31,54 +118,57 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
struct kvec qi_iov[1];
int rc, flags = 0;
- __le16 utf16_path = 0; /* Null - since an open of top of share */
+ __le16 *utf16_path = NULL;
u8 oplock = SMB2_OPLOCK_LEVEL_II;
struct cifs_fid *pfid;
- struct dentry *dentry;
+ struct dentry *dentry = NULL;
struct cached_fid *cfid;
+ struct cached_fids *cfids;
- if (tcon == NULL || tcon->nohandlecache ||
+ if (tcon == NULL || tcon->cfids == NULL || tcon->nohandlecache ||
is_smb1_server(tcon->ses->server))
return -EOPNOTSUPP;
ses = tcon->ses;
server = ses->server;
+ cfids = tcon->cfids;
- if (cifs_sb->root == NULL)
- return -ENOENT;
+ if (!server->ops->new_lease_key)
+ return -EIO;
- if (strlen(path))
+ if (cifs_sb->root == NULL)
return -ENOENT;
- dentry = cifs_sb->root;
+ utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
+ if (!utf16_path)
+ return -ENOMEM;
- cfid = tcon->cfid;
- mutex_lock(&cfid->fid_mutex);
- if (cfid->is_valid) {
- cifs_dbg(FYI, "found a cached root file handle\n");
+ cfid = find_or_create_cached_dir(cfids, path, lookup_only);
+ if (cfid == NULL) {
+ kfree(utf16_path);
+ return -ENOENT;
+ }
+ /*
+ * At this point we either have a lease already and we can just
+ * return it. If not we are guaranteed to be the only thread accessing
+ * this cfid.
+ */
+ if (cfid->has_lease) {
*ret_cfid = cfid;
- kref_get(&cfid->refcount);
- mutex_unlock(&cfid->fid_mutex);
+ kfree(utf16_path);
return 0;
}
/*
* We do not hold the lock for the open because in case
- * SMB2_open needs to reconnect, it will end up calling
- * cifs_mark_open_files_invalid() which takes the lock again
- * thus causing a deadlock
+ * SMB2_open needs to reconnect.
+ * This is safe because no other thread will be able to get a ref
+ * to the cfid until we have finished opening the file and (possibly)
+ * acquired a lease.
*/
- mutex_unlock(&cfid->fid_mutex);
-
- if (lookup_only)
- return -ENOENT;
-
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- if (!server->ops->new_lease_key)
- return -EIO;
-
pfid = &cfid->fid;
server->ops->new_lease_key(pfid);
@@ -99,7 +189,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
oparms.reconnect = false;
rc = SMB2_open_init(tcon, server,
- &rqst[0], &oplock, &oparms, &utf16_path);
+ &rqst[0], &oplock, &oparms, utf16_path);
if (rc)
goto oshr_free;
smb2_set_next_command(tcon, &rqst[0]);
@@ -122,47 +212,13 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
rc = compound_send_recv(xid, ses, server,
flags, 2, rqst,
resp_buftype, rsp_iov);
- mutex_lock(&cfid->fid_mutex);
-
- /*
- * Now we need to check again as the cached root might have
- * been successfully re-opened from a concurrent process
- */
-
- if (cfid->is_valid) {
- /* work was already done */
-
- /* stash fids for close() later */
- struct cifs_fid fid = {
- .persistent_fid = pfid->persistent_fid,
- .volatile_fid = pfid->volatile_fid,
- };
-
- /*
- * caller expects this func to set the fid in cfid to valid
- * cached root, so increment the refcount.
- */
- kref_get(&cfid->refcount);
-
- mutex_unlock(&cfid->fid_mutex);
-
- if (rc == 0) {
- /* close extra handle outside of crit sec */
- SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
- }
- rc = 0;
- goto oshr_free;
- }
-
- /* Cached root is still invalid, continue normaly */
-
if (rc) {
if (rc == -EREMCHG) {
tcon->need_reconnect = true;
pr_warn_once("server share %s deleted\n",
- tcon->treeName);
+ tcon->tree_name);
}
- goto oshr_exit;
+ goto oshr_free;
}
atomic_inc(&tcon->num_remote_opens);
@@ -174,30 +230,18 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId);
#endif /* CIFS_DEBUG2 */
- cfid->tcon = tcon;
- cfid->is_valid = true;
- cfid->dentry = dentry;
- dget(dentry);
- kref_init(&cfid->refcount);
+ if (o_rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE)
+ goto oshr_free;
- /* BB TBD check to see if oplock level check can be removed below */
- if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) {
- /*
- * See commit 2f94a3125b87. Increment the refcount when we
- * get a lease for root, release it if lease break occurs
- */
- kref_get(&cfid->refcount);
- cfid->has_lease = true;
- smb2_parse_contexts(server, o_rsp,
- &oparms.fid->epoch,
- oparms.fid->lease_key, &oplock,
- NULL, NULL);
- } else
- goto oshr_exit;
+
+ smb2_parse_contexts(server, o_rsp,
+ &oparms.fid->epoch,
+ oparms.fid->lease_key, &oplock,
+ NULL, NULL);
qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info))
- goto oshr_exit;
+ goto oshr_free;
if (!smb2_validate_and_copy_iov(
le16_to_cpu(qi_rsp->OutputBufferOffset),
sizeof(struct smb2_file_all_info),
@@ -205,15 +249,42 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
(char *)&cfid->file_all_info))
cfid->file_all_info_is_valid = true;
+ if (!path[0])
+ dentry = dget(cifs_sb->root);
+ else {
+ dentry = path_to_dentry(cifs_sb, path);
+ if (IS_ERR(dentry)) {
+ rc = -ENOENT;
+ goto oshr_free;
+ }
+ }
+ cfid->dentry = dentry;
+ cfid->tcon = tcon;
cfid->time = jiffies;
+ cfid->is_open = true;
+ cfid->has_lease = true;
-oshr_exit:
- mutex_unlock(&cfid->fid_mutex);
oshr_free:
+ kfree(utf16_path);
SMB2_open_free(&rqst[0]);
SMB2_query_info_free(&rqst[1]);
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+ spin_lock(&cfids->cfid_list_lock);
+ if (!cfid->has_lease) {
+ if (cfid->on_list) {
+ list_del(&cfid->entry);
+ cfid->on_list = false;
+ cfids->num_entries--;
+ }
+ rc = -ENOENT;
+ }
+ spin_unlock(&cfids->cfid_list_lock);
+ if (rc) {
+ free_cached_dir(cfid);
+ cfid = NULL;
+ }
+
if (rc == 0)
*ret_cfid = cfid;
@@ -225,18 +296,22 @@ int open_cached_dir_by_dentry(struct cifs_tcon *tcon,
struct cached_fid **ret_cfid)
{
struct cached_fid *cfid;
+ struct cached_fids *cfids = tcon->cfids;
- cfid = tcon->cfid;
+ if (cfids == NULL)
+ return -ENOENT;
- mutex_lock(&cfid->fid_mutex);
- if (cfid->dentry == dentry) {
- cifs_dbg(FYI, "found a cached root file handle by dentry\n");
- *ret_cfid = cfid;
- kref_get(&cfid->refcount);
- mutex_unlock(&cfid->fid_mutex);
- return 0;
+ spin_lock(&cfids->cfid_list_lock);
+ list_for_each_entry(cfid, &cfids->entries, entry) {
+ if (dentry && cfid->dentry == dentry) {
+ cifs_dbg(FYI, "found a cached root file handle by dentry\n");
+ kref_get(&cfid->refcount);
+ *ret_cfid = cfid;
+ spin_unlock(&cfids->cfid_list_lock);
+ return 0;
+ }
}
- mutex_unlock(&cfid->fid_mutex);
+ spin_unlock(&cfids->cfid_list_lock);
return -ENOENT;
}
@@ -245,63 +320,50 @@ smb2_close_cached_fid(struct kref *ref)
{
struct cached_fid *cfid = container_of(ref, struct cached_fid,
refcount);
- struct cached_dirent *dirent, *q;
- if (cfid->is_valid) {
- cifs_dbg(FYI, "clear cached root file handle\n");
- SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid,
- cfid->fid.volatile_fid);
+ spin_lock(&cfid->cfids->cfid_list_lock);
+ if (cfid->on_list) {
+ list_del(&cfid->entry);
+ cfid->on_list = false;
+ cfid->cfids->num_entries--;
}
+ spin_unlock(&cfid->cfids->cfid_list_lock);
- /*
- * We only check validity above to send SMB2_close,
- * but we still need to invalidate these entries
- * when this function is called
- */
- cfid->is_valid = false;
- cfid->file_all_info_is_valid = false;
- cfid->has_lease = false;
- if (cfid->dentry) {
- dput(cfid->dentry);
- cfid->dentry = NULL;
- }
- /*
- * Delete all cached dirent names
- */
- mutex_lock(&cfid->dirents.de_mutex);
- list_for_each_entry_safe(dirent, q, &cfid->dirents.entries, entry) {
- list_del(&dirent->entry);
- kfree(dirent->name);
- kfree(dirent);
+ dput(cfid->dentry);
+ cfid->dentry = NULL;
+
+ if (cfid->is_open) {
+ SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid,
+ cfid->fid.volatile_fid);
}
- cfid->dirents.is_valid = 0;
- cfid->dirents.is_failed = 0;
- cfid->dirents.ctx = NULL;
- cfid->dirents.pos = 0;
- mutex_unlock(&cfid->dirents.de_mutex);
+ free_cached_dir(cfid);
}
-void close_cached_dir(struct cached_fid *cfid)
+void drop_cached_dir_by_name(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *name, struct cifs_sb_info *cifs_sb)
{
- mutex_lock(&cfid->fid_mutex);
- kref_put(&cfid->refcount, smb2_close_cached_fid);
- mutex_unlock(&cfid->fid_mutex);
-}
+ struct cached_fid *cfid = NULL;
+ int rc;
-void close_cached_dir_lease_locked(struct cached_fid *cfid)
-{
+ rc = open_cached_dir(xid, tcon, name, cifs_sb, true, &cfid);
+ if (rc) {
+ cifs_dbg(FYI, "no cached dir found for rmdir(%s)\n", name);
+ return;
+ }
+ spin_lock(&cfid->cfids->cfid_list_lock);
if (cfid->has_lease) {
cfid->has_lease = false;
kref_put(&cfid->refcount, smb2_close_cached_fid);
}
+ spin_unlock(&cfid->cfids->cfid_list_lock);
+ close_cached_dir(cfid);
}
-void close_cached_dir_lease(struct cached_fid *cfid)
+
+void close_cached_dir(struct cached_fid *cfid)
{
- mutex_lock(&cfid->fid_mutex);
- close_cached_dir_lease_locked(cfid);
- mutex_unlock(&cfid->fid_mutex);
+ kref_put(&cfid->refcount, smb2_close_cached_fid);
}
/*
@@ -314,34 +376,60 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb)
struct cached_fid *cfid;
struct cifs_tcon *tcon;
struct tcon_link *tlink;
+ struct cached_fids *cfids;
for (node = rb_first(root); node; node = rb_next(node)) {
tlink = rb_entry(node, struct tcon_link, tl_rbnode);
tcon = tlink_tcon(tlink);
if (IS_ERR(tcon))
continue;
- cfid = tcon->cfid;
- mutex_lock(&cfid->fid_mutex);
- if (cfid->dentry) {
+ cfids = tcon->cfids;
+ if (cfids == NULL)
+ continue;
+ list_for_each_entry(cfid, &cfids->entries, entry) {
dput(cfid->dentry);
cfid->dentry = NULL;
}
- mutex_unlock(&cfid->fid_mutex);
}
}
/*
- * Invalidate and close all cached dirs when a TCON has been reset
+ * Invalidate all cached dirs when a TCON has been reset
* due to a session loss.
*/
void invalidate_all_cached_dirs(struct cifs_tcon *tcon)
{
- mutex_lock(&tcon->cfid->fid_mutex);
- tcon->cfid->is_valid = false;
- /* cached handle is not valid, so SMB2_CLOSE won't be sent below */
- close_cached_dir_lease_locked(tcon->cfid);
- memset(&tcon->cfid->fid, 0, sizeof(struct cifs_fid));
- mutex_unlock(&tcon->cfid->fid_mutex);
+ struct cached_fids *cfids = tcon->cfids;
+ struct cached_fid *cfid, *q;
+ LIST_HEAD(entry);
+
+ spin_lock(&cfids->cfid_list_lock);
+ list_for_each_entry_safe(cfid, q, &cfids->entries, entry) {
+ list_move(&cfid->entry, &entry);
+ cfids->num_entries--;
+ cfid->is_open = false;
+ cfid->on_list = false;
+ /* To prevent race with smb2_cached_lease_break() */
+ kref_get(&cfid->refcount);
+ }
+ spin_unlock(&cfids->cfid_list_lock);
+
+ list_for_each_entry_safe(cfid, q, &entry, entry) {
+ list_del(&cfid->entry);
+ cancel_work_sync(&cfid->lease_break);
+ if (cfid->has_lease) {
+ /*
+ * We lease was never cancelled from the server so we
+ * need to drop the reference.
+ */
+ spin_lock(&cfids->cfid_list_lock);
+ cfid->has_lease = false;
+ spin_unlock(&cfids->cfid_list_lock);
+ kref_put(&cfid->refcount, smb2_close_cached_fid);
+ }
+ /* Drop the extra reference opened above*/
+ kref_put(&cfid->refcount, smb2_close_cached_fid);
+ }
}
static void
@@ -350,39 +438,121 @@ smb2_cached_lease_break(struct work_struct *work)
struct cached_fid *cfid = container_of(work,
struct cached_fid, lease_break);
- close_cached_dir_lease(cfid);
+ spin_lock(&cfid->cfids->cfid_list_lock);
+ cfid->has_lease = false;
+ spin_unlock(&cfid->cfids->cfid_list_lock);
+ kref_put(&cfid->refcount, smb2_close_cached_fid);
}
int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16])
{
- if (tcon->cfid->is_valid &&
- !memcmp(lease_key,
- tcon->cfid->fid.lease_key,
- SMB2_LEASE_KEY_SIZE)) {
- tcon->cfid->time = 0;
- INIT_WORK(&tcon->cfid->lease_break,
- smb2_cached_lease_break);
- queue_work(cifsiod_wq,
- &tcon->cfid->lease_break);
- return true;
+ struct cached_fids *cfids = tcon->cfids;
+ struct cached_fid *cfid;
+
+ if (cfids == NULL)
+ return false;
+
+ spin_lock(&cfids->cfid_list_lock);
+ list_for_each_entry(cfid, &cfids->entries, entry) {
+ if (cfid->has_lease &&
+ !memcmp(lease_key,
+ cfid->fid.lease_key,
+ SMB2_LEASE_KEY_SIZE)) {
+ cfid->time = 0;
+ /*
+ * We found a lease remove it from the list
+ * so no threads can access it.
+ */
+ list_del(&cfid->entry);
+ cfid->on_list = false;
+ cfids->num_entries--;
+
+ queue_work(cifsiod_wq,
+ &cfid->lease_break);
+ spin_unlock(&cfids->cfid_list_lock);
+ return true;
+ }
}
+ spin_unlock(&cfids->cfid_list_lock);
return false;
}
-struct cached_fid *init_cached_dir(void)
+static struct cached_fid *init_cached_dir(const char *path)
{
struct cached_fid *cfid;
- cfid = kzalloc(sizeof(*cfid), GFP_KERNEL);
+ cfid = kzalloc(sizeof(*cfid), GFP_ATOMIC);
if (!cfid)
return NULL;
+ cfid->path = kstrdup(path, GFP_ATOMIC);
+ if (!cfid->path) {
+ kfree(cfid);
+ return NULL;
+ }
+
+ INIT_WORK(&cfid->lease_break, smb2_cached_lease_break);
+ INIT_LIST_HEAD(&cfid->entry);
INIT_LIST_HEAD(&cfid->dirents.entries);
mutex_init(&cfid->dirents.de_mutex);
- mutex_init(&cfid->fid_mutex);
+ spin_lock_init(&cfid->fid_lock);
+ kref_init(&cfid->refcount);
return cfid;
}
-void free_cached_dir(struct cifs_tcon *tcon)
+static void free_cached_dir(struct cached_fid *cfid)
+{
+ struct cached_dirent *dirent, *q;
+
+ dput(cfid->dentry);
+ cfid->dentry = NULL;
+
+ /*
+ * Delete all cached dirent names
+ */
+ list_for_each_entry_safe(dirent, q, &cfid->dirents.entries, entry) {
+ list_del(&dirent->entry);
+ kfree(dirent->name);
+ kfree(dirent);
+ }
+
+ kfree(cfid->path);
+ cfid->path = NULL;
+ kfree(cfid);
+}
+
+struct cached_fids *init_cached_dirs(void)
+{
+ struct cached_fids *cfids;
+
+ cfids = kzalloc(sizeof(*cfids), GFP_KERNEL);
+ if (!cfids)
+ return NULL;
+ spin_lock_init(&cfids->cfid_list_lock);
+ INIT_LIST_HEAD(&cfids->entries);
+ return cfids;
+}
+
+/*
+ * Called from tconInfoFree when we are tearing down the tcon.
+ * There are no active users or open files/directories at this point.
+ */
+void free_cached_dirs(struct cached_fids *cfids)
{
- kfree(tcon->cfid);
+ struct cached_fid *cfid, *q;
+ LIST_HEAD(entry);
+
+ spin_lock(&cfids->cfid_list_lock);
+ list_for_each_entry_safe(cfid, q, &cfids->entries, entry) {
+ cfid->on_list = false;
+ cfid->is_open = false;
+ list_move(&cfid->entry, &entry);
+ }
+ spin_unlock(&cfids->cfid_list_lock);
+
+ list_for_each_entry_safe(cfid, q, &entry, entry) {
+ list_del(&cfid->entry);
+ free_cached_dir(cfid);
+ }
+
+ kfree(cfids);
}
diff --git a/fs/cifs/cached_dir.h b/fs/cifs/cached_dir.h
index bd262dc8b179..2f4e764c9ca9 100644
--- a/fs/cifs/cached_dir.h
+++ b/fs/cifs/cached_dir.h
@@ -31,13 +31,17 @@ struct cached_dirents {
};
struct cached_fid {
- bool is_valid:1; /* Do we have a useable root fid */
- bool file_all_info_is_valid:1;
+ struct list_head entry;
+ struct cached_fids *cfids;
+ const char *path;
bool has_lease:1;
+ bool is_open:1;
+ bool on_list:1;
+ bool file_all_info_is_valid:1;
unsigned long time; /* jiffies of when lease was taken */
struct kref refcount;
struct cifs_fid fid;
- struct mutex fid_mutex;
+ spinlock_t fid_lock;
struct cifs_tcon *tcon;
struct dentry *dentry;
struct work_struct lease_break;
@@ -45,8 +49,18 @@ struct cached_fid {
struct cached_dirents dirents;
};
-extern struct cached_fid *init_cached_dir(void);
-extern void free_cached_dir(struct cifs_tcon *tcon);
+#define MAX_CACHED_FIDS 16
+struct cached_fids {
+ /* Must be held when:
+ * - accessing the cfids->entries list
+ */
+ spinlock_t cfid_list_lock;
+ int num_entries;
+ struct list_head entries;
+};
+
+extern struct cached_fids *init_cached_dirs(void);
+extern void free_cached_dirs(struct cached_fids *cfids);
extern int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
const char *path,
struct cifs_sb_info *cifs_sb,
@@ -55,8 +69,10 @@ extern int open_cached_dir_by_dentry(struct cifs_tcon *tcon,
struct dentry *dentry,
struct cached_fid **cfid);
extern void close_cached_dir(struct cached_fid *cfid);
-extern void close_cached_dir_lease(struct cached_fid *cfid);
-extern void close_cached_dir_lease_locked(struct cached_fid *cfid);
+extern void drop_cached_dir_by_name(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ const char *name,
+ struct cifs_sb_info *cifs_sb);
extern void close_all_cached_dirs(struct cifs_sb_info *cifs_sb);
extern void invalidate_all_cached_dirs(struct cifs_tcon *tcon);
extern int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]);
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index c05477e28cff..90850da390ae 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -87,7 +87,7 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon)
{
__u32 dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
- seq_printf(m, "%s Mounts: %d ", tcon->treeName, tcon->tc_count);
+ seq_printf(m, "%s Mounts: %d ", tcon->tree_name, tcon->tc_count);
if (tcon->nativeFileSystem)
seq_printf(m, "Type: %s ", tcon->nativeFileSystem);
seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x\n\tPathComponentMax: %d Status: %d",
@@ -601,7 +601,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
i++;
- seq_printf(m, "\n%d) %s", i, tcon->treeName);
+ seq_printf(m, "\n%d) %s", i, tcon->tree_name);
if (tcon->need_reconnect)
seq_puts(m, "\tDISCONNECTED ");
seq_printf(m, "\nSMBs: %d",
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index ee4ea2b60c0f..d44808263cfb 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -108,8 +108,8 @@ do { \
#define cifs_tcon_dbg_func(ratefunc, type, fmt, ...) \
do { \
const char *tn = ""; \
- if (tcon && tcon->treeName) \
- tn = tcon->treeName; \
+ if (tcon && tcon->tree_name) \
+ tn = tcon->tree_name; \
if ((type) & FYI && cifsFYI & CIFS_INFO) { \
pr_debug_ ## ratefunc("%s: %s " fmt, \
__FILE__, tn, ##__VA_ARGS__); \
@@ -150,7 +150,7 @@ do { \
#define cifs_tcon_dbg(type, fmt, ...) \
do { \
if (0) \
- pr_debug("%s " fmt, tcon->treeName, ##__VA_ARGS__); \
+ pr_debug("%s " fmt, tcon->tree_name, ##__VA_ARGS__); \
} while (0)
#define cifs_info(fmt, ...) \
diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h
index b87cbbe6d2d4..332588e77c31 100644
--- a/fs/cifs/cifs_ioctl.h
+++ b/fs/cifs/cifs_ioctl.h
@@ -91,6 +91,13 @@ struct smb3_notify {
bool watch_tree;
} __packed;
+struct smb3_notify_info {
+ __u32 completion_filter;
+ bool watch_tree;
+ __u32 data_len; /* size of notify data below */
+ __u8 notify_data[];
+} __packed;
+
#define CIFS_IOCTL_MAGIC 0xCF
#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int)
#define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4)
@@ -100,7 +107,8 @@ struct smb3_notify {
#define CIFS_DUMP_KEY _IOWR(CIFS_IOCTL_MAGIC, 8, struct smb3_key_debug_info)
#define CIFS_IOC_NOTIFY _IOW(CIFS_IOCTL_MAGIC, 9, struct smb3_notify)
#define CIFS_DUMP_FULL_KEY _IOWR(CIFS_IOCTL_MAGIC, 10, struct smb3_full_key_debug_info)
-#define CIFS_IOC_SHUTDOWN _IOR ('X', 125, __u32)
+#define CIFS_IOC_NOTIFY_INFO _IOWR(CIFS_IOCTL_MAGIC, 11, struct smb3_notify_info)
+#define CIFS_IOC_SHUTDOWN _IOR('X', 125, __u32)
/*
* Flags for going down operation
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 342717bf1dc2..6f3285f1dfee 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -189,7 +189,7 @@ init_cifs_spnego(void)
* spnego upcalls.
*/
- cred = prepare_kernel_cred(NULL);
+ cred = prepare_kernel_cred(&init_task);
if (!cred)
return -ENOMEM;
diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c
index 1e4c7cc5287f..7233c6a7e6d7 100644
--- a/fs/cifs/cifs_swn.c
+++ b/fs/cifs/cifs_swn.c
@@ -256,23 +256,23 @@ static struct cifs_swn_reg *cifs_find_swn_reg(struct cifs_tcon *tcon)
const char *share_name;
const char *net_name;
- net_name = extract_hostname(tcon->treeName);
+ net_name = extract_hostname(tcon->tree_name);
if (IS_ERR(net_name)) {
int ret;
ret = PTR_ERR(net_name);
cifs_dbg(VFS, "%s: failed to extract host name from target '%s': %d\n",
- __func__, tcon->treeName, ret);
+ __func__, tcon->tree_name, ret);
return ERR_PTR(-EINVAL);
}
- share_name = extract_sharename(tcon->treeName);
+ share_name = extract_sharename(tcon->tree_name);
if (IS_ERR(share_name)) {
int ret;
ret = PTR_ERR(share_name);
cifs_dbg(VFS, "%s: failed to extract share name from target '%s': %d\n",
- __func__, tcon->treeName, ret);
+ __func__, tcon->tree_name, ret);
kfree(net_name);
return ERR_PTR(-EINVAL);
}
@@ -335,14 +335,14 @@ static struct cifs_swn_reg *cifs_get_swn_reg(struct cifs_tcon *tcon)
goto fail;
}
- reg->net_name = extract_hostname(tcon->treeName);
+ reg->net_name = extract_hostname(tcon->tree_name);
if (IS_ERR(reg->net_name)) {
ret = PTR_ERR(reg->net_name);
cifs_dbg(VFS, "%s: failed to extract host name from target: %d\n", __func__, ret);
goto fail_idr;
}
- reg->share_name = extract_sharename(tcon->treeName);
+ reg->share_name = extract_sharename(tcon->tree_name);
if (IS_ERR(reg->share_name)) {
ret = PTR_ERR(reg->share_name);
cifs_dbg(VFS, "%s: failed to extract share name from target: %d\n", __func__, ret);
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index fa480d62f313..bbf58c2439da 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -13,6 +13,9 @@
#include <linux/string.h>
#include <linux/keyctl.h>
#include <linux/key-type.h>
+#include <uapi/linux/posix_acl.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
#include <keys/user-type.h>
#include "cifspdu.h"
#include "cifsglob.h"
@@ -20,6 +23,8 @@
#include "cifsproto.h"
#include "cifs_debug.h"
#include "fs_context.h"
+#include "cifs_fs_sb.h"
+#include "cifs_unicode.h"
/* security id for everyone/world system group */
static const struct cifs_sid sid_everyone = {
@@ -465,7 +470,7 @@ init_cifs_idmap(void)
* this is used to prevent malicious redirections from being installed
* with add_key().
*/
- cred = prepare_kernel_cred(NULL);
+ cred = prepare_kernel_cred(&init_task);
if (!cred)
return -ENOMEM;
@@ -1668,3 +1673,137 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode,
kfree(pntsd);
return rc;
}
+
+struct posix_acl *cifs_get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, int type)
+{
+#if defined(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) && defined(CONFIG_CIFS_POSIX)
+ struct posix_acl *acl = NULL;
+ ssize_t rc = -EOPNOTSUPP;
+ unsigned int xid;
+ struct super_block *sb = dentry->d_sb;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ struct tcon_link *tlink;
+ struct cifs_tcon *pTcon;
+ const char *full_path;
+ void *page;
+
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink))
+ return ERR_CAST(tlink);
+ pTcon = tlink_tcon(tlink);
+
+ xid = get_xid();
+ page = alloc_dentry_path();
+
+ full_path = build_path_from_dentry(dentry, page);
+ if (IS_ERR(full_path)) {
+ acl = ERR_CAST(full_path);
+ goto out;
+ }
+
+ /* return alt name if available as pseudo attr */
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ if (sb->s_flags & SB_POSIXACL)
+ rc = cifs_do_get_acl(xid, pTcon, full_path, &acl,
+ ACL_TYPE_ACCESS,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ if (sb->s_flags & SB_POSIXACL)
+ rc = cifs_do_get_acl(xid, pTcon, full_path, &acl,
+ ACL_TYPE_DEFAULT,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+ break;
+ }
+
+ if (rc < 0) {
+ if (rc == -EINVAL)
+ acl = ERR_PTR(-EOPNOTSUPP);
+ else
+ acl = ERR_PTR(rc);
+ }
+
+out:
+ free_dentry_path(page);
+ free_xid(xid);
+ cifs_put_tlink(tlink);
+ return acl;
+#else
+ return ERR_PTR(-EOPNOTSUPP);
+#endif
+}
+
+int cifs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct posix_acl *acl, int type)
+{
+#if defined(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) && defined(CONFIG_CIFS_POSIX)
+ int rc = -EOPNOTSUPP;
+ unsigned int xid;
+ struct super_block *sb = dentry->d_sb;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ struct tcon_link *tlink;
+ struct cifs_tcon *pTcon;
+ const char *full_path;
+ void *page;
+
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink))
+ return PTR_ERR(tlink);
+ pTcon = tlink_tcon(tlink);
+
+ xid = get_xid();
+ page = alloc_dentry_path();
+
+ full_path = build_path_from_dentry(dentry, page);
+ if (IS_ERR(full_path)) {
+ rc = PTR_ERR(full_path);
+ goto out;
+ }
+
+ if (!acl)
+ goto out;
+
+ /* return dos attributes as pseudo xattr */
+ /* return alt name if available as pseudo attr */
+
+ /* if proc/fs/cifs/streamstoxattr is set then
+ search server for EAs or streams to
+ returns as xattrs */
+ if (posix_acl_xattr_size(acl->a_count) > CIFSMaxBufSize) {
+ cifs_dbg(FYI, "size of EA value too large\n");
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ if (sb->s_flags & SB_POSIXACL)
+ rc = cifs_do_set_acl(xid, pTcon, full_path, acl,
+ ACL_TYPE_ACCESS,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ if (sb->s_flags & SB_POSIXACL)
+ rc = cifs_do_set_acl(xid, pTcon, full_path, acl,
+ ACL_TYPE_DEFAULT,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+ break;
+ }
+
+out:
+ free_dentry_path(page);
+ free_xid(xid);
+ cifs_put_tlink(tlink);
+ return rc;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 46f5718754f9..5db73c0f792a 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -103,26 +103,24 @@ static int cifs_calc_signature(struct smb_rqst *rqst,
if (!rqst->rq_iov || !signature || !server)
return -EINVAL;
- rc = cifs_alloc_hash("md5", &server->secmech.md5,
- &server->secmech.sdescmd5);
+ rc = cifs_alloc_hash("md5", &server->secmech.md5);
if (rc)
return -1;
- rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
+ rc = crypto_shash_init(server->secmech.md5);
if (rc) {
cifs_dbg(VFS, "%s: Could not init md5\n", __func__);
return rc;
}
- rc = crypto_shash_update(&server->secmech.sdescmd5->shash,
+ rc = crypto_shash_update(server->secmech.md5,
server->session_key.response, server->session_key.len);
if (rc) {
cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
return rc;
}
- return __cifs_calc_signature(rqst, server, signature,
- &server->secmech.sdescmd5->shash);
+ return __cifs_calc_signature(rqst, server, signature, server->secmech.md5);
}
/* must be called with server->srv_mutex held */
@@ -412,7 +410,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
wchar_t *domain;
wchar_t *server;
- if (!ses->server->secmech.sdeschmacmd5) {
+ if (!ses->server->secmech.hmacmd5) {
cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__);
return -1;
}
@@ -420,14 +418,14 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
/* calculate md4 hash of password */
E_md4hash(ses->password, nt_hash, nls_cp);
- rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash,
+ rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm, nt_hash,
CIFS_NTHASH_SIZE);
if (rc) {
cifs_dbg(VFS, "%s: Could not set NT Hash as a key\n", __func__);
return rc;
}
- rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
+ rc = crypto_shash_init(ses->server->secmech.hmacmd5);
if (rc) {
cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__);
return rc;
@@ -448,7 +446,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
memset(user, '\0', 2);
}
- rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+ rc = crypto_shash_update(ses->server->secmech.hmacmd5,
(char *)user, 2 * len);
kfree(user);
if (rc) {
@@ -468,7 +466,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
len = cifs_strtoUTF16((__le16 *)domain, ses->domainName, len,
nls_cp);
rc =
- crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+ crypto_shash_update(ses->server->secmech.hmacmd5,
(char *)domain, 2 * len);
kfree(domain);
if (rc) {
@@ -488,7 +486,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
len = cifs_strtoUTF16((__le16 *)server, ses->ip_addr, len,
nls_cp);
rc =
- crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+ crypto_shash_update(ses->server->secmech.hmacmd5,
(char *)server, 2 * len);
kfree(server);
if (rc) {
@@ -498,7 +496,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
}
}
- rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
+ rc = crypto_shash_final(ses->server->secmech.hmacmd5,
ntlmv2_hash);
if (rc)
cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
@@ -518,12 +516,12 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
hash_len = ses->auth_key.len - (CIFS_SESS_KEY_SIZE +
offsetof(struct ntlmv2_resp, challenge.key[0]));
- if (!ses->server->secmech.sdeschmacmd5) {
+ if (!ses->server->secmech.hmacmd5) {
cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__);
return -1;
}
- rc = crypto_shash_setkey(ses->server->secmech.hmacmd5,
+ rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm,
ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
if (rc) {
cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n",
@@ -531,7 +529,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
return rc;
}
- rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
+ rc = crypto_shash_init(ses->server->secmech.hmacmd5);
if (rc) {
cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__);
return rc;
@@ -543,7 +541,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
else
memcpy(ntlmv2->challenge.key,
ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
- rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+ rc = crypto_shash_update(ses->server->secmech.hmacmd5,
ntlmv2->challenge.key, hash_len);
if (rc) {
cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
@@ -551,7 +549,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
}
/* Note that the MD5 digest over writes anon.challenge_key.key */
- rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
+ rc = crypto_shash_final(ses->server->secmech.hmacmd5,
ntlmv2->ntlmv2_hash);
if (rc)
cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
@@ -627,9 +625,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
cifs_server_lock(ses->server);
- rc = cifs_alloc_hash("hmac(md5)",
- &ses->server->secmech.hmacmd5,
- &ses->server->secmech.sdeschmacmd5);
+ rc = cifs_alloc_hash("hmac(md5)", &ses->server->secmech.hmacmd5);
if (rc) {
goto unlock;
}
@@ -649,7 +645,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
}
/* now calculate the session key for NTLMv2 */
- rc = crypto_shash_setkey(ses->server->secmech.hmacmd5,
+ rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm,
ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
if (rc) {
cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n",
@@ -657,13 +653,13 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
goto unlock;
}
- rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
+ rc = crypto_shash_init(ses->server->secmech.hmacmd5);
if (rc) {
cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__);
goto unlock;
}
- rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+ rc = crypto_shash_update(ses->server->secmech.hmacmd5,
ntlmv2->ntlmv2_hash,
CIFS_HMAC_MD5_HASH_SIZE);
if (rc) {
@@ -671,7 +667,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
goto unlock;
}
- rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
+ rc = crypto_shash_final(ses->server->secmech.hmacmd5,
ses->auth_key.response);
if (rc)
cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
@@ -679,7 +675,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
unlock:
cifs_server_unlock(ses->server);
setup_ntlmv2_rsp_ret:
- kfree(tiblob);
+ kfree_sensitive(tiblob);
return rc;
}
@@ -718,49 +714,19 @@ calc_seckey(struct cifs_ses *ses)
void
cifs_crypto_secmech_release(struct TCP_Server_Info *server)
{
- if (server->secmech.cmacaes) {
- crypto_free_shash(server->secmech.cmacaes);
- server->secmech.cmacaes = NULL;
- }
-
- if (server->secmech.hmacsha256) {
- crypto_free_shash(server->secmech.hmacsha256);
- server->secmech.hmacsha256 = NULL;
- }
-
- if (server->secmech.md5) {
- crypto_free_shash(server->secmech.md5);
- server->secmech.md5 = NULL;
- }
+ cifs_free_hash(&server->secmech.aes_cmac);
+ cifs_free_hash(&server->secmech.hmacsha256);
+ cifs_free_hash(&server->secmech.md5);
+ cifs_free_hash(&server->secmech.sha512);
+ cifs_free_hash(&server->secmech.hmacmd5);
- if (server->secmech.sha512) {
- crypto_free_shash(server->secmech.sha512);
- server->secmech.sha512 = NULL;
+ if (server->secmech.enc) {
+ crypto_free_aead(server->secmech.enc);
+ server->secmech.enc = NULL;
}
- if (server->secmech.hmacmd5) {
- crypto_free_shash(server->secmech.hmacmd5);
- server->secmech.hmacmd5 = NULL;
+ if (server->secmech.dec) {
+ crypto_free_aead(server->secmech.dec);
+ server->secmech.dec = NULL;
}
-
- if (server->secmech.ccmaesencrypt) {
- crypto_free_aead(server->secmech.ccmaesencrypt);
- server->secmech.ccmaesencrypt = NULL;
- }
-
- if (server->secmech.ccmaesdecrypt) {
- crypto_free_aead(server->secmech.ccmaesdecrypt);
- server->secmech.ccmaesdecrypt = NULL;
- }
-
- kfree(server->secmech.sdesccmacaes);
- server->secmech.sdesccmacaes = NULL;
- kfree(server->secmech.sdeschmacsha256);
- server->secmech.sdeschmacsha256 = NULL;
- kfree(server->secmech.sdeschmacmd5);
- server->secmech.sdeschmacmd5 = NULL;
- kfree(server->secmech.sdescmd5);
- server->secmech.sdescmd5 = NULL;
- kfree(server->secmech.sdescsha512);
- server->secmech.sdescsha512 = NULL;
}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8042d7280dec..914cbb9de482 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -396,6 +396,7 @@ cifs_alloc_inode(struct super_block *sb)
cifs_inode->epoch = 0;
spin_lock_init(&cifs_inode->open_file_lock);
generate_random_uuid(cifs_inode->lease_key);
+ cifs_inode->symlink_target = NULL;
/*
* Can not set i_flags here - they get immediately overwritten to zero
@@ -412,7 +413,11 @@ cifs_alloc_inode(struct super_block *sb)
static void
cifs_free_inode(struct inode *inode)
{
- kmem_cache_free(cifs_inode_cachep, CIFS_I(inode));
+ struct cifsInodeInfo *cinode = CIFS_I(inode);
+
+ if (S_ISLNK(inode->i_mode))
+ kfree(cinode->symlink_target);
+ kmem_cache_free(cifs_inode_cachep, cinode);
}
static void
@@ -673,9 +678,15 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",echo_interval=%lu",
tcon->ses->server->echo_interval / HZ);
- /* Only display max_credits if it was overridden on mount */
+ /* Only display the following if overridden on mount */
if (tcon->ses->server->max_credits != SMB2_MAX_CREDITS_AVAILABLE)
seq_printf(s, ",max_credits=%u", tcon->ses->server->max_credits);
+ if (tcon->ses->server->tcp_nodelay)
+ seq_puts(s, ",tcpnodelay");
+ if (tcon->ses->server->noautotune)
+ seq_puts(s, ",noautotune");
+ if (tcon->ses->server->noblocksnd)
+ seq_puts(s, ",noblocksend");
if (tcon->snapshot_time)
seq_printf(s, ",snapshot=%llu", tcon->snapshot_time);
@@ -1128,6 +1139,8 @@ const struct inode_operations cifs_dir_inode_ops = {
.symlink = cifs_symlink,
.mknod = cifs_mknod,
.listxattr = cifs_listxattr,
+ .get_acl = cifs_get_acl,
+ .set_acl = cifs_set_acl,
};
const struct inode_operations cifs_file_inode_ops = {
@@ -1136,8 +1149,34 @@ const struct inode_operations cifs_file_inode_ops = {
.permission = cifs_permission,
.listxattr = cifs_listxattr,
.fiemap = cifs_fiemap,
+ .get_acl = cifs_get_acl,
+ .set_acl = cifs_set_acl,
};
+const char *cifs_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *done)
+{
+ char *target_path;
+
+ target_path = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!target_path)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock(&inode->i_lock);
+ if (likely(CIFS_I(inode)->symlink_target)) {
+ strscpy(target_path, CIFS_I(inode)->symlink_target, PATH_MAX);
+ } else {
+ kfree(target_path);
+ target_path = ERR_PTR(-EOPNOTSUPP);
+ }
+ spin_unlock(&inode->i_lock);
+
+ if (!IS_ERR(target_path))
+ set_delayed_call(done, kfree_link, target_path);
+
+ return target_path;
+}
+
const struct inode_operations cifs_symlink_inode_ops = {
.get_link = cifs_get_link,
.permission = cifs_permission,
@@ -1252,7 +1291,7 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
rc = filemap_write_and_wait_range(src_inode->i_mapping, off,
off + len - 1);
if (rc)
- goto out;
+ goto unlock;
/* should we flush first and last page first */
truncate_inode_pages(&target_inode->i_data, 0);
@@ -1268,6 +1307,8 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
* that target is updated on the server
*/
CIFS_I(target_inode)->time = 0;
+
+unlock:
/* although unlocking in the reverse order from locking is not
* strictly necessary here it is a little cleaner to be consistent
*/
@@ -1297,8 +1338,11 @@ static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off,
ssize_t rc;
struct cifsFileInfo *cfile = dst_file->private_data;
- if (cfile->swapfile)
- return -EOPNOTSUPP;
+ if (cfile->swapfile) {
+ rc = -EOPNOTSUPP;
+ free_xid(xid);
+ return rc;
+ }
rc = cifs_file_copychunk_range(xid, src_file, off, dst_file, destoff,
len, flags);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 5b4a7a32bdc5..00a573e0ad0e 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -105,8 +105,8 @@ extern int cifs_lock(struct file *, int, struct file_lock *);
extern int cifs_fsync(struct file *, loff_t, loff_t, int);
extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int);
extern int cifs_flush(struct file *, fl_owner_t id);
-extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
-extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *);
+extern int cifs_file_mmap(struct file *file, struct vm_area_struct *vma);
+extern int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma);
extern const struct file_operations cifs_dir_ops;
extern int cifs_dir_open(struct inode *inode, struct file *file);
extern int cifs_readdir(struct file *file, struct dir_context *ctx);
@@ -153,6 +153,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 39
-#define CIFS_VERSION "2.39"
+#define SMB3_PRODUCT_BUILD 40
+#define CIFS_VERSION "2.40"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index ae7f571a7dba..82f2d3070c26 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -13,6 +13,8 @@
#include <linux/in6.h>
#include <linux/inet.h>
#include <linux/slab.h>
+#include <linux/scatterlist.h>
+#include <linux/mm.h>
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/utsname.h>
@@ -21,7 +23,6 @@
#include "cifs_fs_sb.h"
#include "cifsacl.h"
#include <crypto/internal/hash.h>
-#include <linux/scatterlist.h>
#include <uapi/linux/cifs/cifs_mount.h>
#include "../smbfs_common/smb2pdu.h"
#include "smb2pdu.h"
@@ -153,26 +154,16 @@ struct session_key {
char *response;
};
-/* crypto security descriptor definition */
-struct sdesc {
- struct shash_desc shash;
- char ctx[];
-};
-
/* crypto hashing related structure/fields, not specific to a sec mech */
struct cifs_secmech {
- struct crypto_shash *hmacmd5; /* hmac-md5 hash function */
- struct crypto_shash *md5; /* md5 hash function */
- struct crypto_shash *hmacsha256; /* hmac-sha256 hash function */
- struct crypto_shash *cmacaes; /* block-cipher based MAC function */
- struct crypto_shash *sha512; /* sha512 hash function */
- struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */
- struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */
- struct sdesc *sdeschmacsha256; /* ctxt to generate smb2 signature */
- struct sdesc *sdesccmacaes; /* ctxt to generate smb3 signature */
- struct sdesc *sdescsha512; /* ctxt to generate smb3.11 signing key */
- struct crypto_aead *ccmaesencrypt; /* smb3 encryption aead */
- struct crypto_aead *ccmaesdecrypt; /* smb3 decryption aead */
+ struct shash_desc *hmacmd5; /* hmacmd5 hash function, for NTLMv2/CR1 hashes */
+ struct shash_desc *md5; /* md5 hash function, for CIFS/SMB1 signatures */
+ struct shash_desc *hmacsha256; /* hmac-sha256 hash function, for SMB2 signatures */
+ struct shash_desc *sha512; /* sha512 hash function, for SMB3.1.1 preauth hash */
+ struct shash_desc *aes_cmac; /* block-cipher based MAC function, for SMB3 signatures */
+
+ struct crypto_aead *enc; /* smb3 encryption AEAD TFM (AES-CCM and AES-GCM) */
+ struct crypto_aead *dec; /* smb3 decryption AEAD TFM (AES-CCM and AES-GCM) */
};
/* per smb session structure/fields */
@@ -195,6 +186,19 @@ struct cifs_cred {
struct cifs_ace *aces;
};
+struct cifs_open_info_data {
+ char *symlink_target;
+ union {
+ struct smb2_file_all_info fi;
+ struct smb311_posix_qinfo posix_fi;
+ };
+};
+
+static inline void cifs_free_open_info(struct cifs_open_info_data *data)
+{
+ kfree(data->symlink_target);
+}
+
/*
*****************************************************************
* Except the CIFS PDUs themselves all the
@@ -317,20 +321,20 @@ struct smb_version_operations {
int (*is_path_accessible)(const unsigned int, struct cifs_tcon *,
struct cifs_sb_info *, const char *);
/* query path data from the server */
- int (*query_path_info)(const unsigned int, struct cifs_tcon *,
- struct cifs_sb_info *, const char *,
- FILE_ALL_INFO *, bool *, bool *);
+ int (*query_path_info)(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse);
/* query file data from the server */
- int (*query_file_info)(const unsigned int, struct cifs_tcon *,
- struct cifs_fid *, FILE_ALL_INFO *);
+ int (*query_file_info)(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifsFileInfo *cfile, struct cifs_open_info_data *data);
/* query reparse tag from srv to determine which type of special file */
int (*query_reparse_tag)(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *path,
__u32 *reparse_tag);
/* get server index number */
- int (*get_srv_inum)(const unsigned int, struct cifs_tcon *,
- struct cifs_sb_info *, const char *,
- u64 *uniqueid, FILE_ALL_INFO *);
+ int (*get_srv_inum)(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path, u64 *uniqueid,
+ struct cifs_open_info_data *data);
/* set size by path */
int (*set_path_size)(const unsigned int, struct cifs_tcon *,
const char *, __u64, struct cifs_sb_info *, bool);
@@ -379,8 +383,8 @@ struct smb_version_operations {
struct cifs_sb_info *, const char *,
char **, bool);
/* open a file for non-posix mounts */
- int (*open)(const unsigned int, struct cifs_open_parms *,
- __u32 *, FILE_ALL_INFO *);
+ int (*open)(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock,
+ void *buf);
/* set fid protocol-specific info */
void (*set_fid)(struct cifsFileInfo *, struct cifs_fid *, __u32);
/* close a file */
@@ -451,7 +455,7 @@ struct smb_version_operations {
int (*enum_snapshots)(const unsigned int xid, struct cifs_tcon *tcon,
struct cifsFileInfo *src_file, void __user *);
int (*notify)(const unsigned int xid, struct file *pfile,
- void __user *pbuf);
+ void __user *pbuf, bool return_changes);
int (*query_mf_symlink)(unsigned int, struct cifs_tcon *,
struct cifs_sb_info *, const unsigned char *,
char *, unsigned int *);
@@ -782,6 +786,7 @@ static inline unsigned int
in_flight(struct TCP_Server_Info *server)
{
unsigned int num;
+
spin_lock(&server->req_lock);
num = server->in_flight;
spin_unlock(&server->req_lock);
@@ -792,6 +797,7 @@ static inline bool
has_credits(struct TCP_Server_Info *server, int *credits, int num_credits)
{
int num;
+
spin_lock(&server->req_lock);
num = *credits;
spin_unlock(&server->req_lock);
@@ -1022,7 +1028,7 @@ struct cifs_ses {
struct TCP_Server_Info *server; /* pointer to server info */
int ses_count; /* reference counter */
enum ses_status_enum ses_status; /* updates protected by cifs_tcp_ses_lock */
- unsigned overrideSecFlg; /* if non-zero override global sec flags */
+ unsigned int overrideSecFlg; /* if non-zero override global sec flags */
char *serverOS; /* name of operating system underlying server */
char *serverNOS; /* name of network operating system of server */
char *serverDomain; /* security realm of server */
@@ -1133,6 +1139,7 @@ struct cifs_fattr {
struct timespec64 cf_mtime;
struct timespec64 cf_ctime;
u32 cf_cifstag;
+ char *cf_symlink_target;
};
/*
@@ -1149,7 +1156,7 @@ struct cifs_tcon {
struct list_head openFileList;
spinlock_t open_file_lock; /* protects list above */
struct cifs_ses *ses; /* pointer to session associated with */
- char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */
+ char tree_name[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */
char *nativeFileSystem;
char *password; /* for share-level security */
__u32 tid; /* The 4 byte tree id */
@@ -1228,7 +1235,7 @@ struct cifs_tcon {
struct fscache_volume *fscache; /* cookie for share */
#endif
struct list_head pending_opens; /* list of incomplete opens */
- struct cached_fid *cfid; /* Cached root fid */
+ struct cached_fids *cfids;
/* BB add field for back pointer to sb struct(s)? */
#ifdef CONFIG_CIFS_DFS_UPCALL
struct list_head ulist; /* cache update list */
@@ -1377,7 +1384,7 @@ struct cifsFileInfo {
__u32 pid; /* process id who opened file */
struct cifs_fid fid; /* file id from remote */
struct list_head rlist; /* reconnect list */
- /* BB add lock scope info here if needed */ ;
+ /* BB add lock scope info here if needed */
/* lock scope id (0 if none) */
struct dentry *dentry;
struct tcon_link *tlink;
@@ -1395,6 +1402,7 @@ struct cifsFileInfo {
struct work_struct put; /* work for the final part of _put */
struct delayed_work deferred;
bool deferred_close_scheduled; /* Flag to indicate close is scheduled */
+ char *symlink_target;
};
struct cifs_io_parms {
@@ -1553,6 +1561,7 @@ struct cifsInodeInfo {
struct list_head deferred_closes; /* list of deferred closes */
spinlock_t deferred_lock; /* protection on deferred list */
bool lease_granted; /* Flag to indicate whether lease or oplock is granted. */
+ char *symlink_target;
};
static inline struct cifsInodeInfo *
@@ -1763,6 +1772,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
int number_of_items)
{
int i;
+
if ((number_of_items == 0) || (param == NULL))
return;
for (i = 0; i < number_of_items; i++) {
@@ -2121,4 +2131,80 @@ static inline size_t ntlmssp_workstation_name_size(const struct cifs_ses *ses)
return sizeof(ses->workstation_name);
}
+static inline void move_cifs_info_to_smb2(struct smb2_file_all_info *dst, const FILE_ALL_INFO *src)
+{
+ memcpy(dst, src, (size_t)((u8 *)&src->AccessFlags - (u8 *)src));
+ dst->AccessFlags = src->AccessFlags;
+ dst->CurrentByteOffset = src->CurrentByteOffset;
+ dst->Mode = src->Mode;
+ dst->AlignmentRequirement = src->AlignmentRequirement;
+ dst->FileNameLength = src->FileNameLength;
+}
+
+static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst,
+ int num_rqst,
+ const u8 *sig)
+{
+ unsigned int len, skip;
+ unsigned int nents = 0;
+ unsigned long addr;
+ int i, j;
+
+ /* Assumes the first rqst has a transform header as the first iov.
+ * I.e.
+ * rqst[0].rq_iov[0] is transform header
+ * rqst[0].rq_iov[1+] data to be encrypted/decrypted
+ * rqst[1+].rq_iov[0+] data to be encrypted/decrypted
+ */
+ for (i = 0; i < num_rqst; i++) {
+ /*
+ * The first rqst has a transform header where the
+ * first 20 bytes are not part of the encrypted blob.
+ */
+ for (j = 0; j < rqst[i].rq_nvec; j++) {
+ struct kvec *iov = &rqst[i].rq_iov[j];
+
+ skip = (i == 0) && (j == 0) ? 20 : 0;
+ addr = (unsigned long)iov->iov_base + skip;
+ if (unlikely(is_vmalloc_addr((void *)addr))) {
+ len = iov->iov_len - skip;
+ nents += DIV_ROUND_UP(offset_in_page(addr) + len,
+ PAGE_SIZE);
+ } else {
+ nents++;
+ }
+ }
+ nents += rqst[i].rq_npages;
+ }
+ nents += DIV_ROUND_UP(offset_in_page(sig) + SMB2_SIGNATURE_SIZE, PAGE_SIZE);
+ return nents;
+}
+
+/* We can not use the normal sg_set_buf() as we will sometimes pass a
+ * stack object as buf.
+ */
+static inline struct scatterlist *cifs_sg_set_buf(struct scatterlist *sg,
+ const void *buf,
+ unsigned int buflen)
+{
+ unsigned long addr = (unsigned long)buf;
+ unsigned int off = offset_in_page(addr);
+
+ addr &= PAGE_MASK;
+ if (unlikely(is_vmalloc_addr((void *)addr))) {
+ do {
+ unsigned int len = min_t(unsigned int, buflen, PAGE_SIZE - off);
+
+ sg_set_page(sg++, vmalloc_to_page((void *)addr), len, off);
+
+ off = 0;
+ addr += PAGE_SIZE;
+ buflen -= len;
+ } while (buflen);
+ } else {
+ sg_set_page(sg++, virt_to_page(addr), buflen, off);
+ }
+ return sg;
+}
+
#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index aeba371c4c70..623caece2b10 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -483,7 +483,7 @@ put_bcc(__u16 count, struct smb_hdr *hdr)
typedef struct negotiate_req {
struct smb_hdr hdr; /* wct = 0 */
__le16 ByteCount;
- unsigned char DialectsArray[1];
+ unsigned char DialectsArray[];
} __attribute__((packed)) NEGOTIATE_REQ;
#define MIN_TZ_ADJ (15 * 60) /* minimum grid for timezones in seconds */
@@ -508,13 +508,14 @@ typedef struct negotiate_rsp {
__u8 EncryptionKeyLength;
__u16 ByteCount;
union {
- unsigned char EncryptionKey[1]; /* cap extended security off */
+ /* cap extended security off */
+ DECLARE_FLEX_ARRAY(unsigned char, EncryptionKey);
/* followed by Domain name - if extended security is off */
/* followed by 16 bytes of server GUID */
/* then security blob if cap_extended_security negotiated */
struct {
unsigned char GUID[SMB1_CLIENT_GUID_SIZE];
- unsigned char SecurityBlob[1];
+ unsigned char SecurityBlob[];
} __attribute__((packed)) extended_response;
} __attribute__((packed)) u;
} __attribute__((packed)) NEGOTIATE_RSP;
@@ -1428,7 +1429,7 @@ typedef struct smb_com_transaction_change_notify_req {
__u8 WatchTree; /* 1 = Monitor subdirectories */
__u8 Reserved2;
__le16 ByteCount;
-/* __u8 Pad[3];*/
+/* __u8 Pad[3];*/
/* __u8 Data[1];*/
} __attribute__((packed)) TRANSACT_CHANGE_NOTIFY_REQ;
@@ -1751,8 +1752,7 @@ struct smb_com_transaction2_sfi_rsp {
struct smb_hdr hdr; /* wct = 10 + SetupCount */
struct trans2_resp t2;
__u16 ByteCount;
- __u16 Reserved2; /* parameter word reserved -
- present for infolevels > 100 */
+ __u16 Reserved2; /* parameter word reserved - present for infolevels > 100 */
} __attribute__((packed));
struct smb_t2_qfi_req {
@@ -1767,8 +1767,7 @@ struct smb_t2_qfi_rsp {
struct smb_hdr hdr; /* wct = 10 + SetupCount */
struct trans2_resp t2;
__u16 ByteCount;
- __u16 Reserved2; /* parameter word reserved -
- present for infolevels > 100 */
+ __u16 Reserved2; /* parameter word reserved - present for infolevels > 100 */
} __attribute__((packed));
/*
@@ -2145,13 +2144,11 @@ typedef struct {
#define CIFS_UNIX_POSIX_PATH_OPS_CAP 0x00000020 /* Allow new POSIX path based
calls including posix open
and posix unlink */
-#define CIFS_UNIX_LARGE_READ_CAP 0x00000040 /* support reads >128K (up
- to 0xFFFF00 */
+#define CIFS_UNIX_LARGE_READ_CAP 0x00000040 /* support reads >128K (up to 0xFFFF00 */
#define CIFS_UNIX_LARGE_WRITE_CAP 0x00000080
#define CIFS_UNIX_TRANSPORT_ENCRYPTION_CAP 0x00000100 /* can do SPNEGO crypt */
#define CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP 0x00000200 /* must do */
-#define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and
- QFS PROXY call */
+#define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and QFS PROXY call */
#ifdef CONFIG_CIFS_POSIX
/* presumably don't need the 0x20 POSIX_PATH_OPS_CAP since we never send
LockingX instead of posix locking call on unix sess (and we do not expect
@@ -2367,8 +2364,7 @@ typedef struct {
struct file_allocation_info {
__le64 AllocationSize; /* Note old Samba srvr rounds this up too much */
-} __attribute__((packed)); /* size used on disk, for level 0x103 for set,
- 0x105 for query */
+} __packed; /* size used on disk, for level 0x103 for set, 0x105 for query */
struct file_end_of_file_info {
__le64 FileSize; /* offset to end of file */
@@ -2408,8 +2404,7 @@ struct cifs_posix_acl { /* access conrol list (ACL) */
__le16 access_entry_count; /* access ACL - count of entries */
__le16 default_entry_count; /* default ACL - count of entries */
struct cifs_posix_ace ace_array[];
- /* followed by
- struct cifs_posix_ace default_ace_arraay[] */
+ /* followed by struct cifs_posix_ace default_ace_array[] */
} __attribute__((packed)); /* level 0x204 */
/* types of access control entries already defined in posix_acl.h */
@@ -2428,17 +2423,17 @@ struct cifs_posix_acl { /* access conrol list (ACL) */
/* end of POSIX ACL definitions */
/* POSIX Open Flags */
-#define SMB_O_RDONLY 0x1
-#define SMB_O_WRONLY 0x2
-#define SMB_O_RDWR 0x4
-#define SMB_O_CREAT 0x10
-#define SMB_O_EXCL 0x20
-#define SMB_O_TRUNC 0x40
-#define SMB_O_APPEND 0x80
-#define SMB_O_SYNC 0x100
-#define SMB_O_DIRECTORY 0x200
-#define SMB_O_NOFOLLOW 0x400
-#define SMB_O_DIRECT 0x800
+#define SMB_O_RDONLY 0x1
+#define SMB_O_WRONLY 0x2
+#define SMB_O_RDWR 0x4
+#define SMB_O_CREAT 0x10
+#define SMB_O_EXCL 0x20
+#define SMB_O_TRUNC 0x40
+#define SMB_O_APPEND 0x80
+#define SMB_O_SYNC 0x100
+#define SMB_O_DIRECTORY 0x200
+#define SMB_O_NOFOLLOW 0x400
+#define SMB_O_DIRECT 0x800
typedef struct {
__le32 OpenFlags; /* same as NT CreateX */
@@ -2715,15 +2710,13 @@ typedef struct file_xattr_info {
__u32 xattr_value_len;
char xattr_name[];
/* followed by xattr_value[xattr_value_len], no pad */
-} __attribute__((packed)) FILE_XATTR_INFO; /* extended attribute info
- level 0x205 */
+} __packed FILE_XATTR_INFO; /* extended attribute info level 0x205 */
/* flags for lsattr and chflags commands removed arein uapi/linux/fs.h */
typedef struct file_chattr_info {
__le64 mask; /* list of all possible attribute bits */
__le64 mode; /* list of actual attribute bits on this inode */
-} __attribute__((packed)) FILE_CHATTR_INFO; /* ext attributes
- (chattr, chflags) level 0x206 */
-#endif /* POSIX */
+} __packed FILE_CHATTR_INFO; /* ext attributes (chattr, chflags) level 0x206 */
+#endif /* POSIX */
#endif /* _CIFSPDU_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 3bc94bcc7177..7d4b37eeec98 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -124,7 +124,7 @@ extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *,
struct kvec * /* resp vec */);
extern int SendReceiveBlockingLock(const unsigned int xid,
struct cifs_tcon *ptcon,
- struct smb_hdr *in_buf ,
+ struct smb_hdr *in_buf,
struct smb_hdr *out_buf,
int *bytes_returned);
void
@@ -182,10 +182,9 @@ extern int cifs_unlock_range(struct cifsFileInfo *cfile,
extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile);
extern void cifs_down_write(struct rw_semaphore *sem);
-extern struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid,
- struct file *file,
- struct tcon_link *tlink,
- __u32 oplock);
+struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
+ struct tcon_link *tlink, __u32 oplock,
+ const char *symlink_target);
extern int cifs_posix_open(const char *full_path, struct inode **inode,
struct super_block *sb, int mode,
unsigned int f_flags, __u32 *oplock, __u16 *netfid,
@@ -200,9 +199,9 @@ extern int cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
extern struct inode *cifs_iget(struct super_block *sb,
struct cifs_fattr *fattr);
-extern int cifs_get_inode_info(struct inode **inode, const char *full_path,
- FILE_ALL_INFO *data, struct super_block *sb,
- int xid, const struct cifs_fid *fid);
+int cifs_get_inode_info(struct inode **inode, const char *full_path,
+ struct cifs_open_info_data *data, struct super_block *sb, int xid,
+ const struct cifs_fid *fid);
extern int smb311_posix_get_inode_info(struct inode **pinode, const char *search_path,
struct super_block *sb, unsigned int xid);
extern int cifs_get_inode_info_unix(struct inode **pinode,
@@ -225,6 +224,10 @@ extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
const char *, u32 *, u32);
extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *,
const struct cifs_fid *, u32 *, u32);
+extern struct posix_acl *cifs_get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, int type);
+extern int cifs_set_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct posix_acl *acl, int type);
extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *,
const char *, int);
extern unsigned int setup_authusers_ACE(struct cifs_ace *pace);
@@ -538,14 +541,14 @@ extern int CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon,
__u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen);
extern int CIFSSMBSetCIFSACL(const unsigned int, struct cifs_tcon *, __u16,
struct cifs_ntsd *, __u32, int);
-extern int CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
- const unsigned char *searchName,
- char *acl_inf, const int buflen, const int acl_type,
- const struct nls_table *nls_codepage, int remap_special_chars);
-extern int CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
- const unsigned char *fileName,
- const char *local_acl, const int buflen, const int acl_type,
- const struct nls_table *nls_codepage, int remap_special_chars);
+extern int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon,
+ const unsigned char *searchName,
+ struct posix_acl **acl, const int acl_type,
+ const struct nls_table *nls_codepage, int remap);
+extern int cifs_do_set_acl(const unsigned int xid, struct cifs_tcon *tcon,
+ const unsigned char *fileName,
+ const struct posix_acl *acl, const int acl_type,
+ const struct nls_table *nls_codepage, int remap);
extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon,
const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
#endif /* CIFS_ALLOW_INSECURE_LEGACY */
@@ -598,12 +601,11 @@ struct cifs_aio_ctx *cifs_aio_ctx_alloc(void);
void cifs_aio_ctx_release(struct kref *refcount);
int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw);
-int cifs_alloc_hash(const char *name, struct crypto_shash **shash,
- struct sdesc **sdesc);
-void cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc);
+int cifs_alloc_hash(const char *name, struct shash_desc **sdesc);
+void cifs_free_hash(struct shash_desc **sdesc);
-extern void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page,
- unsigned int *len, unsigned int *offset);
+void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page,
+ unsigned int *len, unsigned int *offset);
struct cifs_chan *
cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server);
int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses);
@@ -639,7 +641,7 @@ cifs_chan_is_iface_active(struct cifs_ses *ses,
int
cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server);
int
-SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon);
+SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_mount);
void extract_unc_hostname(const char *unc, const char **h, size_t *len);
int copy_path_name(char *dst, const char *src);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 7aa91e272027..23f10e0d6e7e 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -465,7 +465,7 @@ CIFSSMBNegotiate(const unsigned int xid,
for (i = 0; i < CIFS_NUM_PROT; i++) {
size_t len = strlen(protocols[i].name) + 1;
- memcpy(pSMB->DialectsArray+count, protocols[i].name, len);
+ memcpy(&pSMB->DialectsArray[count], protocols[i].name, len);
count += len;
}
inc_rfc1001_len(pSMB, count);
@@ -2305,7 +2305,7 @@ int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon,
remap);
}
rename_info->target_name_len = cpu_to_le32(2 * len_of_str);
- count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str);
+ count = sizeof(struct set_file_rename) + (2 * len_of_str);
byte_count += count;
pSMB->DataCount = cpu_to_le16(count);
pSMB->TotalDataCount = pSMB->DataCount;
@@ -2914,32 +2914,57 @@ CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
#ifdef CONFIG_CIFS_POSIX
-/*Convert an Access Control Entry from wire format to local POSIX xattr format*/
-static void cifs_convert_ace(struct posix_acl_xattr_entry *ace,
- struct cifs_posix_ace *cifs_ace)
+#ifdef CONFIG_FS_POSIX_ACL
+/**
+ * cifs_init_posix_acl - convert ACL from cifs to POSIX ACL format
+ * @ace: POSIX ACL entry to store converted ACL into
+ * @cifs_ace: ACL in cifs format
+ *
+ * Convert an Access Control Entry from wire format to local POSIX xattr
+ * format.
+ *
+ * Note that the @cifs_uid member is used to store both {g,u}id_t.
+ */
+static void cifs_init_posix_acl(struct posix_acl_entry *ace,
+ struct cifs_posix_ace *cifs_ace)
{
/* u8 cifs fields do not need le conversion */
- ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm);
- ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag);
- ace->e_id = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid));
-/*
- cifs_dbg(FYI, "perm %d tag %d id %d\n",
- ace->e_perm, ace->e_tag, ace->e_id);
-*/
+ ace->e_perm = cifs_ace->cifs_e_perm;
+ ace->e_tag = cifs_ace->cifs_e_tag;
+ switch (ace->e_tag) {
+ case ACL_USER:
+ ace->e_uid = make_kuid(&init_user_ns,
+ le64_to_cpu(cifs_ace->cifs_uid));
+ break;
+ case ACL_GROUP:
+ ace->e_gid = make_kgid(&init_user_ns,
+ le64_to_cpu(cifs_ace->cifs_uid));
+ break;
+ }
return;
}
-/* Convert ACL from CIFS POSIX wire format to local Linux POSIX ACL xattr */
-static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
- const int acl_type, const int size_of_data_area)
+/**
+ * cifs_to_posix_acl - copy cifs ACL format to POSIX ACL format
+ * @acl: ACLs returned in POSIX ACL format
+ * @src: ACLs in cifs format
+ * @acl_type: type of POSIX ACL requested
+ * @size_of_data_area: size of SMB we got
+ *
+ * This function converts ACLs from cifs format to POSIX ACL format.
+ * If @acl is NULL then the size of the buffer required to store POSIX ACLs in
+ * their uapi format is returned.
+ */
+static int cifs_to_posix_acl(struct posix_acl **acl, char *src,
+ const int acl_type, const int size_of_data_area)
{
int size = 0;
- int i;
__u16 count;
struct cifs_posix_ace *pACE;
struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)src;
- struct posix_acl_xattr_header *local_acl = (void *)trgt;
+ struct posix_acl *kacl = NULL;
+ struct posix_acl_entry *pa, *pe;
if (le16_to_cpu(cifs_acl->version) != CIFS_ACL_VERSION)
return -EOPNOTSUPP;
@@ -2959,7 +2984,7 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
count = le16_to_cpu(cifs_acl->access_entry_count);
size = sizeof(struct cifs_posix_acl);
size += sizeof(struct cifs_posix_ace) * count;
-/* skip past access ACEs to get to default ACEs */
+ /* skip past access ACEs to get to default ACEs */
pACE = &cifs_acl->ace_array[count];
count = le16_to_cpu(cifs_acl->default_entry_count);
size += sizeof(struct cifs_posix_ace) * count;
@@ -2971,62 +2996,75 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
return -EINVAL;
}
- size = posix_acl_xattr_size(count);
- if ((buflen == 0) || (local_acl == NULL)) {
- /* used to query ACL EA size */
- } else if (size > buflen) {
- return -ERANGE;
- } else /* buffer big enough */ {
- struct posix_acl_xattr_entry *ace = (void *)(local_acl + 1);
-
- local_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
- for (i = 0; i < count ; i++) {
- cifs_convert_ace(&ace[i], pACE);
- pACE++;
- }
+ /* Allocate number of POSIX ACLs to store in VFS format. */
+ kacl = posix_acl_alloc(count, GFP_NOFS);
+ if (!kacl)
+ return -ENOMEM;
+
+ FOREACH_ACL_ENTRY(pa, kacl, pe) {
+ cifs_init_posix_acl(pa, pACE);
+ pACE++;
}
- return size;
+
+ *acl = kacl;
+ return 0;
}
-static void convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace,
- const struct posix_acl_xattr_entry *local_ace)
+/**
+ * cifs_init_ace - convert ACL entry from POSIX ACL to cifs format
+ * @cifs_ace: the cifs ACL entry to store into
+ * @local_ace: the POSIX ACL entry to convert
+ */
+static void cifs_init_ace(struct cifs_posix_ace *cifs_ace,
+ const struct posix_acl_entry *local_ace)
{
- cifs_ace->cifs_e_perm = le16_to_cpu(local_ace->e_perm);
- cifs_ace->cifs_e_tag = le16_to_cpu(local_ace->e_tag);
- /* BB is there a better way to handle the large uid? */
- if (local_ace->e_id == cpu_to_le32(-1)) {
- /* Probably no need to le convert -1 on any arch but can not hurt */
+ cifs_ace->cifs_e_perm = local_ace->e_perm;
+ cifs_ace->cifs_e_tag = local_ace->e_tag;
+
+ switch (local_ace->e_tag) {
+ case ACL_USER:
+ cifs_ace->cifs_uid =
+ cpu_to_le64(from_kuid(&init_user_ns, local_ace->e_uid));
+ break;
+ case ACL_GROUP:
+ cifs_ace->cifs_uid =
+ cpu_to_le64(from_kgid(&init_user_ns, local_ace->e_gid));
+ break;
+ default:
cifs_ace->cifs_uid = cpu_to_le64(-1);
- } else
- cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id));
-/*
- cifs_dbg(FYI, "perm %d tag %d id %d\n",
- ace->e_perm, ace->e_tag, ace->e_id);
-*/
+ }
}
-/* Convert ACL from local Linux POSIX xattr to CIFS POSIX ACL wire format */
-static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
- const int buflen, const int acl_type)
+/**
+ * posix_acl_to_cifs - convert ACLs from POSIX ACL to cifs format
+ * @parm_data: ACLs in cifs format to conver to
+ * @acl: ACLs in POSIX ACL format to convert from
+ * @acl_type: the type of POSIX ACLs stored in @acl
+ *
+ * Return: the number cifs ACL entries after conversion
+ */
+static __u16 posix_acl_to_cifs(char *parm_data, const struct posix_acl *acl,
+ const int acl_type)
{
__u16 rc = 0;
struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)parm_data;
- struct posix_acl_xattr_header *local_acl = (void *)pACL;
- struct posix_acl_xattr_entry *ace = (void *)(local_acl + 1);
+ const struct posix_acl_entry *pa, *pe;
int count;
- int i;
+ int i = 0;
- if ((buflen == 0) || (pACL == NULL) || (cifs_acl == NULL))
+ if ((acl == NULL) || (cifs_acl == NULL))
return 0;
- count = posix_acl_xattr_count((size_t)buflen);
- cifs_dbg(FYI, "setting acl with %d entries from buf of length %d and version of %d\n",
- count, buflen, le32_to_cpu(local_acl->a_version));
- if (le32_to_cpu(local_acl->a_version) != 2) {
- cifs_dbg(FYI, "unknown POSIX ACL version %d\n",
- le32_to_cpu(local_acl->a_version));
- return 0;
- }
+ count = acl->a_count;
+ cifs_dbg(FYI, "setting acl with %d entries\n", count);
+
+ /*
+ * Note that the uapi POSIX ACL version is verified by the VFS and is
+ * independent of the cifs ACL version. Changing the POSIX ACL version
+ * is a uapi change and if it's changed we will pass down the POSIX ACL
+ * version in struct posix_acl from the VFS. For now there's really
+ * only one that all filesystems know how to deal with.
+ */
cifs_acl->version = cpu_to_le16(1);
if (acl_type == ACL_TYPE_ACCESS) {
cifs_acl->access_entry_count = cpu_to_le16(count);
@@ -3038,8 +3076,9 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
cifs_dbg(FYI, "unknown ACL type %d\n", acl_type);
return 0;
}
- for (i = 0; i < count; i++)
- convert_ace_to_cifs_ace(&cifs_acl->ace_array[i], &ace[i]);
+ FOREACH_ACL_ENTRY(pa, acl, pe) {
+ cifs_init_ace(&cifs_acl->ace_array[i++], pa);
+ }
if (rc == 0) {
rc = (__u16)(count * sizeof(struct cifs_posix_ace));
rc += sizeof(struct cifs_posix_acl);
@@ -3048,11 +3087,10 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
return rc;
}
-int
-CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
- const unsigned char *searchName,
- char *acl_inf, const int buflen, const int acl_type,
- const struct nls_table *nls_codepage, int remap)
+int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon,
+ const unsigned char *searchName, struct posix_acl **acl,
+ const int acl_type, const struct nls_table *nls_codepage,
+ int remap)
{
/* SMB_QUERY_POSIX_ACL */
TRANSACTION2_QPI_REQ *pSMB = NULL;
@@ -3124,23 +3162,26 @@ queryAclRetry:
else {
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
__u16 count = le16_to_cpu(pSMBr->t2.DataCount);
- rc = cifs_copy_posix_acl(acl_inf,
+ rc = cifs_to_posix_acl(acl,
(char *)&pSMBr->hdr.Protocol+data_offset,
- buflen, acl_type, count);
+ acl_type, count);
}
}
cifs_buf_release(pSMB);
+ /*
+ * The else branch after SendReceive() doesn't return EAGAIN so if we
+ * allocated @acl in cifs_to_posix_acl() we are guaranteed to return
+ * here and don't leak POSIX ACLs.
+ */
if (rc == -EAGAIN)
goto queryAclRetry;
return rc;
}
-int
-CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
- const unsigned char *fileName,
- const char *local_acl, const int buflen,
- const int acl_type,
- const struct nls_table *nls_codepage, int remap)
+int cifs_do_set_acl(const unsigned int xid, struct cifs_tcon *tcon,
+ const unsigned char *fileName, const struct posix_acl *acl,
+ const int acl_type, const struct nls_table *nls_codepage,
+ int remap)
{
struct smb_com_transaction2_spi_req *pSMB = NULL;
struct smb_com_transaction2_spi_rsp *pSMBr = NULL;
@@ -3181,7 +3222,7 @@ setAclRetry:
pSMB->ParameterOffset = cpu_to_le16(param_offset);
/* convert to on the wire format for POSIX ACL */
- data_count = ACL_to_cifs_posix(parm_data, local_acl, buflen, acl_type);
+ data_count = posix_acl_to_cifs(parm_data, acl, acl_type);
if (data_count == 0) {
rc = -EOPNOTSUPP;
@@ -3211,6 +3252,23 @@ setACLerrorExit:
goto setAclRetry;
return rc;
}
+#else
+int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon,
+ const unsigned char *searchName, struct posix_acl **acl,
+ const int acl_type, const struct nls_table *nls_codepage,
+ int remap)
+{
+ return -EOPNOTSUPP;
+}
+
+int cifs_do_set_acl(const unsigned int xid, struct cifs_tcon *tcon,
+ const unsigned char *fileName, const struct posix_acl *acl,
+ const int acl_type, const struct nls_table *nls_codepage,
+ int remap)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_FS_POSIX_ACL */
int
CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 7ae6f2c08153..7bc7b5e03c51 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -155,7 +155,7 @@ static void smb2_query_server_interfaces(struct work_struct *work)
/*
* query server network interfaces, in case they change
*/
- rc = SMB3_request_interfaces(0, tcon);
+ rc = SMB3_request_interfaces(0, tcon, false);
if (rc) {
cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n",
__func__, rc);
@@ -311,7 +311,7 @@ cifs_abort_connection(struct TCP_Server_Info *server)
}
server->sequence_number = 0;
server->session_estab = false;
- kfree(server->session_key.response);
+ kfree_sensitive(server->session_key.response);
server->session_key.response = NULL;
server->session_key.len = 0;
server->lstrp = jiffies;
@@ -759,7 +759,7 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
{
struct msghdr smb_msg = {};
struct kvec iov = {.iov_base = buf, .iov_len = to_read};
- iov_iter_kvec(&smb_msg.msg_iter, READ, &iov, 1, to_read);
+ iov_iter_kvec(&smb_msg.msg_iter, ITER_DEST, &iov, 1, to_read);
return cifs_readv_from_socket(server, &smb_msg);
}
@@ -774,7 +774,7 @@ cifs_discard_from_socket(struct TCP_Server_Info *server, size_t to_read)
* and cifs_readv_from_socket sets msg_control and msg_controllen
* so little to initialize in struct msghdr
*/
- iov_iter_discard(&smb_msg.msg_iter, READ, to_read);
+ iov_iter_discard(&smb_msg.msg_iter, ITER_DEST, to_read);
return cifs_readv_from_socket(server, &smb_msg);
}
@@ -786,7 +786,7 @@ cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page,
struct msghdr smb_msg = {};
struct bio_vec bv = {
.bv_page = page, .bv_len = to_read, .bv_offset = page_offset};
- iov_iter_bvec(&smb_msg.msg_iter, READ, &bv, 1, to_read);
+ iov_iter_bvec(&smb_msg.msg_iter, ITER_DEST, &bv, 1, to_read);
return cifs_readv_from_socket(server, &smb_msg);
}
@@ -1580,10 +1580,11 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
cifs_crypto_secmech_release(server);
- kfree(server->session_key.response);
+ kfree_sensitive(server->session_key.response);
server->session_key.response = NULL;
server->session_key.len = 0;
kfree(server->hostname);
+ server->hostname = NULL;
task = xchg(&server->tsk, NULL);
if (task)
@@ -1940,7 +1941,8 @@ void cifs_put_smb_ses(struct cifs_ses *ses)
spin_unlock(&ses->ses_lock);
cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count);
- cifs_dbg(FYI, "%s: ses ipc: %s\n", __func__, ses->tcon_ipc ? ses->tcon_ipc->treeName : "NONE");
+ cifs_dbg(FYI,
+ "%s: ses ipc: %s\n", __func__, ses->tcon_ipc ? ses->tcon_ipc->tree_name : "NONE");
spin_lock(&cifs_tcp_ses_lock);
if (--ses->ses_count > 0) {
@@ -2293,7 +2295,7 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
{
if (tcon->status == TID_EXITING)
return 0;
- if (strncmp(tcon->treeName, ctx->UNC, MAX_TREE_SIZE))
+ if (strncmp(tcon->tree_name, ctx->UNC, MAX_TREE_SIZE))
return 0;
if (tcon->seal != ctx->seal)
return 0;
@@ -2831,9 +2833,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
* sessinit is sent but no second negprot
*/
struct rfc1002_session_packet *ses_init_buf;
+ unsigned int req_noscope_len;
struct smb_hdr *smb_buf;
+
ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet),
GFP_KERNEL);
+
if (ses_init_buf) {
ses_init_buf->trailer.session_req.called_len = 32;
@@ -2869,8 +2874,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
ses_init_buf->trailer.session_req.scope2 = 0;
smb_buf = (struct smb_hdr *)ses_init_buf;
- /* sizeof RFC1002_SESSION_REQUEST with no scope */
- smb_buf->smb_buf_length = cpu_to_be32(0x81000044);
+ /* sizeof RFC1002_SESSION_REQUEST with no scopes */
+ req_noscope_len = sizeof(struct rfc1002_session_packet) - 2;
+
+ /* == cpu_to_be32(0x81000044) */
+ smb_buf->smb_buf_length =
+ cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | req_noscope_len);
rc = smb_send(server, smb_buf, 0x44);
kfree(ses_init_buf);
/*
@@ -2935,6 +2944,7 @@ generic_ip_connect(struct TCP_Server_Info *server)
cifs_dbg(FYI, "Socket created\n");
server->ssocket = socket;
socket->sk->sk_allocation = GFP_NOFS;
+ socket->sk->sk_use_task_frag = false;
if (sfamily == AF_INET6)
cifs_reclassify_socket6(socket);
else
@@ -3846,9 +3856,13 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
uuid_copy(&cifs_sb->dfs_mount_id, &mnt_ctx.mount_id);
out:
- free_xid(mnt_ctx.xid);
cifs_try_adding_channels(cifs_sb, mnt_ctx.ses);
- return mount_setup_tlink(cifs_sb, mnt_ctx.ses, mnt_ctx.tcon);
+ rc = mount_setup_tlink(cifs_sb, mnt_ctx.ses, mnt_ctx.tcon);
+ if (rc)
+ goto error;
+
+ free_xid(mnt_ctx.xid);
+ return rc;
error:
dfs_cache_put_refsrv_sessions(&mnt_ctx.mount_id);
@@ -3875,8 +3889,12 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
goto error;
}
+ rc = mount_setup_tlink(cifs_sb, mnt_ctx.ses, mnt_ctx.tcon);
+ if (rc)
+ goto error;
+
free_xid(mnt_ctx.xid);
- return mount_setup_tlink(cifs_sb, mnt_ctx.ses, mnt_ctx.tcon);
+ return rc;
error:
mount_put_conns(&mnt_ctx);
@@ -3921,12 +3939,11 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
pSMB->AndXCommand = 0xFF;
pSMB->Flags = cpu_to_le16(TCON_EXTENDED_SECINFO);
bcc_ptr = &pSMB->Password[0];
- if (tcon->pipe || (ses->server->sec_mode & SECMODE_USER)) {
- pSMB->PasswordLength = cpu_to_le16(1); /* minimum */
- *bcc_ptr = 0; /* password is null byte */
- bcc_ptr++; /* skip password */
- /* already aligned so no need to do it below */
- }
+
+ pSMB->PasswordLength = cpu_to_le16(1); /* minimum */
+ *bcc_ptr = 0; /* password is null byte */
+ bcc_ptr++; /* skip password */
+ /* already aligned so no need to do it below */
if (ses->server->sign)
smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
@@ -3989,7 +4006,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
}
bcc_ptr += length + 1;
bytes_left -= (length + 1);
- strscpy(tcon->treeName, tree, sizeof(tcon->treeName));
+ strscpy(tcon->tree_name, tree, sizeof(tcon->tree_name));
/* mostly informational -- no need to fail on error here */
kfree(tcon->nativeFileSystem);
@@ -4134,7 +4151,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
if (ses->auth_key.response) {
cifs_dbg(FYI, "Free previous auth_key.response = %p\n",
ses->auth_key.response);
- kfree(ses->auth_key.response);
+ kfree_sensitive(ses->auth_key.response);
ses->auth_key.response = NULL;
ses->auth_key.len = 0;
}
@@ -4197,7 +4214,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
ctx->local_nls = cifs_sb->local_nls;
ctx->linux_uid = fsuid;
ctx->cred_uid = fsuid;
- ctx->UNC = master_tcon->treeName;
+ ctx->UNC = master_tcon->tree_name;
ctx->retry = master_tcon->retry;
ctx->nocase = master_tcon->nocase;
ctx->nohandlecache = master_tcon->nohandlecache;
@@ -4663,7 +4680,7 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru
/* If it is not dfs or there was no cached dfs referral, then reconnect to same share */
if (!server->current_fullpath ||
dfs_cache_noreq_find(server->current_fullpath + 1, &ref, &tl)) {
- rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, cifs_sb->local_nls);
+ rc = ops->tree_connect(xid, tcon->ses, tcon->tree_name, tcon, cifs_sb->local_nls);
goto out;
}
@@ -4707,7 +4724,7 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru
tcon->status = TID_IN_TCON;
spin_unlock(&tcon->tc_lock);
- rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, nlsc);
+ rc = ops->tree_connect(xid, tcon->ses, tcon->tree_name, tcon, nlsc);
if (rc) {
spin_lock(&tcon->tc_lock);
if (tcon->status == TID_IN_TCON)
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index a9b6c3eba6de..e70915ad7541 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -98,7 +98,7 @@ static struct cifs_ses *find_ipc_from_server_path(struct cifs_ses **ses, const c
get_ipc_unc(path, unc, sizeof(unc));
for (; *ses; ses++) {
- if (!strcasecmp(unc, (*ses)->tcon_ipc->treeName))
+ if (!strcasecmp(unc, (*ses)->tcon_ipc->tree_name))
return *ses;
}
return ERR_PTR(-ENOENT);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 08f7392716e2..8b1c37158556 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -50,7 +50,7 @@ cifs_build_path_to_root(struct smb3_fs_context *ctx, struct cifs_sb_info *cifs_s
}
if (add_treename)
- dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
+ dfsplen = strnlen(tcon->tree_name, MAX_TREE_SIZE + 1);
else
dfsplen = 0;
@@ -59,7 +59,7 @@ cifs_build_path_to_root(struct smb3_fs_context *ctx, struct cifs_sb_info *cifs_s
return full_path;
if (dfsplen)
- memcpy(full_path, tcon->treeName, dfsplen);
+ memcpy(full_path, tcon->tree_name, dfsplen);
full_path[dfsplen] = CIFS_DIR_SEP(cifs_sb);
memcpy(full_path + dfsplen + 1, ctx->prepath, pplen);
convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
@@ -93,7 +93,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
return ERR_PTR(-ENOMEM);
if (prefix)
- dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
+ dfsplen = strnlen(tcon->tree_name, MAX_TREE_SIZE + 1);
else
dfsplen = 0;
@@ -123,7 +123,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
}
if (dfsplen) {
s -= dfsplen;
- memcpy(s, tcon->treeName, dfsplen);
+ memcpy(s, tcon->tree_name, dfsplen);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
int i;
for (i = 0; i < dfsplen; i++) {
@@ -165,10 +165,9 @@ check_name(struct dentry *direntry, struct cifs_tcon *tcon)
/* Inode operations in similar order to how they appear in Linux file fs.h */
-static int
-cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
- struct tcon_link *tlink, unsigned oflags, umode_t mode,
- __u32 *oplock, struct cifs_fid *fid)
+static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
+ struct tcon_link *tlink, unsigned int oflags, umode_t mode, __u32 *oplock,
+ struct cifs_fid *fid, struct cifs_open_info_data *buf)
{
int rc = -ENOENT;
int create_options = CREATE_NOT_DIR;
@@ -177,7 +176,6 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
struct cifs_tcon *tcon = tlink_tcon(tlink);
const char *full_path;
void *page = alloc_dentry_path();
- FILE_ALL_INFO *buf = NULL;
struct inode *newinode = NULL;
int disposition;
struct TCP_Server_Info *server = tcon->ses->server;
@@ -290,12 +288,6 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
goto out;
}
- buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
- if (buf == NULL) {
- rc = -ENOMEM;
- goto out;
- }
-
/*
* if we're not using unix extensions, see if we need to set
* ATTR_READONLY on the create call
@@ -364,8 +356,7 @@ cifs_create_get_file_info:
{
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
/* TODO: Add support for calling POSIX query info here, but passing in fid */
- rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb,
- xid, fid);
+ rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb, xid, fid);
if (newinode) {
if (server->ops->set_lease_key)
server->ops->set_lease_key(newinode, fid);
@@ -402,7 +393,6 @@ cifs_create_set_dentry:
d_add(direntry, newinode);
out:
- kfree(buf);
free_dentry_path(page);
return rc;
@@ -423,10 +413,11 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
struct tcon_link *tlink;
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
- struct cifs_fid fid;
+ struct cifs_fid fid = {};
struct cifs_pending_open open;
__u32 oplock;
struct cifsFileInfo *file_info;
+ struct cifs_open_info_data buf = {};
if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb))))
return -EIO;
@@ -484,8 +475,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
cifs_add_pending_open(&fid, tlink, &open);
rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode,
- &oplock, &fid);
-
+ &oplock, &fid, &buf);
if (rc) {
cifs_del_pending_open(&open);
goto out;
@@ -510,7 +500,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
file->f_op = &cifs_file_direct_ops;
}
- file_info = cifs_new_fileinfo(&fid, file, tlink, oplock);
+ file_info = cifs_new_fileinfo(&fid, file, tlink, oplock, buf.symlink_target);
if (file_info == NULL) {
if (server->ops->close)
server->ops->close(xid, tcon, &fid);
@@ -526,6 +516,7 @@ out:
cifs_put_tlink(tlink);
out_free_xid:
free_xid(xid);
+ cifs_free_open_info(&buf);
return rc;
}
@@ -547,12 +538,15 @@ int cifs_create(struct user_namespace *mnt_userns, struct inode *inode,
struct TCP_Server_Info *server;
struct cifs_fid fid;
__u32 oplock;
+ struct cifs_open_info_data buf = {};
cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %pd and dentry = 0x%p\n",
inode, direntry, direntry);
- if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb))))
- return -EIO;
+ if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) {
+ rc = -EIO;
+ goto out_free_xid;
+ }
tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb));
rc = PTR_ERR(tlink);
@@ -565,11 +559,11 @@ int cifs_create(struct user_namespace *mnt_userns, struct inode *inode,
if (server->ops->new_lease_key)
server->ops->new_lease_key(&fid);
- rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode,
- &oplock, &fid);
+ rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, &oplock, &fid, &buf);
if (!rc && server->ops->close)
server->ops->close(xid, tcon, &fid);
+ cifs_free_open_info(&buf);
cifs_put_tlink(tlink);
out_free_xid:
free_xid(xid);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 6f38b134a346..22dfc1f8b4f1 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -209,16 +209,14 @@ posix_open_ret:
}
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
-static int
-cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
- struct cifs_tcon *tcon, unsigned int f_flags, __u32 *oplock,
- struct cifs_fid *fid, unsigned int xid)
+static int cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
+ struct cifs_tcon *tcon, unsigned int f_flags, __u32 *oplock,
+ struct cifs_fid *fid, unsigned int xid, struct cifs_open_info_data *buf)
{
int rc;
int desired_access;
int disposition;
int create_options = CREATE_NOT_DIR;
- FILE_ALL_INFO *buf;
struct TCP_Server_Info *server = tcon->ses->server;
struct cifs_open_parms oparms;
@@ -255,10 +253,6 @@ cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *ci
/* BB pass O_SYNC flag through on file attributes .. BB */
- buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
/* O_SYNC also has bit for O_DSYNC so following check picks up either */
if (f_flags & O_SYNC)
create_options |= CREATE_WRITE_THROUGH;
@@ -276,9 +270,8 @@ cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *ci
oparms.reconnect = false;
rc = server->ops->open(xid, &oparms, oplock, buf);
-
if (rc)
- goto out;
+ return rc;
/* TODO: Add support for calling posix query info but with passing in fid */
if (tcon->unix_ext)
@@ -294,8 +287,6 @@ cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *ci
rc = -EOPENSTALE;
}
-out:
- kfree(buf);
return rc;
}
@@ -325,9 +316,9 @@ cifs_down_write(struct rw_semaphore *sem)
static void cifsFileInfo_put_work(struct work_struct *work);
-struct cifsFileInfo *
-cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
- struct tcon_link *tlink, __u32 oplock)
+struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
+ struct tcon_link *tlink, __u32 oplock,
+ const char *symlink_target)
{
struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
@@ -347,6 +338,15 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
return NULL;
}
+ if (symlink_target) {
+ cfile->symlink_target = kstrdup(symlink_target, GFP_KERNEL);
+ if (!cfile->symlink_target) {
+ kfree(fdlocks);
+ kfree(cfile);
+ return NULL;
+ }
+ }
+
INIT_LIST_HEAD(&fdlocks->locks);
fdlocks->cfile = cfile;
cfile->llist = fdlocks;
@@ -440,6 +440,7 @@ static void cifsFileInfo_put_final(struct cifsFileInfo *cifs_file)
cifs_put_tlink(cifs_file->tlink);
dput(cifs_file->dentry);
cifs_sb_deactive(sb);
+ kfree(cifs_file->symlink_target);
kfree(cifs_file);
}
@@ -488,7 +489,7 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file,
struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct super_block *sb = inode->i_sb;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
- struct cifs_fid fid;
+ struct cifs_fid fid = {};
struct cifs_pending_open open;
bool oplock_break_cancelled;
@@ -570,8 +571,9 @@ int cifs_open(struct inode *inode, struct file *file)
void *page;
const char *full_path;
bool posix_open_ok = false;
- struct cifs_fid fid;
+ struct cifs_fid fid = {};
struct cifs_pending_open open;
+ struct cifs_open_info_data data = {};
xid = get_xid();
@@ -662,15 +664,15 @@ int cifs_open(struct inode *inode, struct file *file)
if (server->ops->get_lease_key)
server->ops->get_lease_key(inode, &fid);
- rc = cifs_nt_open(full_path, inode, cifs_sb, tcon,
- file->f_flags, &oplock, &fid, xid);
+ rc = cifs_nt_open(full_path, inode, cifs_sb, tcon, file->f_flags, &oplock, &fid,
+ xid, &data);
if (rc) {
cifs_del_pending_open(&open);
goto out;
}
}
- cfile = cifs_new_fileinfo(&fid, file, tlink, oplock);
+ cfile = cifs_new_fileinfo(&fid, file, tlink, oplock, data.symlink_target);
if (cfile == NULL) {
if (server->ops->close)
server->ops->close(xid, tcon, &fid);
@@ -712,6 +714,7 @@ out:
free_dentry_path(page);
free_xid(xid);
cifs_put_tlink(tlink);
+ cifs_free_open_info(&data);
return rc;
}
@@ -1410,7 +1413,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
struct inode *inode = d_inode(cfile->dentry);
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct file_lock *flock;
- struct file_lock_context *flctx = inode->i_flctx;
+ struct file_lock_context *flctx = locks_inode_context(inode);
unsigned int count = 0, i;
int rc = 0, xid, type;
struct list_head locks_to_send, *el;
@@ -1882,11 +1885,13 @@ int cifs_flock(struct file *file, int cmd, struct file_lock *fl)
struct cifsFileInfo *cfile;
__u32 type;
- rc = -EACCES;
xid = get_xid();
- if (!(fl->fl_flags & FL_FLOCK))
- return -ENOLCK;
+ if (!(fl->fl_flags & FL_FLOCK)) {
+ rc = -ENOLCK;
+ free_xid(xid);
+ return rc;
+ }
cfile = (struct cifsFileInfo *)file->private_data;
tcon = tlink_tcon(cfile->tlink);
@@ -1905,8 +1910,9 @@ int cifs_flock(struct file *file, int cmd, struct file_lock *fl)
* if no lock or unlock then nothing to do since we do not
* know what it is
*/
+ rc = -EOPNOTSUPP;
free_xid(xid);
- return -EOPNOTSUPP;
+ return rc;
}
rc = cifs_setlk(file, fl, type, wait_flag, posix_lck, lock, unlock,
@@ -2428,12 +2434,16 @@ cifs_writev_complete(struct work_struct *work)
struct cifs_writedata *
cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete)
{
+ struct cifs_writedata *writedata = NULL;
struct page **pages =
kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
- if (pages)
- return cifs_writedata_direct_alloc(pages, complete);
+ if (pages) {
+ writedata = cifs_writedata_direct_alloc(pages, complete);
+ if (!writedata)
+ kvfree(pages);
+ }
- return NULL;
+ return writedata;
}
struct cifs_writedata *
@@ -2636,6 +2646,21 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
return rc;
}
+static int
+cifs_writepage_locked(struct page *page, struct writeback_control *wbc);
+
+static int cifs_write_one_page(struct page *page, struct writeback_control *wbc,
+ void *data)
+{
+ struct address_space *mapping = data;
+ int ret;
+
+ ret = cifs_writepage_locked(page, wbc);
+ unlock_page(page);
+ mapping_set_error(mapping, ret);
+ return ret;
+}
+
static int cifs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -2652,10 +2677,11 @@ static int cifs_writepages(struct address_space *mapping,
/*
* If wsize is smaller than the page cache size, default to writing
- * one page at a time via cifs_writepage
+ * one page at a time.
*/
if (cifs_sb->ctx->wsize < PAGE_SIZE)
- return generic_writepages(mapping, wbc);
+ return write_cache_pages(mapping, wbc, cifs_write_one_page,
+ mapping);
xid = get_xid();
if (wbc->range_cyclic) {
@@ -2842,13 +2868,6 @@ retry_write:
return rc;
}
-static int cifs_writepage(struct page *page, struct writeback_control *wbc)
-{
- int rc = cifs_writepage_locked(page, wbc);
- unlock_page(page);
- return rc;
-}
-
static int cifs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
@@ -3293,6 +3312,9 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
cifs_uncached_writev_complete);
if (!wdata) {
rc = -ENOMEM;
+ for (i = 0; i < nr_pages; i++)
+ put_page(pagevec[i]);
+ kvfree(pagevec);
add_credits_and_wake_if(server, credits, 0);
break;
}
@@ -3519,7 +3541,7 @@ static ssize_t __cifs_writev(
ctx->iter = *from;
ctx->len = len;
} else {
- rc = setup_aio_ctx_iter(ctx, from, WRITE);
+ rc = setup_aio_ctx_iter(ctx, from, ITER_SOURCE);
if (rc) {
kref_put(&ctx->refcount, cifs_aio_ctx_release);
return rc;
@@ -4263,7 +4285,7 @@ static ssize_t __cifs_readv(
ctx->iter = *to;
ctx->len = len;
} else {
- rc = setup_aio_ctx_iter(ctx, to, READ);
+ rc = setup_aio_ctx_iter(ctx, to, ITER_DEST);
if (rc) {
kref_put(&ctx->refcount, cifs_aio_ctx_release);
return rc;
@@ -4271,6 +4293,15 @@ static ssize_t __cifs_readv(
len = ctx->len;
}
+ if (direct) {
+ rc = filemap_write_and_wait_range(file->f_inode->i_mapping,
+ offset, offset + len - 1);
+ if (rc) {
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ return -EAGAIN;
+ }
+ }
+
/* grab a lock here due to read response handlers can access ctx */
mutex_lock(&ctx->aio_mutex);
@@ -5209,7 +5240,6 @@ static bool cifs_dirty_folio(struct address_space *mapping, struct folio *folio)
const struct address_space_operations cifs_addr_ops = {
.read_folio = cifs_read_folio,
.readahead = cifs_readahead,
- .writepage = cifs_writepage,
.writepages = cifs_writepages,
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
@@ -5218,10 +5248,10 @@ const struct address_space_operations cifs_addr_ops = {
.direct_IO = cifs_direct_io,
.invalidate_folio = cifs_invalidate_folio,
.launder_folio = cifs_launder_folio,
+ .migrate_folio = filemap_migrate_folio,
/*
- * TODO: investigate and if useful we could add an cifs_migratePage
- * helper (under an CONFIG_MIGRATION) in the future, and also
- * investigate and add an is_dirty_writeback helper if needed
+ * TODO: investigate and if useful we could add an is_dirty_writeback
+ * helper if needed
*/
.swap_activate = cifs_swap_activate,
.swap_deactivate = cifs_swap_deactivate,
@@ -5234,7 +5264,6 @@ const struct address_space_operations cifs_addr_ops = {
*/
const struct address_space_operations cifs_addr_ops_smallbuf = {
.read_folio = cifs_read_folio,
- .writepage = cifs_writepage,
.writepages = cifs_writepages,
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
@@ -5242,4 +5271,5 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
.release_folio = cifs_release_folio,
.invalidate_folio = cifs_invalidate_folio,
.launder_folio = cifs_launder_folio,
+ .migrate_folio = filemap_migrate_folio,
};
diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
index 0e13dec86b25..2c92a821e028 100644
--- a/fs/cifs/fs_context.c
+++ b/fs/cifs/fs_context.c
@@ -791,6 +791,13 @@ do { \
cifs_sb->ctx->field = NULL; \
} while (0)
+#define STEAL_STRING_SENSITIVE(cifs_sb, ctx, field) \
+do { \
+ kfree_sensitive(ctx->field); \
+ ctx->field = cifs_sb->ctx->field; \
+ cifs_sb->ctx->field = NULL; \
+} while (0)
+
static int smb3_reconfigure(struct fs_context *fc)
{
struct smb3_fs_context *ctx = smb3_fc2context(fc);
@@ -811,7 +818,7 @@ static int smb3_reconfigure(struct fs_context *fc)
STEAL_STRING(cifs_sb, ctx, UNC);
STEAL_STRING(cifs_sb, ctx, source);
STEAL_STRING(cifs_sb, ctx, username);
- STEAL_STRING(cifs_sb, ctx, password);
+ STEAL_STRING_SENSITIVE(cifs_sb, ctx, password);
STEAL_STRING(cifs_sb, ctx, domainname);
STEAL_STRING(cifs_sb, ctx, nodename);
STEAL_STRING(cifs_sb, ctx, iocharset);
@@ -877,16 +884,21 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->nodfs = 1;
break;
case Opt_hard:
- if (result.negated)
+ if (result.negated) {
+ if (ctx->retry == 1)
+ cifs_dbg(VFS, "conflicting hard vs. soft mount options\n");
ctx->retry = 0;
- else
+ } else
ctx->retry = 1;
break;
case Opt_soft:
if (result.negated)
ctx->retry = 1;
- else
+ else {
+ if (ctx->retry == 1)
+ cifs_dbg(VFS, "conflicting hard vs soft mount options\n");
ctx->retry = 0;
+ }
break;
case Opt_mapposix:
if (result.negated)
@@ -1162,7 +1174,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
}
break;
case Opt_pass:
- kfree(ctx->password);
+ kfree_sensitive(ctx->password);
ctx->password = NULL;
if (strlen(param->string) == 0)
break;
@@ -1470,6 +1482,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
return 0;
cifs_parse_mount_err:
+ kfree_sensitive(ctx->password);
return -EINVAL;
}
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 23ef56f55ce5..f6f3a6b75601 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -45,7 +45,7 @@ int cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
memset(&key, 0, sizeof(key));
- sharename = extract_sharename(tcon->treeName);
+ sharename = extract_sharename(tcon->tree_name);
if (IS_ERR(sharename)) {
cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__);
return -EINVAL;
@@ -150,7 +150,7 @@ static int fscache_fallback_read_page(struct inode *inode, struct page *page)
bvec[0].bv_page = page;
bvec[0].bv_offset = 0;
bvec[0].bv_len = PAGE_SIZE;
- iov_iter_bvec(&iter, READ, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
+ iov_iter_bvec(&iter, ITER_DEST, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
ret = fscache_begin_read_operation(&cres, cookie);
if (ret < 0)
@@ -180,7 +180,7 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page,
bvec[0].bv_page = page;
bvec[0].bv_offset = 0;
bvec[0].bv_len = PAGE_SIZE;
- iov_iter_bvec(&iter, WRITE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
+ iov_iter_bvec(&iter, ITER_SOURCE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
ret = fscache_begin_write_operation(&cres, cookie);
if (ret < 0)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index bac08c20f559..286a5400b94e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -210,6 +210,12 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
*/
inode->i_blocks = (512 - 1 + fattr->cf_bytes) >> 9;
}
+
+ if (S_ISLNK(fattr->cf_mode)) {
+ kfree(cifs_i->symlink_target);
+ cifs_i->symlink_target = fattr->cf_symlink_target;
+ fattr->cf_symlink_target = NULL;
+ }
spin_unlock(&inode->i_lock);
if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL)
@@ -347,13 +353,22 @@ cifs_get_file_info_unix(struct file *filp)
int rc;
unsigned int xid;
FILE_UNIX_BASIC_INFO find_data;
- struct cifs_fattr fattr;
+ struct cifs_fattr fattr = {};
struct inode *inode = file_inode(filp);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifsFileInfo *cfile = filp->private_data;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
xid = get_xid();
+
+ if (cfile->symlink_target) {
+ fattr.cf_symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL);
+ if (!fattr.cf_symlink_target) {
+ rc = -ENOMEM;
+ goto cifs_gfiunix_out;
+ }
+ }
+
rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->fid.netfid, &find_data);
if (!rc) {
cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
@@ -378,6 +393,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
FILE_UNIX_BASIC_INFO find_data;
struct cifs_fattr fattr;
struct cifs_tcon *tcon;
+ struct TCP_Server_Info *server;
struct tcon_link *tlink;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -387,10 +403,12 @@ int cifs_get_inode_info_unix(struct inode **pinode,
if (IS_ERR(tlink))
return PTR_ERR(tlink);
tcon = tlink_tcon(tlink);
+ server = tcon->ses->server;
/* could have done a find first instead but this returns more info */
rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data,
cifs_sb->local_nls, cifs_remap(cifs_sb));
+ cifs_dbg(FYI, "%s: query path info: rc = %d\n", __func__, rc);
cifs_put_tlink(tlink);
if (!rc) {
@@ -410,6 +428,17 @@ int cifs_get_inode_info_unix(struct inode **pinode,
cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc);
}
+ if (S_ISLNK(fattr.cf_mode) && !fattr.cf_symlink_target) {
+ if (!server->ops->query_symlink)
+ return -EOPNOTSUPP;
+ rc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path,
+ &fattr.cf_symlink_target, false);
+ if (rc) {
+ cifs_dbg(FYI, "%s: query_symlink: %d\n", __func__, rc);
+ goto cgiiu_exit;
+ }
+ }
+
if (*pinode == NULL) {
/* get new inode */
cifs_fill_uniqueid(sb, &fattr);
@@ -432,6 +461,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
}
cgiiu_exit:
+ kfree(fattr.cf_symlink_target);
return rc;
}
#else
@@ -601,10 +631,12 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
}
/* Fill a cifs_fattr struct with info from POSIX info struct */
-static void
-smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct smb311_posix_qinfo *info,
- struct super_block *sb, bool adjust_tz, bool symlink)
+static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data,
+ struct cifs_sid *owner,
+ struct cifs_sid *group,
+ struct super_block *sb, bool adjust_tz, bool symlink)
{
+ struct smb311_posix_qinfo *info = &data->posix_fi;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
@@ -639,6 +671,8 @@ smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct smb311_posix_qinfo *
if (symlink) {
fattr->cf_mode |= S_IFLNK;
fattr->cf_dtype = DT_LNK;
+ fattr->cf_symlink_target = data->symlink_target;
+ data->symlink_target = NULL;
} else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
fattr->cf_mode |= S_IFDIR;
fattr->cf_dtype = DT_DIR;
@@ -648,20 +682,18 @@ smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct smb311_posix_qinfo *
}
/* else if reparse point ... TODO: add support for FIFO and blk dev; special file types */
- fattr->cf_uid = cifs_sb->ctx->linux_uid; /* TODO: map uid and gid from SID */
- fattr->cf_gid = cifs_sb->ctx->linux_gid;
+ sid_to_id(cifs_sb, owner, fattr, SIDOWNER);
+ sid_to_id(cifs_sb, group, fattr, SIDGROUP);
cifs_dbg(FYI, "POSIX query info: mode 0x%x uniqueid 0x%llx nlink %d\n",
fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink);
}
-
-/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */
-static void
-cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
- struct super_block *sb, bool adjust_tz,
- bool symlink, u32 reparse_tag)
+static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data,
+ struct super_block *sb, bool adjust_tz, bool symlink,
+ u32 reparse_tag)
{
+ struct smb2_file_all_info *info = &data->fi;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
@@ -703,7 +735,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
} else if (reparse_tag == IO_REPARSE_TAG_LX_BLK) {
fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode;
fattr->cf_dtype = DT_BLK;
- } else if (symlink) { /* TODO add more reparse tag checks */
+ } else if (symlink || reparse_tag == IO_REPARSE_TAG_SYMLINK ||
+ reparse_tag == IO_REPARSE_TAG_NFS) {
fattr->cf_mode = S_IFLNK;
fattr->cf_dtype = DT_LNK;
} else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
@@ -735,6 +768,11 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
}
}
+ if (S_ISLNK(fattr->cf_mode)) {
+ fattr->cf_symlink_target = data->symlink_target;
+ data->symlink_target = NULL;
+ }
+
fattr->cf_uid = cifs_sb->ctx->linux_uid;
fattr->cf_gid = cifs_sb->ctx->linux_gid;
}
@@ -744,23 +782,28 @@ cifs_get_file_info(struct file *filp)
{
int rc;
unsigned int xid;
- FILE_ALL_INFO find_data;
+ struct cifs_open_info_data data = {};
struct cifs_fattr fattr;
struct inode *inode = file_inode(filp);
struct cifsFileInfo *cfile = filp->private_data;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
+ bool symlink = false;
+ u32 tag = 0;
if (!server->ops->query_file_info)
return -ENOSYS;
xid = get_xid();
- rc = server->ops->query_file_info(xid, tcon, &cfile->fid, &find_data);
+ rc = server->ops->query_file_info(xid, tcon, cfile, &data);
switch (rc) {
case 0:
/* TODO: add support to query reparse tag */
- cifs_all_info_to_fattr(&fattr, &find_data, inode->i_sb, false,
- false, 0 /* no reparse tag */);
+ if (data.symlink_target) {
+ symlink = true;
+ tag = IO_REPARSE_TAG_SYMLINK;
+ }
+ cifs_open_info_to_fattr(&fattr, &data, inode->i_sb, false, symlink, tag);
break;
case -EREMOTE:
cifs_create_dfs_fattr(&fattr, inode->i_sb);
@@ -789,6 +832,7 @@ cifs_get_file_info(struct file *filp)
/* if filetype is different, return error */
rc = cifs_fattr_to_inode(inode, &fattr);
cgfi_exit:
+ cifs_free_open_info(&data);
free_xid(xid);
return rc;
}
@@ -860,14 +904,9 @@ cifs_backup_query_path_info(int xid,
}
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
-static void
-cifs_set_fattr_ino(int xid,
- struct cifs_tcon *tcon,
- struct super_block *sb,
- struct inode **inode,
- const char *full_path,
- FILE_ALL_INFO *data,
- struct cifs_fattr *fattr)
+static void cifs_set_fattr_ino(int xid, struct cifs_tcon *tcon, struct super_block *sb,
+ struct inode **inode, const char *full_path,
+ struct cifs_open_info_data *data, struct cifs_fattr *fattr)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct TCP_Server_Info *server = tcon->ses->server;
@@ -885,11 +924,8 @@ cifs_set_fattr_ino(int xid,
* If we have an inode pass a NULL tcon to ensure we don't
* make a round trip to the server. This only works for SMB2+.
*/
- rc = server->ops->get_srv_inum(xid,
- *inode ? NULL : tcon,
- cifs_sb, full_path,
- &fattr->cf_uniqueid,
- data);
+ rc = server->ops->get_srv_inum(xid, *inode ? NULL : tcon, cifs_sb, full_path,
+ &fattr->cf_uniqueid, data);
if (rc) {
/*
* If that fails reuse existing ino or generate one
@@ -913,7 +949,7 @@ cifs_set_fattr_ino(int xid,
} else {
/* make an ino by hashing the UNC */
fattr->cf_flags |= CIFS_FATTR_FAKE_ROOT_INO;
- fattr->cf_uniqueid = simple_hashstr(tcon->treeName);
+ fattr->cf_uniqueid = simple_hashstr(tcon->tree_name);
}
}
}
@@ -923,14 +959,10 @@ static inline bool is_inode_cache_good(struct inode *ino)
return ino && CIFS_CACHE_READ(CIFS_I(ino)) && CIFS_I(ino)->time != 0;
}
-int
-cifs_get_inode_info(struct inode **inode,
- const char *full_path,
- FILE_ALL_INFO *in_data,
- struct super_block *sb, int xid,
- const struct cifs_fid *fid)
+int cifs_get_inode_info(struct inode **inode, const char *full_path,
+ struct cifs_open_info_data *data, struct super_block *sb, int xid,
+ const struct cifs_fid *fid)
{
-
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
struct tcon_link *tlink;
@@ -938,8 +970,7 @@ cifs_get_inode_info(struct inode **inode,
bool adjust_tz = false;
struct cifs_fattr fattr = {0};
bool is_reparse_point = false;
- FILE_ALL_INFO *data = in_data;
- FILE_ALL_INFO *tmp_data = NULL;
+ struct cifs_open_info_data tmp_data = {};
void *smb1_backup_rsp_buf = NULL;
int rc = 0;
int tmprc = 0;
@@ -960,21 +991,15 @@ cifs_get_inode_info(struct inode **inode,
cifs_dbg(FYI, "No need to revalidate cached inode sizes\n");
goto out;
}
- tmp_data = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
- if (!tmp_data) {
- rc = -ENOMEM;
- goto out;
- }
- rc = server->ops->query_path_info(xid, tcon, cifs_sb,
- full_path, tmp_data,
- &adjust_tz, &is_reparse_point);
+ rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path, &tmp_data,
+ &adjust_tz, &is_reparse_point);
#ifdef CONFIG_CIFS_DFS_UPCALL
if (rc == -ENOENT && is_tcon_dfs(tcon))
rc = cifs_dfs_query_info_nonascii_quirk(xid, tcon,
cifs_sb,
full_path);
#endif
- data = tmp_data;
+ data = &tmp_data;
}
/*
@@ -988,14 +1013,24 @@ cifs_get_inode_info(struct inode **inode,
* since we have to check if its reparse tag matches a known
* special file type e.g. symlink or fifo or char etc.
*/
- if ((le32_to_cpu(data->Attributes) & ATTR_REPARSE) &&
- server->ops->query_reparse_tag) {
- rc = server->ops->query_reparse_tag(xid, tcon, cifs_sb,
- full_path, &reparse_tag);
- cifs_dbg(FYI, "reparse tag 0x%x\n", reparse_tag);
+ if (is_reparse_point && data->symlink_target) {
+ reparse_tag = IO_REPARSE_TAG_SYMLINK;
+ } else if ((le32_to_cpu(data->fi.Attributes) & ATTR_REPARSE) &&
+ server->ops->query_reparse_tag) {
+ tmprc = server->ops->query_reparse_tag(xid, tcon, cifs_sb, full_path,
+ &reparse_tag);
+ if (tmprc)
+ cifs_dbg(FYI, "%s: query_reparse_tag: rc = %d\n", __func__, tmprc);
+ if (server->ops->query_symlink) {
+ tmprc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path,
+ &data->symlink_target,
+ is_reparse_point);
+ if (tmprc)
+ cifs_dbg(FYI, "%s: query_symlink: rc = %d\n", __func__,
+ tmprc);
+ }
}
- cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz,
- is_reparse_point, reparse_tag);
+ cifs_open_info_to_fattr(&fattr, data, sb, adjust_tz, is_reparse_point, reparse_tag);
break;
case -EREMOTE:
/* DFS link, no metadata available on this server */
@@ -1014,18 +1049,20 @@ cifs_get_inode_info(struct inode **inode,
*/
if (backup_cred(cifs_sb) && is_smb1_server(server)) {
/* for easier reading */
+ FILE_ALL_INFO *fi;
FILE_DIRECTORY_INFO *fdi;
SEARCH_ID_FULL_DIR_INFO *si;
rc = cifs_backup_query_path_info(xid, tcon, sb,
full_path,
&smb1_backup_rsp_buf,
- &data);
+ &fi);
if (rc)
goto out;
- fdi = (FILE_DIRECTORY_INFO *)data;
- si = (SEARCH_ID_FULL_DIR_INFO *)data;
+ move_cifs_info_to_smb2(&data->fi, fi);
+ fdi = (FILE_DIRECTORY_INFO *)fi;
+ si = (SEARCH_ID_FULL_DIR_INFO *)fi;
cifs_dir_info_to_fattr(&fattr, fdi, cifs_sb);
fattr.cf_uniqueid = le64_to_cpu(si->UniqueId);
@@ -1123,7 +1160,8 @@ handle_mnt_opt:
out:
cifs_buf_release(smb1_backup_rsp_buf);
cifs_put_tlink(tlink);
- kfree(tmp_data);
+ cifs_free_open_info(&tmp_data);
+ kfree(fattr.cf_symlink_target);
return rc;
}
@@ -1138,7 +1176,8 @@ smb311_posix_get_inode_info(struct inode **inode,
bool adjust_tz = false;
struct cifs_fattr fattr = {0};
bool symlink = false;
- struct smb311_posix_qinfo *data = NULL;
+ struct cifs_open_info_data data = {};
+ struct cifs_sid owner, group;
int rc = 0;
int tmprc = 0;
@@ -1155,15 +1194,10 @@ smb311_posix_get_inode_info(struct inode **inode,
cifs_dbg(FYI, "No need to revalidate cached inode sizes\n");
goto out;
}
- data = kmalloc(sizeof(struct smb311_posix_qinfo), GFP_KERNEL);
- if (!data) {
- rc = -ENOMEM;
- goto out;
- }
- rc = smb311_posix_query_path_info(xid, tcon, cifs_sb,
- full_path, data,
- &adjust_tz, &symlink);
+ rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, full_path, &data,
+ &owner, &group, &adjust_tz,
+ &symlink);
/*
* 2. Convert it to internal cifs metadata (fattr)
@@ -1171,7 +1205,8 @@ smb311_posix_get_inode_info(struct inode **inode,
switch (rc) {
case 0:
- smb311_posix_info_to_fattr(&fattr, data, sb, adjust_tz, symlink);
+ smb311_posix_info_to_fattr(&fattr, &data, &owner, &group,
+ sb, adjust_tz, symlink);
break;
case -EREMOTE:
/* DFS link, no metadata available on this server */
@@ -1228,7 +1263,8 @@ smb311_posix_get_inode_info(struct inode **inode,
}
out:
cifs_put_tlink(tlink);
- kfree(data);
+ cifs_free_open_info(&data);
+ kfree(fattr.cf_symlink_target);
return rc;
}
@@ -2265,13 +2301,13 @@ cifs_dentry_needs_reval(struct dentry *dentry)
return true;
if (!open_cached_dir_by_dentry(tcon, dentry->d_parent, &cfid)) {
- mutex_lock(&cfid->fid_mutex);
+ spin_lock(&cfid->fid_lock);
if (cfid->time && cifs_i->time > cfid->time) {
- mutex_unlock(&cfid->fid_mutex);
+ spin_unlock(&cfid->fid_lock);
close_cached_dir(cfid);
return false;
}
- mutex_unlock(&cfid->fid_mutex);
+ spin_unlock(&cfid->fid_lock);
close_cached_dir(cfid);
}
/*
@@ -2327,7 +2363,7 @@ cifs_invalidate_mapping(struct inode *inode)
static int
cifs_wait_bit_killable(struct wait_bit_key *key, int mode)
{
- freezable_schedule_unsafe();
+ schedule();
if (signal_pending_state(mode, current))
return -ERESTARTSYS;
return 0;
@@ -2345,7 +2381,7 @@ cifs_revalidate_mapping(struct inode *inode)
return 0;
rc = wait_on_bit_lock_action(flags, CIFS_INO_LOCK, cifs_wait_bit_killable,
- TASK_KILLABLE);
+ TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
if (rc)
return rc;
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index b6e6e5d6c8dd..6419ec47c2a8 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -343,7 +343,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
rc = put_user(ExtAttrBits &
FS_FL_USER_VISIBLE,
(int __user *)arg);
- if (rc != EOPNOTSUPP)
+ if (rc != -EOPNOTSUPP)
break;
}
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
@@ -373,7 +373,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
* pSMBFile->fid.netfid,
* extAttrBits,
* &ExtAttrMask);
- * if (rc != EOPNOTSUPP)
+ * if (rc != -EOPNOTSUPP)
* break;
*/
@@ -484,12 +484,35 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
tcon = tlink_tcon(tlink);
if (tcon && tcon->ses->server->ops->notify) {
rc = tcon->ses->server->ops->notify(xid,
- filep, (void __user *)arg);
+ filep, (void __user *)arg,
+ false /* no ret data */);
cifs_dbg(FYI, "ioctl notify rc %d\n", rc);
} else
rc = -EOPNOTSUPP;
cifs_put_tlink(tlink);
break;
+ case CIFS_IOC_NOTIFY_INFO:
+ if (!S_ISDIR(inode->i_mode)) {
+ /* Notify can only be done on directories */
+ rc = -EOPNOTSUPP;
+ break;
+ }
+ cifs_sb = CIFS_SB(inode->i_sb);
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink)) {
+ rc = PTR_ERR(tlink);
+ break;
+ }
+ tcon = tlink_tcon(tlink);
+ if (tcon && tcon->ses->server->ops->notify) {
+ rc = tcon->ses->server->ops->notify(xid,
+ filep, (void __user *)arg,
+ true /* return details */);
+ cifs_dbg(FYI, "ioctl notify info rc %d\n", rc);
+ } else
+ rc = -EOPNOTSUPP;
+ cifs_put_tlink(tlink);
+ break;
case CIFS_IOC_SHUTDOWN:
rc = cifs_shutdown(inode->i_sb, arg);
break;
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 6803cb27eecc..bd374feeccaa 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -38,29 +38,28 @@ static int
symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
{
int rc;
- struct crypto_shash *md5 = NULL;
- struct sdesc *sdescmd5 = NULL;
+ struct shash_desc *md5 = NULL;
- rc = cifs_alloc_hash("md5", &md5, &sdescmd5);
+ rc = cifs_alloc_hash("md5", &md5);
if (rc)
goto symlink_hash_err;
- rc = crypto_shash_init(&sdescmd5->shash);
+ rc = crypto_shash_init(md5);
if (rc) {
cifs_dbg(VFS, "%s: Could not init md5 shash\n", __func__);
goto symlink_hash_err;
}
- rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len);
+ rc = crypto_shash_update(md5, link_str, link_len);
if (rc) {
cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__);
goto symlink_hash_err;
}
- rc = crypto_shash_final(&sdescmd5->shash, md5_hash);
+ rc = crypto_shash_final(md5, md5_hash);
if (rc)
cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
symlink_hash_err:
- cifs_free_hash(&md5, &sdescmd5);
+ cifs_free_hash(&md5);
return rc;
}
@@ -202,40 +201,6 @@ out:
return rc;
}
-static int
-query_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const unsigned char *path,
- char **symlinkinfo)
-{
- int rc;
- u8 *buf = NULL;
- unsigned int link_len = 0;
- unsigned int bytes_read = 0;
-
- buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- if (tcon->ses->server->ops->query_mf_symlink)
- rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon,
- cifs_sb, path, buf, &bytes_read);
- else
- rc = -ENOSYS;
-
- if (rc)
- goto out;
-
- if (bytes_read == 0) { /* not a symlink */
- rc = -EINVAL;
- goto out;
- }
-
- rc = parse_mf_symlink(buf, bytes_read, &link_len, symlinkinfo);
-out:
- kfree(buf);
- return rc;
-}
-
int
check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
@@ -245,6 +210,7 @@ check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
u8 *buf = NULL;
unsigned int link_len = 0;
unsigned int bytes_read = 0;
+ char *symlink = NULL;
if (!couldbe_mf_symlink(fattr))
/* it's not a symlink */
@@ -266,7 +232,7 @@ check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
if (bytes_read == 0) /* not a symlink */
goto out;
- rc = parse_mf_symlink(buf, bytes_read, &link_len, NULL);
+ rc = parse_mf_symlink(buf, bytes_read, &link_len, &symlink);
if (rc == -EINVAL) {
/* it's not a symlink */
rc = 0;
@@ -281,6 +247,7 @@ check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
fattr->cf_mode &= ~S_IFMT;
fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO;
fattr->cf_dtype = DT_LNK;
+ fattr->cf_symlink_target = symlink;
out:
kfree(buf);
return rc;
@@ -600,75 +567,6 @@ cifs_hl_exit:
return rc;
}
-const char *
-cifs_get_link(struct dentry *direntry, struct inode *inode,
- struct delayed_call *done)
-{
- int rc = -ENOMEM;
- unsigned int xid;
- const char *full_path;
- void *page;
- char *target_path = NULL;
- struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
- struct tcon_link *tlink = NULL;
- struct cifs_tcon *tcon;
- struct TCP_Server_Info *server;
-
- if (!direntry)
- return ERR_PTR(-ECHILD);
-
- xid = get_xid();
-
- tlink = cifs_sb_tlink(cifs_sb);
- if (IS_ERR(tlink)) {
- free_xid(xid);
- return ERR_CAST(tlink);
- }
- tcon = tlink_tcon(tlink);
- server = tcon->ses->server;
-
- page = alloc_dentry_path();
- full_path = build_path_from_dentry(direntry, page);
- if (IS_ERR(full_path)) {
- free_xid(xid);
- cifs_put_tlink(tlink);
- free_dentry_path(page);
- return ERR_CAST(full_path);
- }
-
- cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, inode);
-
- rc = -EACCES;
- /*
- * First try Minshall+French Symlinks, if configured
- * and fallback to UNIX Extensions Symlinks.
- */
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
- rc = query_mf_symlink(xid, tcon, cifs_sb, full_path,
- &target_path);
-
- if (rc != 0 && server->ops->query_symlink) {
- struct cifsInodeInfo *cifsi = CIFS_I(inode);
- bool reparse_point = false;
-
- if (cifsi->cifsAttrs & ATTR_REPARSE)
- reparse_point = true;
-
- rc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path,
- &target_path, reparse_point);
- }
-
- free_dentry_path(page);
- free_xid(xid);
- cifs_put_tlink(tlink);
- if (rc != 0) {
- kfree(target_path);
- return ERR_PTR(rc);
- }
- set_delayed_call(done, kfree_link, target_path);
- return target_path;
-}
-
int
cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode,
struct dentry *direntry, const char *symname)
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 87f60f736731..1cbecd64d697 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -117,8 +117,8 @@ tconInfoAlloc(void)
ret_buf = kzalloc(sizeof(*ret_buf), GFP_KERNEL);
if (!ret_buf)
return NULL;
- ret_buf->cfid = init_cached_dir();
- if (!ret_buf->cfid) {
+ ret_buf->cfids = init_cached_dirs();
+ if (!ret_buf->cfids) {
kfree(ret_buf);
return NULL;
}
@@ -144,7 +144,7 @@ tconInfoFree(struct cifs_tcon *tcon)
cifs_dbg(FYI, "Null buffer passed to tconInfoFree\n");
return;
}
- free_cached_dir(tcon);
+ free_cached_dirs(tcon->cfids);
atomic_dec(&tconInfoAllocCount);
kfree(tcon->nativeFileSystem);
kfree_sensitive(tcon->password);
@@ -400,6 +400,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
{
struct smb_hdr *buf = (struct smb_hdr *)buffer;
struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)buf;
+ struct TCP_Server_Info *pserver;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
struct cifsInodeInfo *pCifsInode;
@@ -464,9 +465,12 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE))
return false;
+ /* If server is a channel, select the primary channel */
+ pserver = CIFS_SERVER_IS_CHAN(srv) ? srv->primary_server : srv;
+
/* look up tcon based on tid & uid */
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(ses, &srv->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
if (tcon->tid != buf->Tid)
continue;
@@ -525,7 +529,7 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
cifs_sb->mnt_cifs_serverino_autodisabled = true;
cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s\n",
- tcon ? tcon->treeName : "new server");
+ tcon ? tcon->tree_name : "new server");
cifs_dbg(VFS, "The server doesn't seem to support them properly or the files might be on different servers (DFS)\n");
cifs_dbg(VFS, "Hardlinks will not be recognized on this mount. Consider mounting with the \"noserverino\" option to silence this message.\n");
@@ -824,7 +828,7 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path)
free_dentry_path(page);
}
-/* parses DFS refferal V3 structure
+/* parses DFS referral V3 structure
* caller is responsible for freeing target_nodes
* returns:
* - on success - 0
@@ -1071,59 +1075,58 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw)
/**
* cifs_alloc_hash - allocate hash and hash context together
* @name: The name of the crypto hash algo
- * @shash: Where to put the pointer to the hash algo
- * @sdesc: Where to put the pointer to the hash descriptor
+ * @sdesc: SHASH descriptor where to put the pointer to the hash TFM
*
* The caller has to make sure @sdesc is initialized to either NULL or
- * a valid context. Both can be freed via cifs_free_hash().
+ * a valid context. It can be freed via cifs_free_hash().
*/
int
-cifs_alloc_hash(const char *name,
- struct crypto_shash **shash, struct sdesc **sdesc)
+cifs_alloc_hash(const char *name, struct shash_desc **sdesc)
{
int rc = 0;
- size_t size;
+ struct crypto_shash *alg = NULL;
- if (*sdesc != NULL)
+ if (*sdesc)
return 0;
- *shash = crypto_alloc_shash(name, 0, 0);
- if (IS_ERR(*shash)) {
- cifs_dbg(VFS, "Could not allocate crypto %s\n", name);
- rc = PTR_ERR(*shash);
- *shash = NULL;
+ alg = crypto_alloc_shash(name, 0, 0);
+ if (IS_ERR(alg)) {
+ cifs_dbg(VFS, "Could not allocate shash TFM '%s'\n", name);
+ rc = PTR_ERR(alg);
*sdesc = NULL;
return rc;
}
- size = sizeof(struct shash_desc) + crypto_shash_descsize(*shash);
- *sdesc = kmalloc(size, GFP_KERNEL);
+ *sdesc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(alg), GFP_KERNEL);
if (*sdesc == NULL) {
- cifs_dbg(VFS, "no memory left to allocate crypto %s\n", name);
- crypto_free_shash(*shash);
- *shash = NULL;
+ cifs_dbg(VFS, "no memory left to allocate shash TFM '%s'\n", name);
+ crypto_free_shash(alg);
return -ENOMEM;
}
- (*sdesc)->shash.tfm = *shash;
+ (*sdesc)->tfm = alg;
return 0;
}
/**
* cifs_free_hash - free hash and hash context together
- * @shash: Where to find the pointer to the hash algo
- * @sdesc: Where to find the pointer to the hash descriptor
+ * @sdesc: Where to find the pointer to the hash TFM
*
- * Freeing a NULL hash or context is safe.
+ * Freeing a NULL descriptor is safe.
*/
void
-cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc)
+cifs_free_hash(struct shash_desc **sdesc)
{
- kfree(*sdesc);
+ if (unlikely(!sdesc) || !*sdesc)
+ return;
+
+ if ((*sdesc)->tfm) {
+ crypto_free_shash((*sdesc)->tfm);
+ (*sdesc)->tfm = NULL;
+ }
+
+ kfree_sensitive(*sdesc);
*sdesc = NULL;
- if (*shash)
- crypto_free_shash(*shash);
- *shash = NULL;
}
/**
@@ -1133,8 +1136,8 @@ cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc)
* @len: Where to store the length for this page:
* @offset: Where to store the offset for this page
*/
-void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page,
- unsigned int *len, unsigned int *offset)
+void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page,
+ unsigned int *len, unsigned int *offset)
{
*len = rqst->rq_pagesz;
*offset = (page == 0) ? rqst->rq_offset : 0;
@@ -1328,7 +1331,7 @@ int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid,
char *treename, *dfspath, sep;
int treenamelen, linkpathlen, rc;
- treename = tcon->treeName;
+ treename = tcon->tree_name;
/* MS-DFSC: All paths in REQ_GET_DFS_REFERRAL and RESP_GET_DFS_REFERRAL
* messages MUST be encoded with exactly one leading backslash, not two
* leading backslashes.
diff --git a/fs/cifs/netlink.c b/fs/cifs/netlink.c
index 291cb606f149..147d9409252c 100644
--- a/fs/cifs/netlink.c
+++ b/fs/cifs/netlink.c
@@ -51,6 +51,7 @@ struct genl_family cifs_genl_family = {
.policy = cifs_genl_policy,
.ops = cifs_genl_ops,
.n_ops = ARRAY_SIZE(cifs_genl_ops),
+ .resv_start_op = CIFS_GENL_CMD_SWN_NOTIFY + 1,
.mcgrps = cifs_genl_mcgrps,
.n_mcgrps = ARRAY_SIZE(cifs_genl_mcgrps),
};
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 8e060c00c969..2d75ba5aaa8a 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -844,17 +844,34 @@ static bool emit_cached_dirents(struct cached_dirents *cde,
struct dir_context *ctx)
{
struct cached_dirent *dirent;
- int rc;
+ bool rc;
list_for_each_entry(dirent, &cde->entries, entry) {
- if (ctx->pos >= dirent->pos)
+ /*
+ * Skip all early entries prior to the current lseek()
+ * position.
+ */
+ if (ctx->pos > dirent->pos)
continue;
+ /*
+ * We recorded the current ->pos value for the dirent
+ * when we stored it in the cache.
+ * However, this sequence of ->pos values may have holes
+ * in it, for example dot-dirs returned from the server
+ * are suppressed.
+ * Handle this bu forcing ctx->pos to be the same as the
+ * ->pos of the current dirent we emit from the cache.
+ * This means that when we emit these entries from the cache
+ * we now emit them with the same ->pos value as in the
+ * initial scan.
+ */
ctx->pos = dirent->pos;
rc = dir_emit(ctx, dirent->name, dirent->namelen,
dirent->fattr.cf_uniqueid,
dirent->fattr.cf_dtype);
if (!rc)
return rc;
+ ctx->pos++;
}
return true;
}
@@ -994,6 +1011,8 @@ static int cifs_filldir(char *find_entry, struct file *file,
cifs_unix_basic_to_fattr(&fattr,
&((FILE_UNIX_INFO *)find_entry)->basic,
cifs_sb);
+ if (S_ISLNK(fattr.cf_mode))
+ fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
break;
case SMB_FIND_FILE_INFO_STANDARD:
cifs_std_info_to_fattr(&fattr,
@@ -1202,10 +1221,10 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
ctx->pos, tmp_buf);
cifs_save_resume_key(current_entry, cifsFile);
break;
- } else
- current_entry =
- nxt_dir_entry(current_entry, end_of_smb,
- cifsFile->srch_inf.info_level);
+ }
+ current_entry =
+ nxt_dir_entry(current_entry, end_of_smb,
+ cifsFile->srch_inf.info_level);
}
kfree(tmp_buf);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 3af3b05b6c74..9e7d9f0baa18 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -302,14 +302,14 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
/* now drop the ref to the current iface */
if (old_iface && iface) {
- kref_put(&old_iface->refcount, release_iface);
cifs_dbg(FYI, "replacing iface: %pIS with %pIS\n",
&old_iface->sockaddr,
&iface->sockaddr);
- } else if (old_iface) {
kref_put(&old_iface->refcount, release_iface);
+ } else if (old_iface) {
cifs_dbg(FYI, "releasing ref to iface: %pIS\n",
&old_iface->sockaddr);
+ kref_put(&old_iface->refcount, release_iface);
} else {
WARN_ON(!iface);
cifs_dbg(FYI, "adding new iface: %pIS\n", &iface->sockaddr);
@@ -496,6 +496,7 @@ out:
cifs_put_tcp_session(chan->server, 0);
}
+ free_xid(xid);
return rc;
}
@@ -601,11 +602,6 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
/* BB FIXME add check that strings total less
than 335 or will need to send them as arrays */
- /* unicode strings, must be word aligned before the call */
-/* if ((long) bcc_ptr % 2) {
- *bcc_ptr = 0;
- bcc_ptr++;
- } */
/* copy user */
if (ses->user_name == NULL) {
/* null user mount */
@@ -1213,10 +1209,18 @@ out_free_smb_buf:
static void
sess_free_buffer(struct sess_data *sess_data)
{
+ struct kvec *iov = sess_data->iov;
+
+ /*
+ * Zero the session data before freeing, as it might contain sensitive info (keys, etc).
+ * Note that iov[1] is already freed by caller.
+ */
+ if (sess_data->buf0_type != CIFS_NO_BUFFER && iov[0].iov_base)
+ memzero_explicit(iov[0].iov_base, iov[0].iov_len);
- free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base);
+ free_rsp_buf(sess_data->buf0_type, iov[0].iov_base);
sess_data->buf0_type = CIFS_NO_BUFFER;
- kfree(sess_data->iov[2].iov_base);
+ kfree_sensitive(iov[2].iov_base);
}
static int
@@ -1318,7 +1322,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data)
}
if (ses->capabilities & CAP_UNICODE) {
- if (sess_data->iov[0].iov_len % 2) {
+ if (!IS_ALIGNED(sess_data->iov[0].iov_len, 2)) {
*bcc_ptr = 0;
bcc_ptr++;
}
@@ -1358,7 +1362,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data)
/* no string area to decode, do nothing */
} else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
/* unicode string area must be word-aligned */
- if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
+ if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) {
++bcc_ptr;
--bytes_remaining;
}
@@ -1374,7 +1378,7 @@ out:
sess_data->result = rc;
sess_data->func = NULL;
sess_free_buffer(sess_data);
- kfree(ses->auth_key.response);
+ kfree_sensitive(ses->auth_key.response);
ses->auth_key.response = NULL;
}
@@ -1442,8 +1446,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
if (ses->capabilities & CAP_UNICODE) {
/* unicode strings must be word aligned */
- if ((sess_data->iov[0].iov_len
- + sess_data->iov[1].iov_len) % 2) {
+ if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) {
*bcc_ptr = 0;
bcc_ptr++;
}
@@ -1494,7 +1497,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
/* no string area to decode, do nothing */
} else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
/* unicode string area must be word-aligned */
- if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
+ if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) {
++bcc_ptr;
--bytes_remaining;
}
@@ -1513,7 +1516,7 @@ out:
sess_data->result = rc;
sess_data->func = NULL;
sess_free_buffer(sess_data);
- kfree(ses->auth_key.response);
+ kfree_sensitive(ses->auth_key.response);
ses->auth_key.response = NULL;
}
@@ -1546,7 +1549,7 @@ _sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data)
bcc_ptr = sess_data->iov[2].iov_base;
/* unicode strings must be word aligned */
- if ((sess_data->iov[0].iov_len + sess_data->iov[1].iov_len) % 2) {
+ if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) {
*bcc_ptr = 0;
bcc_ptr++;
}
@@ -1648,7 +1651,7 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data)
rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses);
out_free_ntlmsspblob:
- kfree(ntlmsspblob);
+ kfree_sensitive(ntlmsspblob);
out:
sess_free_buffer(sess_data);
@@ -1658,9 +1661,9 @@ out:
}
/* Else error. Cleanup */
- kfree(ses->auth_key.response);
+ kfree_sensitive(ses->auth_key.response);
ses->auth_key.response = NULL;
- kfree(ses->ntlmssp);
+ kfree_sensitive(ses->ntlmssp);
ses->ntlmssp = NULL;
sess_data->func = NULL;
@@ -1747,7 +1750,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data)
/* no string area to decode, do nothing */
} else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
/* unicode string area must be word-aligned */
- if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
+ if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) {
++bcc_ptr;
--bytes_remaining;
}
@@ -1759,7 +1762,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data)
}
out_free_ntlmsspblob:
- kfree(ntlmsspblob);
+ kfree_sensitive(ntlmsspblob);
out:
sess_free_buffer(sess_data);
@@ -1767,9 +1770,9 @@ out:
rc = sess_establish_session(sess_data);
/* Cleanup */
- kfree(ses->auth_key.response);
+ kfree_sensitive(ses->auth_key.response);
ses->auth_key.response = NULL;
- kfree(ses->ntlmssp);
+ kfree_sensitive(ses->ntlmssp);
ses->ntlmssp = NULL;
sess_data->func = NULL;
@@ -1845,7 +1848,7 @@ int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
rc = sess_data->result;
out:
- kfree(sess_data);
+ kfree_sensitive(sess_data);
return rc;
}
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index f36b2d2d40ca..50480751e521 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -542,31 +542,32 @@ cifs_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
-static int
-cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const char *full_path,
- FILE_ALL_INFO *data, bool *adjustTZ, bool *symlink)
+static int cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ struct cifs_open_info_data *data, bool *adjustTZ, bool *symlink)
{
int rc;
+ FILE_ALL_INFO fi = {};
*symlink = false;
/* could do find first instead but this returns more info */
- rc = CIFSSMBQPathInfo(xid, tcon, full_path, data, 0 /* not legacy */,
- cifs_sb->local_nls, cifs_remap(cifs_sb));
+ rc = CIFSSMBQPathInfo(xid, tcon, full_path, &fi, 0 /* not legacy */, cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
/*
* BB optimize code so we do not make the above call when server claims
* no NT SMB support and the above call failed at least once - set flag
* in tcon or mount.
*/
if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) {
- rc = SMBQueryInformation(xid, tcon, full_path, data,
- cifs_sb->local_nls,
+ rc = SMBQueryInformation(xid, tcon, full_path, &fi, cifs_sb->local_nls,
cifs_remap(cifs_sb));
+ if (!rc)
+ move_cifs_info_to_smb2(&data->fi, &fi);
*adjustTZ = true;
}
- if (!rc && (le32_to_cpu(data->Attributes) & ATTR_REPARSE)) {
+ if (!rc && (le32_to_cpu(fi.Attributes) & ATTR_REPARSE)) {
int tmprc;
int oplock = 0;
struct cifs_fid fid;
@@ -592,10 +593,9 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
-static int
-cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const char *full_path,
- u64 *uniqueid, FILE_ALL_INFO *data)
+static int cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ u64 *uniqueid, struct cifs_open_info_data *unused)
{
/*
* We can not use the IndexNumber field by default from Windows or
@@ -613,11 +613,22 @@ cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon,
cifs_remap(cifs_sb));
}
-static int
-cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_fid *fid, FILE_ALL_INFO *data)
+static int cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifsFileInfo *cfile, struct cifs_open_info_data *data)
{
- return CIFSSMBQFileInfo(xid, tcon, fid->netfid, data);
+ int rc;
+ FILE_ALL_INFO fi = {};
+
+ if (cfile->symlink_target) {
+ data->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL);
+ if (!data->symlink_target)
+ return -ENOMEM;
+ }
+
+ rc = CIFSSMBQFileInfo(xid, tcon, cfile->fid.netfid, &fi);
+ if (!rc)
+ move_cifs_info_to_smb2(&data->fi, &fi);
+ return rc;
}
static void
@@ -702,19 +713,20 @@ cifs_mkdir_setinfo(struct inode *inode, const char *full_path,
cifsInode->cifsAttrs = dosattrs;
}
-static int
-cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
- __u32 *oplock, FILE_ALL_INFO *buf)
+static int cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock,
+ void *buf)
{
+ FILE_ALL_INFO *fi = buf;
+
if (!(oparms->tcon->ses->capabilities & CAP_NT_SMBS))
return SMBLegacyOpen(xid, oparms->tcon, oparms->path,
oparms->disposition,
oparms->desired_access,
oparms->create_options,
- &oparms->fid->netfid, oplock, buf,
+ &oparms->fid->netfid, oplock, fi,
oparms->cifs_sb->local_nls,
cifs_remap(oparms->cifs_sb));
- return CIFS_open(xid, oparms, oplock, buf);
+ return CIFS_open(xid, oparms, oplock, fi);
}
static void
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 9dfd2dd612c2..ffbd9a99fc12 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -20,40 +20,125 @@
#include "cifs_unicode.h"
#include "fscache.h"
#include "smb2proto.h"
+#include "smb2status.h"
-int
-smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
- __u32 *oplock, FILE_ALL_INFO *buf)
+static struct smb2_symlink_err_rsp *symlink_data(const struct kvec *iov)
+{
+ struct smb2_err_rsp *err = iov->iov_base;
+ struct smb2_symlink_err_rsp *sym = ERR_PTR(-EINVAL);
+ u32 len;
+
+ if (err->ErrorContextCount) {
+ struct smb2_error_context_rsp *p, *end;
+
+ len = (u32)err->ErrorContextCount * (offsetof(struct smb2_error_context_rsp,
+ ErrorContextData) +
+ sizeof(struct smb2_symlink_err_rsp));
+ if (le32_to_cpu(err->ByteCount) < len || iov->iov_len < len + sizeof(*err))
+ return ERR_PTR(-EINVAL);
+
+ p = (struct smb2_error_context_rsp *)err->ErrorData;
+ end = (struct smb2_error_context_rsp *)((u8 *)err + iov->iov_len);
+ do {
+ if (le32_to_cpu(p->ErrorId) == SMB2_ERROR_ID_DEFAULT) {
+ sym = (struct smb2_symlink_err_rsp *)&p->ErrorContextData;
+ break;
+ }
+ cifs_dbg(FYI, "%s: skipping unhandled error context: 0x%x\n",
+ __func__, le32_to_cpu(p->ErrorId));
+
+ len = ALIGN(le32_to_cpu(p->ErrorDataLength), 8);
+ p = (struct smb2_error_context_rsp *)((u8 *)&p->ErrorContextData + len);
+ } while (p < end);
+ } else if (le32_to_cpu(err->ByteCount) >= sizeof(*sym) &&
+ iov->iov_len >= SMB2_SYMLINK_STRUCT_SIZE) {
+ sym = (struct smb2_symlink_err_rsp *)err->ErrorData;
+ }
+
+ if (!IS_ERR(sym) && (le32_to_cpu(sym->SymLinkErrorTag) != SYMLINK_ERROR_TAG ||
+ le32_to_cpu(sym->ReparseTag) != IO_REPARSE_TAG_SYMLINK))
+ sym = ERR_PTR(-EINVAL);
+
+ return sym;
+}
+
+int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, const struct kvec *iov, char **path)
+{
+ struct smb2_symlink_err_rsp *sym;
+ unsigned int sub_offs, sub_len;
+ unsigned int print_offs, print_len;
+ char *s;
+
+ if (!cifs_sb || !iov || !iov->iov_base || !iov->iov_len || !path)
+ return -EINVAL;
+
+ sym = symlink_data(iov);
+ if (IS_ERR(sym))
+ return PTR_ERR(sym);
+
+ sub_len = le16_to_cpu(sym->SubstituteNameLength);
+ sub_offs = le16_to_cpu(sym->SubstituteNameOffset);
+ print_len = le16_to_cpu(sym->PrintNameLength);
+ print_offs = le16_to_cpu(sym->PrintNameOffset);
+
+ if (iov->iov_len < SMB2_SYMLINK_STRUCT_SIZE + sub_offs + sub_len ||
+ iov->iov_len < SMB2_SYMLINK_STRUCT_SIZE + print_offs + print_len)
+ return -EINVAL;
+
+ s = cifs_strndup_from_utf16((char *)sym->PathBuffer + sub_offs, sub_len, true,
+ cifs_sb->local_nls);
+ if (!s)
+ return -ENOMEM;
+ convert_delimiter(s, '/');
+ cifs_dbg(FYI, "%s: symlink target: %s\n", __func__, s);
+
+ *path = s;
+ return 0;
+}
+
+int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, void *buf)
{
int rc;
__le16 *smb2_path;
- struct smb2_file_all_info *smb2_data = NULL;
__u8 smb2_oplock;
+ struct cifs_open_info_data *data = buf;
+ struct smb2_file_all_info file_info = {};
+ struct smb2_file_all_info *smb2_data = data ? &file_info : NULL;
+ struct kvec err_iov = {};
+ int err_buftype = CIFS_NO_BUFFER;
struct cifs_fid *fid = oparms->fid;
struct network_resiliency_req nr_ioctl_req;
smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb);
- if (smb2_path == NULL) {
- rc = -ENOMEM;
- goto out;
- }
-
- smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
- GFP_KERNEL);
- if (smb2_data == NULL) {
- rc = -ENOMEM;
- goto out;
- }
+ if (smb2_path == NULL)
+ return -ENOMEM;
oparms->desired_access |= FILE_READ_ATTRIBUTES;
smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH;
- rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL,
- NULL, NULL);
+ rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov,
+ &err_buftype);
+ if (rc && data) {
+ struct smb2_hdr *hdr = err_iov.iov_base;
+
+ if (unlikely(!err_iov.iov_base || err_buftype == CIFS_NO_BUFFER))
+ rc = -ENOMEM;
+ else if (hdr->Status == STATUS_STOPPED_ON_SYMLINK) {
+ rc = smb2_parse_symlink_response(oparms->cifs_sb, &err_iov,
+ &data->symlink_target);
+ if (!rc) {
+ memset(smb2_data, 0, sizeof(*smb2_data));
+ oparms->create_options |= OPEN_REPARSE_POINT;
+ rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data,
+ NULL, NULL, NULL);
+ oparms->create_options &= ~OPEN_REPARSE_POINT;
+ }
+ }
+ }
+
if (rc)
goto out;
-
if (oparms->tcon->use_resilient) {
/* default timeout is 0, servers pick default (120 seconds) */
nr_ioctl_req.Timeout =
@@ -73,7 +158,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
rc = 0;
}
- if (buf) {
+ if (smb2_data) {
/* if open response does not have IndexNumber field - get it */
if (smb2_data->IndexNumber == 0) {
rc = SMB2_get_srv_num(xid, oparms->tcon,
@@ -89,12 +174,12 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
rc = 0;
}
}
- move_smb2_info_to_cifs(buf, smb2_data);
+ memcpy(&data->fi, smb2_data, sizeof(data->fi));
}
*oplock = smb2_oplock;
out:
- kfree(smb2_data);
+ free_rsp_buf(err_buftype, err_iov.iov_base);
kfree(smb2_path);
return rc;
}
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index b83f59051b26..fbd46db1023a 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -24,6 +24,7 @@
#include "smb2pdu.h"
#include "smb2proto.h"
#include "cached_dir.h"
+#include "smb2status.h"
static void
free_set_inf_compound(struct smb_rqst *rqst)
@@ -50,13 +51,16 @@ struct cop_vars {
/*
* note: If cfile is passed, the reference to it is dropped here.
* So make sure that you do not reuse cfile after return from this func.
+ *
+ * If passing @err_iov and @err_buftype, ensure to make them both large enough (>= 3) to hold all
+ * error responses. Caller is also responsible for freeing them up.
*/
-static int
-smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const char *full_path,
- __u32 desired_access, __u32 create_disposition,
- __u32 create_options, umode_t mode, void *ptr, int command,
- struct cifsFileInfo *cfile)
+static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ __u32 desired_access, __u32 create_disposition, __u32 create_options,
+ umode_t mode, void *ptr, int command, struct cifsFileInfo *cfile,
+ __u8 **extbuf, size_t *extbuflen,
+ struct kvec *err_iov, int *err_buftype)
{
struct cop_vars *vars = NULL;
struct kvec *rsp_iov;
@@ -70,6 +74,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
int num_rqst = 0;
int resp_buftype[3];
struct smb2_query_info_rsp *qi_rsp = NULL;
+ struct cifs_open_info_data *idata;
int flags = 0;
__u8 delete_pending[8] = {1, 0, 0, 0, 0, 0, 0, 0};
unsigned int size[2];
@@ -379,20 +384,25 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_open_free(&rqst[0]);
if (rc == -EREMCHG) {
- pr_warn_once("server share %s deleted\n", tcon->treeName);
+ pr_warn_once("server share %s deleted\n", tcon->tree_name);
tcon->need_reconnect = true;
}
switch (command) {
case SMB2_OP_QUERY_INFO:
+ idata = ptr;
+ if (rc == 0 && cfile && cfile->symlink_target) {
+ idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL);
+ if (!idata->symlink_target)
+ rc = -ENOMEM;
+ }
if (rc == 0) {
qi_rsp = (struct smb2_query_info_rsp *)
rsp_iov[1].iov_base;
rc = smb2_validate_and_copy_iov(
le16_to_cpu(qi_rsp->OutputBufferOffset),
le32_to_cpu(qi_rsp->OutputBufferLength),
- &rsp_iov[1], sizeof(struct smb2_file_all_info),
- ptr);
+ &rsp_iov[1], sizeof(idata->fi), (char *)&idata->fi);
}
if (rqst[1].rq_iov)
SMB2_query_info_free(&rqst[1]);
@@ -406,13 +416,35 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
tcon->tid);
break;
case SMB2_OP_POSIX_QUERY_INFO:
+ idata = ptr;
+ if (rc == 0 && cfile && cfile->symlink_target) {
+ idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL);
+ if (!idata->symlink_target)
+ rc = -ENOMEM;
+ }
if (rc == 0) {
qi_rsp = (struct smb2_query_info_rsp *)
rsp_iov[1].iov_base;
rc = smb2_validate_and_copy_iov(
le16_to_cpu(qi_rsp->OutputBufferOffset),
le32_to_cpu(qi_rsp->OutputBufferLength),
- &rsp_iov[1], sizeof(struct smb311_posix_qinfo) /* add SIDs */, ptr);
+ &rsp_iov[1], sizeof(idata->posix_fi) /* add SIDs */,
+ (char *)&idata->posix_fi);
+ }
+ if (rc == 0) {
+ unsigned int length = le32_to_cpu(qi_rsp->OutputBufferLength);
+
+ if (length > sizeof(idata->posix_fi)) {
+ char *base = (char *)rsp_iov[1].iov_base +
+ le16_to_cpu(qi_rsp->OutputBufferOffset) +
+ sizeof(idata->posix_fi);
+ *extbuflen = length - sizeof(idata->posix_fi);
+ *extbuf = kmemdup(base, *extbuflen, GFP_KERNEL);
+ if (!*extbuf)
+ rc = -ENOMEM;
+ } else {
+ rc = -EINVAL;
+ }
}
if (rqst[1].rq_iov)
SMB2_query_info_free(&rqst[1]);
@@ -477,42 +509,33 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
free_set_inf_compound(rqst);
break;
}
- free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
- free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
- free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
+
+ if (rc && err_iov && err_buftype) {
+ memcpy(err_iov, rsp_iov, 3 * sizeof(*err_iov));
+ memcpy(err_buftype, resp_buftype, 3 * sizeof(*err_buftype));
+ } else {
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+ free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
+ }
kfree(vars);
return rc;
}
-void
-move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src)
-{
- memcpy(dst, src, (size_t)(&src->CurrentByteOffset) - (size_t)src);
- dst->CurrentByteOffset = src->CurrentByteOffset;
- dst->Mode = src->Mode;
- dst->AlignmentRequirement = src->AlignmentRequirement;
- dst->IndexNumber1 = 0; /* we don't use it */
-}
-
-int
-smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const char *full_path,
- FILE_ALL_INFO *data, bool *adjust_tz, bool *reparse)
+int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse)
{
int rc;
- struct smb2_file_all_info *smb2_data;
__u32 create_options = 0;
struct cifsFileInfo *cfile;
struct cached_fid *cfid = NULL;
+ struct kvec err_iov[3] = {};
+ int err_buftype[3] = {};
*adjust_tz = false;
*reparse = false;
- smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
- GFP_KERNEL);
- if (smb2_data == NULL)
- return -ENOMEM;
-
if (strcmp(full_path, ""))
rc = -ENOENT;
else
@@ -520,63 +543,65 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
/* If it is a root and its handle is cached then use it */
if (!rc) {
if (cfid->file_all_info_is_valid) {
- move_smb2_info_to_cifs(data,
- &cfid->file_all_info);
+ memcpy(&data->fi, &cfid->file_all_info, sizeof(data->fi));
} else {
- rc = SMB2_query_info(xid, tcon,
- cfid->fid.persistent_fid,
- cfid->fid.volatile_fid, smb2_data);
- if (!rc)
- move_smb2_info_to_cifs(data, smb2_data);
+ rc = SMB2_query_info(xid, tcon, cfid->fid.persistent_fid,
+ cfid->fid.volatile_fid, &data->fi);
}
close_cached_dir(cfid);
- goto out;
+ return rc;
}
cifs_get_readable_path(tcon, full_path, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
- FILE_READ_ATTRIBUTES, FILE_OPEN, create_options,
- ACL_NO_MODE, smb2_data, SMB2_OP_QUERY_INFO, cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN,
+ create_options, ACL_NO_MODE, data, SMB2_OP_QUERY_INFO, cfile,
+ NULL, NULL, err_iov, err_buftype);
if (rc == -EOPNOTSUPP) {
+ if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER &&
+ ((struct smb2_hdr *)err_iov[0].iov_base)->Command == SMB2_CREATE &&
+ ((struct smb2_hdr *)err_iov[0].iov_base)->Status == STATUS_STOPPED_ON_SYMLINK) {
+ rc = smb2_parse_symlink_response(cifs_sb, err_iov, &data->symlink_target);
+ if (rc)
+ goto out;
+ }
*reparse = true;
create_options |= OPEN_REPARSE_POINT;
/* Failed on a symbolic link - query a reparse point info */
cifs_get_readable_path(tcon, full_path, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
- FILE_READ_ATTRIBUTES, FILE_OPEN,
- create_options, ACL_NO_MODE,
- smb2_data, SMB2_OP_QUERY_INFO, cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES,
+ FILE_OPEN, create_options, ACL_NO_MODE, data,
+ SMB2_OP_QUERY_INFO, cfile, NULL, NULL, NULL, NULL);
}
- if (rc)
- goto out;
- move_smb2_info_to_cifs(data, smb2_data);
out:
- kfree(smb2_data);
+ free_rsp_buf(err_buftype[0], err_iov[0].iov_base);
+ free_rsp_buf(err_buftype[1], err_iov[1].iov_base);
+ free_rsp_buf(err_buftype[2], err_iov[2].iov_base);
return rc;
}
-int
-smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const char *full_path,
- struct smb311_posix_qinfo *data, bool *adjust_tz, bool *reparse)
+int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ struct cifs_open_info_data *data,
+ struct cifs_sid *owner,
+ struct cifs_sid *group,
+ bool *adjust_tz, bool *reparse)
{
int rc;
__u32 create_options = 0;
struct cifsFileInfo *cfile;
- struct smb311_posix_qinfo *smb2_data;
+ struct kvec err_iov[3] = {};
+ int err_buftype[3] = {};
+ __u8 *sidsbuf = NULL;
+ __u8 *sidsbuf_end = NULL;
+ size_t sidsbuflen = 0;
+ size_t owner_len, group_len;
*adjust_tz = false;
*reparse = false;
- /* BB TODO: Make struct larger when add support for parsing owner SIDs */
- smb2_data = kzalloc(sizeof(struct smb311_posix_qinfo),
- GFP_KERNEL);
- if (smb2_data == NULL)
- return -ENOMEM;
-
/*
* BB TODO: Add support for using the cached root handle.
* Create SMB2_query_posix_info worker function to do non-compounded query
@@ -585,29 +610,53 @@ smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
*/
cifs_get_readable_path(tcon, full_path, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
- FILE_READ_ATTRIBUTES, FILE_OPEN, create_options,
- ACL_NO_MODE, smb2_data, SMB2_OP_POSIX_QUERY_INFO, cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN,
+ create_options, ACL_NO_MODE, data, SMB2_OP_POSIX_QUERY_INFO, cfile,
+ &sidsbuf, &sidsbuflen, err_iov, err_buftype);
if (rc == -EOPNOTSUPP) {
/* BB TODO: When support for special files added to Samba re-verify this path */
+ if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER &&
+ ((struct smb2_hdr *)err_iov[0].iov_base)->Command == SMB2_CREATE &&
+ ((struct smb2_hdr *)err_iov[0].iov_base)->Status == STATUS_STOPPED_ON_SYMLINK) {
+ rc = smb2_parse_symlink_response(cifs_sb, err_iov, &data->symlink_target);
+ if (rc)
+ goto out;
+ }
*reparse = true;
create_options |= OPEN_REPARSE_POINT;
/* Failed on a symbolic link - query a reparse point info */
cifs_get_readable_path(tcon, full_path, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
- FILE_READ_ATTRIBUTES, FILE_OPEN,
- create_options, ACL_NO_MODE,
- smb2_data, SMB2_OP_POSIX_QUERY_INFO, cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES,
+ FILE_OPEN, create_options, ACL_NO_MODE, data,
+ SMB2_OP_POSIX_QUERY_INFO, cfile,
+ &sidsbuf, &sidsbuflen, NULL, NULL);
}
- if (rc)
- goto out;
- /* TODO: will need to allow for the 2 SIDs when add support for getting owner UID/GID */
- memcpy(data, smb2_data, sizeof(struct smb311_posix_qinfo));
+ if (rc == 0) {
+ sidsbuf_end = sidsbuf + sidsbuflen;
+
+ owner_len = posix_info_sid_size(sidsbuf, sidsbuf_end);
+ if (owner_len == -1) {
+ rc = -EINVAL;
+ goto out;
+ }
+ memcpy(owner, sidsbuf, owner_len);
+
+ group_len = posix_info_sid_size(
+ sidsbuf + owner_len, sidsbuf_end);
+ if (group_len == -1) {
+ rc = -EINVAL;
+ goto out;
+ }
+ memcpy(group, sidsbuf + owner_len, group_len);
+ }
out:
- kfree(smb2_data);
+ kfree(sidsbuf);
+ free_rsp_buf(err_buftype[0], err_iov[0].iov_base);
+ free_rsp_buf(err_buftype[1], err_iov[1].iov_base);
+ free_rsp_buf(err_buftype[2], err_iov[2].iov_base);
return rc;
}
@@ -619,7 +668,7 @@ smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode,
return smb2_compound_op(xid, tcon, cifs_sb, name,
FILE_WRITE_ATTRIBUTES, FILE_CREATE,
CREATE_NOT_FILE, mode, NULL, SMB2_OP_MKDIR,
- NULL);
+ NULL, NULL, NULL, NULL, NULL);
}
void
@@ -641,7 +690,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
tmprc = smb2_compound_op(xid, tcon, cifs_sb, name,
FILE_WRITE_ATTRIBUTES, FILE_CREATE,
CREATE_NOT_FILE, ACL_NO_MODE,
- &data, SMB2_OP_SET_INFO, cfile);
+ &data, SMB2_OP_SET_INFO, cfile, NULL, NULL, NULL, NULL);
if (tmprc == 0)
cifs_i->cifsAttrs = dosattrs;
}
@@ -650,9 +699,10 @@ int
smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb)
{
+ drop_cached_dir_by_name(xid, tcon, name, cifs_sb);
return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
CREATE_NOT_FILE, ACL_NO_MODE,
- NULL, SMB2_OP_RMDIR, NULL);
+ NULL, SMB2_OP_RMDIR, NULL, NULL, NULL, NULL, NULL);
}
int
@@ -661,7 +711,7 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
{
return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
- ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL);
+ ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL, NULL, NULL, NULL, NULL);
}
static int
@@ -680,7 +730,7 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
}
rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access,
FILE_OPEN, 0, ACL_NO_MODE, smb2_to_name,
- command, cfile);
+ command, cfile, NULL, NULL, NULL, NULL);
smb2_rename_path:
kfree(smb2_to_name);
return rc;
@@ -693,6 +743,7 @@ smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon,
{
struct cifsFileInfo *cfile;
+ drop_cached_dir_by_name(xid, tcon, from_name, cifs_sb);
cifs_get_writable_path(tcon, from_name, FIND_WR_WITH_DELETE, &cfile);
return smb2_set_path_attr(xid, tcon, from_name, to_name,
@@ -720,7 +771,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
return smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE,
- &eof, SMB2_OP_SET_EOF, cfile);
+ &eof, SMB2_OP_SET_EOF, cfile, NULL, NULL, NULL, NULL);
}
int
@@ -746,7 +797,8 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_WRITE_ATTRIBUTES, FILE_OPEN,
- 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, cfile);
+ 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, cfile,
+ NULL, NULL, NULL, NULL);
cifs_put_tlink(tlink);
return rc;
}
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index d73e5672aac4..572293c18e16 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -135,6 +135,7 @@ static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len,
int
smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server)
{
+ struct TCP_Server_Info *pserver;
struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
struct smb2_pdu *pdu = (struct smb2_pdu *)shdr;
int hdr_size = sizeof(struct smb2_hdr);
@@ -143,6 +144,9 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server)
__u32 calc_len; /* calculated length */
__u64 mid;
+ /* If server is a channel, select the primary channel */
+ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+
/*
* Add function to do table lookup of StructureSize by command
* ie Validate the wct via smb2_struct_sizes table above
@@ -155,7 +159,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server)
/* decrypt frame now that it is completely read in */
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(iter, &server->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(iter, &pserver->smb_ses_list, smb_ses_list) {
if (iter->Suid == le64_to_cpu(thdr->SessionId)) {
ses = iter;
break;
@@ -248,7 +252,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server)
* Some windows servers (win2016) will pad also the final
* PDU in a compound to 8 bytes.
*/
- if (((calc_len + 7) & ~7) == len)
+ if (ALIGN(calc_len, 8) == len)
return 0;
/*
@@ -608,51 +612,52 @@ smb2_tcon_find_pending_open_lease(struct cifs_tcon *tcon,
}
static bool
-smb2_is_valid_lease_break(char *buffer)
+smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
{
struct smb2_lease_break *rsp = (struct smb2_lease_break *)buffer;
- struct TCP_Server_Info *server;
+ struct TCP_Server_Info *pserver;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
struct cifs_pending_open *open;
cifs_dbg(FYI, "Checking for lease break\n");
+ /* If server is a channel, select the primary channel */
+ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+
/* look up tcon based on tid & uid */
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
- list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
- list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
- spin_lock(&tcon->open_file_lock);
- cifs_stats_inc(
- &tcon->stats.cifs_stats.num_oplock_brks);
- if (smb2_tcon_has_lease(tcon, rsp)) {
- spin_unlock(&tcon->open_file_lock);
- spin_unlock(&cifs_tcp_ses_lock);
- return true;
- }
- open = smb2_tcon_find_pending_open_lease(tcon,
- rsp);
- if (open) {
- __u8 lease_key[SMB2_LEASE_KEY_SIZE];
- struct tcon_link *tlink;
-
- tlink = cifs_get_tlink(open->tlink);
- memcpy(lease_key, open->lease_key,
- SMB2_LEASE_KEY_SIZE);
- spin_unlock(&tcon->open_file_lock);
- spin_unlock(&cifs_tcp_ses_lock);
- smb2_queue_pending_open_break(tlink,
- lease_key,
- rsp->NewLeaseState);
- return true;
- }
+ list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
+ spin_lock(&tcon->open_file_lock);
+ cifs_stats_inc(
+ &tcon->stats.cifs_stats.num_oplock_brks);
+ if (smb2_tcon_has_lease(tcon, rsp)) {
spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
+ return true;
+ }
+ open = smb2_tcon_find_pending_open_lease(tcon,
+ rsp);
+ if (open) {
+ __u8 lease_key[SMB2_LEASE_KEY_SIZE];
+ struct tcon_link *tlink;
+
+ tlink = cifs_get_tlink(open->tlink);
+ memcpy(lease_key, open->lease_key,
+ SMB2_LEASE_KEY_SIZE);
+ spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
+ smb2_queue_pending_open_break(tlink,
+ lease_key,
+ rsp->NewLeaseState);
+ return true;
+ }
+ spin_unlock(&tcon->open_file_lock);
- if (cached_dir_lease_break(tcon, rsp->LeaseKey)) {
- spin_unlock(&cifs_tcp_ses_lock);
- return true;
- }
+ if (cached_dir_lease_break(tcon, rsp->LeaseKey)) {
+ spin_unlock(&cifs_tcp_ses_lock);
+ return true;
}
}
}
@@ -671,6 +676,7 @@ bool
smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
{
struct smb2_oplock_break *rsp = (struct smb2_oplock_break *)buffer;
+ struct TCP_Server_Info *pserver;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
struct cifsInodeInfo *cinode;
@@ -684,16 +690,19 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
if (rsp->StructureSize !=
smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) {
if (le16_to_cpu(rsp->StructureSize) == 44)
- return smb2_is_valid_lease_break(buffer);
+ return smb2_is_valid_lease_break(buffer, server);
else
return false;
}
cifs_dbg(FYI, "oplock level 0x%x\n", rsp->OplockLevel);
+ /* If server is a channel, select the primary channel */
+ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+
/* look up tcon based on tid & uid */
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
spin_lock(&tcon->open_file_lock);
@@ -870,8 +879,8 @@ smb311_update_preauth_hash(struct cifs_ses *ses, struct TCP_Server_Info *server,
struct kvec *iov, int nvec)
{
int i, rc;
- struct sdesc *d;
struct smb2_hdr *hdr;
+ struct shash_desc *sha512 = NULL;
hdr = (struct smb2_hdr *)iov[0].iov_base;
/* neg prot are always taken */
@@ -901,14 +910,14 @@ ok:
if (rc)
return rc;
- d = server->secmech.sdescsha512;
- rc = crypto_shash_init(&d->shash);
+ sha512 = server->secmech.sha512;
+ rc = crypto_shash_init(sha512);
if (rc) {
cifs_dbg(VFS, "%s: Could not init sha512 shash\n", __func__);
return rc;
}
- rc = crypto_shash_update(&d->shash, ses->preauth_sha_hash,
+ rc = crypto_shash_update(sha512, ses->preauth_sha_hash,
SMB2_PREAUTH_HASH_SIZE);
if (rc) {
cifs_dbg(VFS, "%s: Could not update sha512 shash\n", __func__);
@@ -916,8 +925,7 @@ ok:
}
for (i = 0; i < nvec; i++) {
- rc = crypto_shash_update(&d->shash,
- iov[i].iov_base, iov[i].iov_len);
+ rc = crypto_shash_update(sha512, iov[i].iov_base, iov[i].iov_len);
if (rc) {
cifs_dbg(VFS, "%s: Could not update sha512 shash\n",
__func__);
@@ -925,7 +933,7 @@ ok:
}
}
- rc = crypto_shash_final(&d->shash, ses->preauth_sha_hash);
+ rc = crypto_shash_final(sha512, ses->preauth_sha_hash);
if (rc) {
cifs_dbg(VFS, "%s: Could not finalize sha512 shash\n",
__func__);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 421be43af425..d33b00ac37de 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -512,8 +512,7 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
static int
parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
- size_t buf_len,
- struct cifs_ses *ses)
+ size_t buf_len, struct cifs_ses *ses, bool in_mount)
{
struct network_interface_info_ioctl_rsp *p;
struct sockaddr_in *addr4;
@@ -531,6 +530,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
p = buf;
spin_lock(&ses->iface_lock);
+ ses->iface_count = 0;
/*
* Go through iface_list and do kref_put to remove
* any unused ifaces. ifaces in use will be removed
@@ -543,6 +543,21 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
}
spin_unlock(&ses->iface_lock);
+ /*
+ * Samba server e.g. can return an empty interface list in some cases,
+ * which would only be a problem if we were requesting multichannel
+ */
+ if (bytes_left == 0) {
+ /* avoid spamming logs every 10 minutes, so log only in mount */
+ if ((ses->chan_max > 1) && in_mount)
+ cifs_dbg(VFS,
+ "multichannel not available\n"
+ "Empty network interface list returned by server %s\n",
+ ses->server->hostname);
+ rc = -EINVAL;
+ goto out;
+ }
+
while (bytes_left >= sizeof(*p)) {
memset(&tmp_iface, 0, sizeof(tmp_iface));
tmp_iface.speed = le64_to_cpu(p->LinkSpeed);
@@ -637,9 +652,9 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
kref_put(&iface->refcount, release_iface);
} else
list_add_tail(&info->iface_head, &ses->iface_list);
- spin_unlock(&ses->iface_lock);
ses->iface_count++;
+ spin_unlock(&ses->iface_lock);
ses->iface_last_update = jiffies;
next_iface:
nb_iface++;
@@ -673,7 +688,7 @@ out:
}
int
-SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
+SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_mount)
{
int rc;
unsigned int ret_data_len = 0;
@@ -693,7 +708,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
goto out;
}
- rc = parse_server_interfaces(out_buf, ret_data_len, ses);
+ rc = parse_server_interfaces(out_buf, ret_data_len, ses, in_mount);
if (rc)
goto out;
@@ -729,7 +744,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
return;
- SMB3_request_interfaces(xid, tcon);
+ SMB3_request_interfaces(xid, tcon, true /* called during mount */);
SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
FS_ATTRIBUTE_INFORMATION);
@@ -787,7 +802,7 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
rc = open_cached_dir(xid, tcon, full_path, cifs_sb, true, &cfid);
if (!rc) {
- if (cfid->is_valid) {
+ if (cfid->has_lease) {
close_cached_dir(cfid);
return 0;
}
@@ -817,33 +832,25 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
-static int
-smb2_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const char *full_path,
- u64 *uniqueid, FILE_ALL_INFO *data)
+static int smb2_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ u64 *uniqueid, struct cifs_open_info_data *data)
{
- *uniqueid = le64_to_cpu(data->IndexNumber);
+ *uniqueid = le64_to_cpu(data->fi.IndexNumber);
return 0;
}
-static int
-smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_fid *fid, FILE_ALL_INFO *data)
+static int smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifsFileInfo *cfile, struct cifs_open_info_data *data)
{
- int rc;
- struct smb2_file_all_info *smb2_data;
+ struct cifs_fid *fid = &cfile->fid;
- smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
- GFP_KERNEL);
- if (smb2_data == NULL)
- return -ENOMEM;
-
- rc = SMB2_query_info(xid, tcon, fid->persistent_fid, fid->volatile_fid,
- smb2_data);
- if (!rc)
- move_smb2_info_to_cifs(data, smb2_data);
- kfree(smb2_data);
- return rc;
+ if (cfile->symlink_target) {
+ data->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL);
+ if (!data->symlink_target)
+ return -ENOMEM;
+ }
+ return SMB2_query_info(xid, tcon, fid->persistent_fid, fid->volatile_fid, &data->fi);
}
#ifdef CONFIG_CIFS_XATTR
@@ -1109,6 +1116,8 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
COMPOUND_FID, current->tgid,
FILE_FULL_EA_INFORMATION,
SMB2_O_INFO_FILE, 0, data, size);
+ if (rc)
+ goto sea_exit;
smb2_set_next_command(tcon, &rqst[1]);
smb2_set_related(&rqst[1]);
@@ -1119,6 +1128,8 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
rqst[2].rq_nvec = 1;
rc = SMB2_close_init(tcon, server,
&rqst[2], COMPOUND_FID, COMPOUND_FID, false);
+ if (rc)
+ goto sea_exit;
smb2_set_related(&rqst[2]);
rc = compound_send_recv(xid, ses, server,
@@ -1327,7 +1338,7 @@ SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon,
CIFSMaxBufSize, (char **)&res_key, &ret_data_len);
if (rc == -EOPNOTSUPP) {
- pr_warn_once("Server share %s does not support copy range\n", tcon->treeName);
+ pr_warn_once("Server share %s does not support copy range\n", tcon->tree_name);
goto req_res_key_exit;
} else if (rc) {
cifs_tcon_dbg(VFS, "refcpy ioctl error %d getting resume key\n", rc);
@@ -2012,9 +2023,10 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon,
static int
smb3_notify(const unsigned int xid, struct file *pfile,
- void __user *ioc_buf)
+ void __user *ioc_buf, bool return_changes)
{
- struct smb3_notify notify;
+ struct smb3_notify_info notify;
+ struct smb3_notify_info __user *pnotify_buf;
struct dentry *dentry = pfile->f_path.dentry;
struct inode *inode = file_inode(pfile);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
@@ -2022,10 +2034,12 @@ smb3_notify(const unsigned int xid, struct file *pfile,
struct cifs_fid fid;
struct cifs_tcon *tcon;
const unsigned char *path;
+ char *returned_ioctl_info = NULL;
void *page = alloc_dentry_path();
__le16 *utf16_path = NULL;
u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
int rc = 0;
+ __u32 ret_len = 0;
path = build_path_from_dentry(dentry, page);
if (IS_ERR(path)) {
@@ -2039,9 +2053,17 @@ smb3_notify(const unsigned int xid, struct file *pfile,
goto notify_exit;
}
- if (copy_from_user(&notify, ioc_buf, sizeof(struct smb3_notify))) {
- rc = -EFAULT;
- goto notify_exit;
+ if (return_changes) {
+ if (copy_from_user(&notify, ioc_buf, sizeof(struct smb3_notify_info))) {
+ rc = -EFAULT;
+ goto notify_exit;
+ }
+ } else {
+ if (copy_from_user(&notify, ioc_buf, sizeof(struct smb3_notify))) {
+ rc = -EFAULT;
+ goto notify_exit;
+ }
+ notify.data_len = 0;
}
tcon = cifs_sb_master_tcon(cifs_sb);
@@ -2058,12 +2080,22 @@ smb3_notify(const unsigned int xid, struct file *pfile,
goto notify_exit;
rc = SMB2_change_notify(xid, tcon, fid.persistent_fid, fid.volatile_fid,
- notify.watch_tree, notify.completion_filter);
+ notify.watch_tree, notify.completion_filter,
+ notify.data_len, &returned_ioctl_info, &ret_len);
SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
cifs_dbg(FYI, "change notify for path %s rc %d\n", path, rc);
-
+ if (return_changes && (ret_len > 0) && (notify.data_len > 0)) {
+ if (ret_len > notify.data_len)
+ ret_len = notify.data_len;
+ pnotify_buf = (struct smb3_notify_info __user *)ioc_buf;
+ if (copy_to_user(pnotify_buf->notify_data, returned_ioctl_info, ret_len))
+ rc = -EFAULT;
+ else if (copy_to_user(&pnotify_buf->data_len, &ret_len, sizeof(ret_len)))
+ rc = -EFAULT;
+ }
+ kfree(returned_ioctl_info);
notify_exit:
free_dentry_path(page);
kfree(utf16_path);
@@ -2274,14 +2306,18 @@ static void
smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
{
struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
+ struct TCP_Server_Info *pserver;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
if (shdr->Status != STATUS_NETWORK_NAME_DELETED)
return;
+ /* If server is a channel, select the primary channel */
+ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
if (tcon->tid == le32_to_cpu(shdr->Id.SyncId.TreeId)) {
spin_lock(&tcon->tc_lock);
@@ -2289,7 +2325,7 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
spin_unlock(&tcon->tc_lock);
spin_unlock(&cifs_tcp_ses_lock);
pr_warn_once("Server share %s deleted.\n",
- tcon->treeName);
+ tcon->tree_name);
return;
}
}
@@ -2498,7 +2534,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
if (rc == -EREMCHG) {
tcon->need_reconnect = true;
pr_warn_once("server share %s deleted\n",
- tcon->treeName);
+ tcon->tree_name);
}
goto qic_exit;
}
@@ -2814,9 +2850,6 @@ parse_reparse_point(struct reparse_data_buffer *buf,
}
}
-#define SMB2_SYMLINK_STRUCT_SIZE \
- (sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp))
-
static int
smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
@@ -2828,13 +2861,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_open_parms oparms;
struct cifs_fid fid;
struct kvec err_iov = {NULL, 0};
- struct smb2_err_rsp *err_buf = NULL;
- struct smb2_symlink_err_rsp *symlink;
struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses);
- unsigned int sub_len;
- unsigned int sub_offset;
- unsigned int print_len;
- unsigned int print_offset;
int flags = CIFS_CP_CREATE_CLOSE_OP;
struct smb_rqst rqst[3];
int resp_buftype[3];
@@ -2951,47 +2978,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
goto querty_exit;
}
- err_buf = err_iov.iov_base;
- if (le32_to_cpu(err_buf->ByteCount) < sizeof(struct smb2_symlink_err_rsp) ||
- err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE) {
- rc = -EINVAL;
- goto querty_exit;
- }
-
- symlink = (struct smb2_symlink_err_rsp *)err_buf->ErrorData;
- if (le32_to_cpu(symlink->SymLinkErrorTag) != SYMLINK_ERROR_TAG ||
- le32_to_cpu(symlink->ReparseTag) != IO_REPARSE_TAG_SYMLINK) {
- rc = -EINVAL;
- goto querty_exit;
- }
-
- /* open must fail on symlink - reset rc */
- rc = 0;
- sub_len = le16_to_cpu(symlink->SubstituteNameLength);
- sub_offset = le16_to_cpu(symlink->SubstituteNameOffset);
- print_len = le16_to_cpu(symlink->PrintNameLength);
- print_offset = le16_to_cpu(symlink->PrintNameOffset);
-
- if (err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) {
- rc = -EINVAL;
- goto querty_exit;
- }
-
- if (err_iov.iov_len <
- SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) {
- rc = -EINVAL;
- goto querty_exit;
- }
-
- *target_path = cifs_strndup_from_utf16(
- (char *)symlink->PathBuffer + sub_offset,
- sub_len, true, cifs_sb->local_nls);
- if (!(*target_path)) {
- rc = -ENOMEM;
- goto querty_exit;
- }
- convert_delimiter(*target_path, '/');
- cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
+ rc = smb2_parse_symlink_response(cifs_sb, &err_iov, target_path);
querty_exit:
cifs_dbg(FYI, "query symlink rc %d\n", rc);
@@ -4217,89 +4204,104 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len,
memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8);
}
-/* We can not use the normal sg_set_buf() as we will sometimes pass a
- * stack object as buf.
- */
-static inline void smb2_sg_set_buf(struct scatterlist *sg, const void *buf,
- unsigned int buflen)
+static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst *rqst,
+ int num_rqst, const u8 *sig, u8 **iv,
+ struct aead_request **req, struct scatterlist **sgl,
+ unsigned int *num_sgs)
{
- void *addr;
- /*
- * VMAP_STACK (at least) puts stack into the vmalloc address space
- */
- if (is_vmalloc_addr(buf))
- addr = vmalloc_to_page(buf);
- else
- addr = virt_to_page(buf);
- sg_set_page(sg, addr, buflen, offset_in_page(buf));
+ unsigned int req_size = sizeof(**req) + crypto_aead_reqsize(tfm);
+ unsigned int iv_size = crypto_aead_ivsize(tfm);
+ unsigned int len;
+ u8 *p;
+
+ *num_sgs = cifs_get_num_sgs(rqst, num_rqst, sig);
+
+ len = iv_size;
+ len += crypto_aead_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1);
+ len = ALIGN(len, crypto_tfm_ctx_alignment());
+ len += req_size;
+ len = ALIGN(len, __alignof__(struct scatterlist));
+ len += *num_sgs * sizeof(**sgl);
+
+ p = kmalloc(len, GFP_ATOMIC);
+ if (!p)
+ return NULL;
+
+ *iv = (u8 *)PTR_ALIGN(p, crypto_aead_alignmask(tfm) + 1);
+ *req = (struct aead_request *)PTR_ALIGN(*iv + iv_size,
+ crypto_tfm_ctx_alignment());
+ *sgl = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size,
+ __alignof__(struct scatterlist));
+ return p;
}
-/* Assumes the first rqst has a transform header as the first iov.
- * I.e.
- * rqst[0].rq_iov[0] is transform header
- * rqst[0].rq_iov[1+] data to be encrypted/decrypted
- * rqst[1+].rq_iov[0+] data to be encrypted/decrypted
- */
-static struct scatterlist *
-init_sg(int num_rqst, struct smb_rqst *rqst, u8 *sign)
+static void *smb2_get_aead_req(struct crypto_aead *tfm, const struct smb_rqst *rqst,
+ int num_rqst, const u8 *sig, u8 **iv,
+ struct aead_request **req, struct scatterlist **sgl)
{
- unsigned int sg_len;
+ unsigned int off, len, skip;
struct scatterlist *sg;
- unsigned int i;
- unsigned int j;
- unsigned int idx = 0;
- int skip;
-
- sg_len = 1;
- for (i = 0; i < num_rqst; i++)
- sg_len += rqst[i].rq_nvec + rqst[i].rq_npages;
+ unsigned int num_sgs;
+ unsigned long addr;
+ int i, j;
+ void *p;
- sg = kmalloc_array(sg_len, sizeof(struct scatterlist), GFP_KERNEL);
- if (!sg)
+ p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, sgl, &num_sgs);
+ if (!p)
return NULL;
- sg_init_table(sg, sg_len);
+ sg_init_table(*sgl, num_sgs);
+ sg = *sgl;
+
+ /* Assumes the first rqst has a transform header as the first iov.
+ * I.e.
+ * rqst[0].rq_iov[0] is transform header
+ * rqst[0].rq_iov[1+] data to be encrypted/decrypted
+ * rqst[1+].rq_iov[0+] data to be encrypted/decrypted
+ */
for (i = 0; i < num_rqst; i++) {
+ /*
+ * The first rqst has a transform header where the
+ * first 20 bytes are not part of the encrypted blob.
+ */
for (j = 0; j < rqst[i].rq_nvec; j++) {
- /*
- * The first rqst has a transform header where the
- * first 20 bytes are not part of the encrypted blob
- */
- skip = (i == 0) && (j == 0) ? 20 : 0;
- smb2_sg_set_buf(&sg[idx++],
- rqst[i].rq_iov[j].iov_base + skip,
- rqst[i].rq_iov[j].iov_len - skip);
- }
+ struct kvec *iov = &rqst[i].rq_iov[j];
+ skip = (i == 0) && (j == 0) ? 20 : 0;
+ addr = (unsigned long)iov->iov_base + skip;
+ len = iov->iov_len - skip;
+ sg = cifs_sg_set_buf(sg, (void *)addr, len);
+ }
for (j = 0; j < rqst[i].rq_npages; j++) {
- unsigned int len, offset;
-
- rqst_page_get_length(&rqst[i], j, &len, &offset);
- sg_set_page(&sg[idx++], rqst[i].rq_pages[j], len, offset);
+ rqst_page_get_length(&rqst[i], j, &len, &off);
+ sg_set_page(sg++, rqst[i].rq_pages[j], len, off);
}
}
- smb2_sg_set_buf(&sg[idx], sign, SMB2_SIGNATURE_SIZE);
- return sg;
+ cifs_sg_set_buf(sg, sig, SMB2_SIGNATURE_SIZE);
+
+ return p;
}
static int
smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key)
{
+ struct TCP_Server_Info *pserver;
struct cifs_ses *ses;
u8 *ses_enc_key;
+ /* If server is a channel, select the primary channel */
+ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
- list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
- if (ses->Suid == ses_id) {
- spin_lock(&ses->ses_lock);
- ses_enc_key = enc ? ses->smb3encryptionkey :
- ses->smb3decryptionkey;
- memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE);
- spin_unlock(&ses->ses_lock);
- spin_unlock(&cifs_tcp_ses_lock);
- return 0;
- }
+ list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+ if (ses->Suid == ses_id) {
+ spin_lock(&ses->ses_lock);
+ ses_enc_key = enc ? ses->smb3encryptionkey :
+ ses->smb3decryptionkey;
+ memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE);
+ spin_unlock(&ses->ses_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
+ return 0;
}
}
spin_unlock(&cifs_tcp_ses_lock);
@@ -4325,11 +4327,11 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
u8 sign[SMB2_SIGNATURE_SIZE] = {};
u8 key[SMB3_ENC_DEC_KEY_SIZE];
struct aead_request *req;
- char *iv;
- unsigned int iv_len;
+ u8 *iv;
DECLARE_CRYPTO_WAIT(wait);
struct crypto_aead *tfm;
unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
+ void *creq;
rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key);
if (rc) {
@@ -4344,8 +4346,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
return rc;
}
- tfm = enc ? server->secmech.ccmaesencrypt :
- server->secmech.ccmaesdecrypt;
+ tfm = enc ? server->secmech.enc : server->secmech.dec;
if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) ||
(server->cipher_type == SMB2_ENCRYPTION_AES256_GCM))
@@ -4364,32 +4365,15 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
return rc;
}
- req = aead_request_alloc(tfm, GFP_KERNEL);
- if (!req) {
- cifs_server_dbg(VFS, "%s: Failed to alloc aead request\n", __func__);
+ creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg);
+ if (unlikely(!creq))
return -ENOMEM;
- }
if (!enc) {
memcpy(sign, &tr_hdr->Signature, SMB2_SIGNATURE_SIZE);
crypt_len += SMB2_SIGNATURE_SIZE;
}
- sg = init_sg(num_rqst, rqst, sign);
- if (!sg) {
- cifs_server_dbg(VFS, "%s: Failed to init sg\n", __func__);
- rc = -ENOMEM;
- goto free_req;
- }
-
- iv_len = crypto_aead_ivsize(tfm);
- iv = kzalloc(iv_len, GFP_KERNEL);
- if (!iv) {
- cifs_server_dbg(VFS, "%s: Failed to alloc iv\n", __func__);
- rc = -ENOMEM;
- goto free_sg;
- }
-
if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) ||
(server->cipher_type == SMB2_ENCRYPTION_AES256_GCM))
memcpy(iv, (char *)tr_hdr->Nonce, SMB3_AES_GCM_NONCE);
@@ -4398,6 +4382,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES_CCM_NONCE);
}
+ aead_request_set_tfm(req, tfm);
aead_request_set_crypt(req, sg, sg, crypt_len, iv);
aead_request_set_ad(req, assoc_data_len);
@@ -4410,11 +4395,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
if (!rc && enc)
memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE);
- kfree(iv);
-free_sg:
- kfree(sg);
-free_req:
- kfree(req);
+ kfree_sensitive(creq);
return rc;
}
@@ -4457,21 +4438,27 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst,
int rc = -ENOMEM;
for (i = 1; i < num_rqst; i++) {
- npages = old_rq[i - 1].rq_npages;
+ struct smb_rqst *old = &old_rq[i - 1];
+ struct smb_rqst *new = &new_rq[i];
+
+ orig_len += smb_rqst_len(server, old);
+ new->rq_iov = old->rq_iov;
+ new->rq_nvec = old->rq_nvec;
+
+ npages = old->rq_npages;
+ if (!npages)
+ continue;
+
pages = kmalloc_array(npages, sizeof(struct page *),
GFP_KERNEL);
if (!pages)
goto err_free;
- new_rq[i].rq_pages = pages;
- new_rq[i].rq_npages = npages;
- new_rq[i].rq_offset = old_rq[i - 1].rq_offset;
- new_rq[i].rq_pagesz = old_rq[i - 1].rq_pagesz;
- new_rq[i].rq_tailsz = old_rq[i - 1].rq_tailsz;
- new_rq[i].rq_iov = old_rq[i - 1].rq_iov;
- new_rq[i].rq_nvec = old_rq[i - 1].rq_nvec;
-
- orig_len += smb_rqst_len(server, &old_rq[i - 1]);
+ new->rq_pages = pages;
+ new->rq_npages = npages;
+ new->rq_offset = old->rq_offset;
+ new->rq_pagesz = old->rq_pagesz;
+ new->rq_tailsz = old->rq_tailsz;
for (j = 0; j < npages; j++) {
pages[j] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
@@ -4484,14 +4471,14 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst,
char *dst, *src;
unsigned int offset, len;
- rqst_page_get_length(&new_rq[i], j, &len, &offset);
+ rqst_page_get_length(new, j, &len, &offset);
- dst = (char *) kmap(new_rq[i].rq_pages[j]) + offset;
- src = (char *) kmap(old_rq[i - 1].rq_pages[j]) + offset;
+ dst = kmap_local_page(new->rq_pages[j]) + offset;
+ src = kmap_local_page(old->rq_pages[j]) + offset;
memcpy(dst, src, len);
- kunmap(new_rq[i].rq_pages[j]);
- kunmap(old_rq[i - 1].rq_pages[j]);
+ kunmap(new->rq_pages[j]);
+ kunmap(old->rq_pages[j]);
}
}
@@ -4735,13 +4722,13 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
return 0;
}
- iov_iter_bvec(&iter, WRITE, bvec, npages, data_len);
+ iov_iter_bvec(&iter, ITER_SOURCE, bvec, npages, data_len);
} else if (buf_len >= data_offset + data_len) {
/* read response payload is in buf */
WARN_ONCE(npages > 0, "read data can be either in buf or in pages");
iov.iov_base = buf + data_offset;
iov.iov_len = data_len;
- iov_iter_kvec(&iter, WRITE, &iov, 1, data_len);
+ iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, data_len);
} else {
/* read response payload cannot be in both buf and pages */
WARN_ONCE(1, "buf can not contain only a part of read data");
@@ -5102,7 +5089,7 @@ smb2_make_node(unsigned int xid, struct inode *inode,
{
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
int rc = -EPERM;
- FILE_ALL_INFO *buf = NULL;
+ struct cifs_open_info_data buf = {};
struct cifs_io_parms io_parms = {0};
__u32 oplock = 0;
struct cifs_fid fid;
@@ -5118,7 +5105,7 @@ smb2_make_node(unsigned int xid, struct inode *inode,
* and was used by default in earlier versions of Windows
*/
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
- goto out;
+ return rc;
/*
* TODO: Add ability to create instead via reparse point. Windows (e.g.
@@ -5127,16 +5114,10 @@ smb2_make_node(unsigned int xid, struct inode *inode,
*/
if (!S_ISCHR(mode) && !S_ISBLK(mode))
- goto out;
+ return rc;
cifs_dbg(FYI, "sfu compat create special file\n");
- buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
- if (buf == NULL) {
- rc = -ENOMEM;
- goto out;
- }
-
oparms.tcon = tcon;
oparms.cifs_sb = cifs_sb;
oparms.desired_access = GENERIC_WRITE;
@@ -5151,21 +5132,21 @@ smb2_make_node(unsigned int xid, struct inode *inode,
oplock = REQ_OPLOCK;
else
oplock = 0;
- rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf);
+ rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &buf);
if (rc)
- goto out;
+ return rc;
/*
* BB Do not bother to decode buf since no local inode yet to put
* timestamps in, but we can reuse it safely.
*/
- pdev = (struct win_dev *)buf;
+ pdev = (struct win_dev *)&buf.fi;
io_parms.pid = current->tgid;
io_parms.tcon = tcon;
io_parms.offset = 0;
io_parms.length = sizeof(struct win_dev);
- iov[1].iov_base = buf;
+ iov[1].iov_base = &buf.fi;
iov[1].iov_len = sizeof(struct win_dev);
if (S_ISCHR(mode)) {
memcpy(pdev->type, "IntxCHR", 8);
@@ -5184,8 +5165,8 @@ smb2_make_node(unsigned int xid, struct inode *inode,
d_drop(dentry);
/* FIXME: add code here to set EAs */
-out:
- kfree(buf);
+
+ cifs_free_open_info(&buf);
return rc;
}
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 6352ab32c7e7..a5695748a89b 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -466,15 +466,14 @@ build_signing_ctxt(struct smb2_signing_capabilities *pneg_ctxt)
/*
* Context Data length must be rounded to multiple of 8 for some servers
*/
- pneg_ctxt->DataLength = cpu_to_le16(DIV_ROUND_UP(
- sizeof(struct smb2_signing_capabilities) -
- sizeof(struct smb2_neg_context) +
- (num_algs * 2 /* sizeof u16 */), 8) * 8);
+ pneg_ctxt->DataLength = cpu_to_le16(ALIGN(sizeof(struct smb2_signing_capabilities) -
+ sizeof(struct smb2_neg_context) +
+ (num_algs * sizeof(u16)), 8));
pneg_ctxt->SigningAlgorithmCount = cpu_to_le16(num_algs);
pneg_ctxt->SigningAlgorithms[0] = cpu_to_le16(SIGNING_ALG_AES_CMAC);
- ctxt_len += 2 /* sizeof le16 */ * num_algs;
- ctxt_len = DIV_ROUND_UP(ctxt_len, 8) * 8;
+ ctxt_len += sizeof(__le16) * num_algs;
+ ctxt_len = ALIGN(ctxt_len, 8);
return ctxt_len;
/* TBD add SIGNING_ALG_AES_GMAC and/or SIGNING_ALG_HMAC_SHA256 */
}
@@ -511,8 +510,7 @@ build_netname_ctxt(struct smb2_netname_neg_context *pneg_ctxt, char *hostname)
/* copy up to max of first 100 bytes of server name to NetName field */
pneg_ctxt->DataLength = cpu_to_le16(2 * cifs_strtoUTF16(pneg_ctxt->NetName, hostname, 100, cp));
/* context size is DataLength + minimal smb2_neg_context */
- return DIV_ROUND_UP(le16_to_cpu(pneg_ctxt->DataLength) +
- sizeof(struct smb2_neg_context), 8) * 8;
+ return ALIGN(le16_to_cpu(pneg_ctxt->DataLength) + sizeof(struct smb2_neg_context), 8);
}
static void
@@ -557,18 +555,18 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
* round up total_len of fixed part of SMB3 negotiate request to 8
* byte boundary before adding negotiate contexts
*/
- *total_len = roundup(*total_len, 8);
+ *total_len = ALIGN(*total_len, 8);
pneg_ctxt = (*total_len) + (char *)req;
req->NegotiateContextOffset = cpu_to_le32(*total_len);
build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt);
- ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_preauth_neg_context), 8) * 8;
+ ctxt_len = ALIGN(sizeof(struct smb2_preauth_neg_context), 8);
*total_len += ctxt_len;
pneg_ctxt += ctxt_len;
build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt);
- ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_encryption_neg_context), 8) * 8;
+ ctxt_len = ALIGN(sizeof(struct smb2_encryption_neg_context), 8);
*total_len += ctxt_len;
pneg_ctxt += ctxt_len;
@@ -595,9 +593,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
if (server->compress_algorithm) {
build_compression_ctxt((struct smb2_compression_capabilities_context *)
pneg_ctxt);
- ctxt_len = DIV_ROUND_UP(
- sizeof(struct smb2_compression_capabilities_context),
- 8) * 8;
+ ctxt_len = ALIGN(sizeof(struct smb2_compression_capabilities_context), 8);
*total_len += ctxt_len;
pneg_ctxt += ctxt_len;
neg_context_count++;
@@ -780,7 +776,7 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp,
if (rc)
break;
/* offsets must be 8 byte aligned */
- clen = (clen + 7) & ~0x7;
+ clen = ALIGN(clen, 8);
offset += clen + sizeof(struct smb2_neg_context);
len_of_ctxts -= clen;
}
@@ -873,7 +869,7 @@ SMB2_negotiate(const unsigned int xid,
struct smb2_negotiate_rsp *rsp;
struct kvec iov[1];
struct kvec rsp_iov;
- int rc = 0;
+ int rc;
int resp_buftype;
int blob_offset, blob_length;
char *security_blob;
@@ -1169,9 +1165,9 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
pneg_inbuf->Dialects[0] =
cpu_to_le16(server->vals->protocol_id);
pneg_inbuf->DialectCount = cpu_to_le16(1);
- /* structure is big enough for 3 dialects, sending only 1 */
+ /* structure is big enough for 4 dialects, sending only 1 */
inbuflen = sizeof(*pneg_inbuf) -
- sizeof(pneg_inbuf->Dialects[0]) * 2;
+ sizeof(pneg_inbuf->Dialects[0]) * 3;
}
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
@@ -1345,7 +1341,13 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data)
static void
SMB2_sess_free_buffer(struct SMB2_sess_data *sess_data)
{
- free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base);
+ struct kvec *iov = sess_data->iov;
+
+ /* iov[1] is already freed by caller */
+ if (sess_data->buf0_type != CIFS_NO_BUFFER && iov[0].iov_base)
+ memzero_explicit(iov[0].iov_base, iov[0].iov_len);
+
+ free_rsp_buf(sess_data->buf0_type, iov[0].iov_base);
sess_data->buf0_type = CIFS_NO_BUFFER;
}
@@ -1477,6 +1479,8 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
out_put_spnego_key:
key_invalidate(spnego_key);
key_put(spnego_key);
+ if (rc)
+ kfree_sensitive(ses->auth_key.response);
out:
sess_data->result = rc;
sess_data->func = NULL;
@@ -1526,7 +1530,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
&blob_length, ses, server,
sess_data->nls_cp);
if (rc)
- goto out_err;
+ goto out;
if (use_spnego) {
/* BB eventually need to add this */
@@ -1573,7 +1577,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
}
out:
- kfree(ntlmssp_blob);
+ kfree_sensitive(ntlmssp_blob);
SMB2_sess_free_buffer(sess_data);
if (!rc) {
sess_data->result = 0;
@@ -1581,7 +1585,7 @@ out:
return;
}
out_err:
- kfree(ses->ntlmssp);
+ kfree_sensitive(ses->ntlmssp);
ses->ntlmssp = NULL;
sess_data->result = rc;
sess_data->func = NULL;
@@ -1657,9 +1661,9 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data)
}
#endif
out:
- kfree(ntlmssp_blob);
+ kfree_sensitive(ntlmssp_blob);
SMB2_sess_free_buffer(sess_data);
- kfree(ses->ntlmssp);
+ kfree_sensitive(ses->ntlmssp);
ses->ntlmssp = NULL;
sess_data->result = rc;
sess_data->func = NULL;
@@ -1737,7 +1741,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
cifs_server_dbg(VFS, "signing requested but authenticated as guest\n");
rc = sess_data->result;
out:
- kfree(sess_data);
+ kfree_sensitive(sess_data);
return rc;
}
@@ -1930,7 +1934,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */
tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess);
tcon->tid = le32_to_cpu(rsp->hdr.Id.SyncId.TreeId);
- strscpy(tcon->treeName, tree, sizeof(tcon->treeName));
+ strscpy(tcon->tree_name, tree, sizeof(tcon->tree_name));
if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) &&
((tcon->share_flags & SHI1005_FLAGS_DFS) == 0))
@@ -1973,6 +1977,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
if (!ses || !(ses->server))
return -EIO;
+ trace_smb3_tdis_enter(xid, tcon->tid, ses->Suid, tcon->tree_name);
spin_lock(&ses->chan_lock);
if ((tcon->need_reconnect) ||
(CIFS_ALL_CHANS_NEED_RECONNECT(tcon->ses))) {
@@ -2004,8 +2009,11 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
rc = cifs_send_recv(xid, ses, ses->server,
&rqst, &resp_buf_type, flags, &rsp_iov);
cifs_small_buf_release(req);
- if (rc)
+ if (rc) {
cifs_stats_fail_inc(tcon, SMB2_TREE_DISCONNECT_HE);
+ trace_smb3_tdis_err(xid, tcon->tid, ses->Suid, rc);
+ }
+ trace_smb3_tdis_done(xid, tcon->tid, ses->Suid);
return rc;
}
@@ -2411,9 +2419,9 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
unsigned int acelen, acl_size, ace_count;
unsigned int owner_offset = 0;
unsigned int group_offset = 0;
- struct smb3_acl acl;
+ struct smb3_acl acl = {};
- *len = roundup(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8);
+ *len = round_up(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8);
if (set_owner) {
/* sizeof(struct owner_group_sids) is already multiple of 8 so no need to round */
@@ -2484,10 +2492,11 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
acl.AclRevision = ACL_REVISION; /* See 2.4.4.1 of MS-DTYP */
acl.AclSize = cpu_to_le16(acl_size);
acl.AceCount = cpu_to_le16(ace_count);
+ /* acl.Sbz1 and Sbz2 MBZ so are not set here, but initialized above */
memcpy(aclptr, &acl, sizeof(struct smb3_acl));
buf->ccontext.DataLength = cpu_to_le32(ptr - (__u8 *)&buf->sd);
- *len = roundup(ptr - (__u8 *)buf, 8);
+ *len = round_up((unsigned int)(ptr - (__u8 *)buf), 8);
return buf;
}
@@ -2581,7 +2590,7 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len,
* final path needs to be 8-byte aligned as specified in
* MS-SMB2 2.2.13 SMB2 CREATE Request.
*/
- *out_size = roundup(*out_len * sizeof(__le16), 8);
+ *out_size = round_up(*out_len * sizeof(__le16), 8);
*out_path = kzalloc(*out_size + sizeof(__le16) /* null */, GFP_KERNEL);
if (!*out_path)
return -ENOMEM;
@@ -2674,7 +2683,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS;
rc = alloc_path_with_tree_prefix(&copy_path, &copy_size,
&name_len,
- tcon->treeName, utf16_path);
+ tcon->tree_name, utf16_path);
if (rc)
goto err_free_req;
@@ -2816,7 +2825,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS;
rc = alloc_path_with_tree_prefix(&copy_path, &copy_size,
&name_len,
- tcon->treeName, path);
+ tcon->tree_name, path);
if (rc)
return rc;
req->NameLength = cpu_to_le16(name_len * 2);
@@ -2826,9 +2835,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2;
/* MUST set path len (NameLength) to 0 opening root of share */
req->NameLength = cpu_to_le16(uni_path_len - 2);
- copy_size = uni_path_len;
- if (copy_size % 8 != 0)
- copy_size = roundup(copy_size, 8);
+ copy_size = round_up(uni_path_len, 8);
copy_path = kzalloc(copy_size, GFP_KERNEL);
if (!copy_path)
return -ENOMEM;
@@ -3011,7 +3018,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
oparms->create_options, oparms->desired_access, rc);
if (rc == -EREMCHG) {
pr_warn_once("server share %s deleted\n",
- tcon->treeName);
+ tcon->tree_name);
tcon->need_reconnect = true;
}
goto creat_exit;
@@ -3472,7 +3479,7 @@ smb2_validate_and_copy_iov(unsigned int offset, unsigned int buffer_length,
if (rc)
return rc;
- memcpy(data, begin_of_buf, buffer_length);
+ memcpy(data, begin_of_buf, minbufsize);
return 0;
}
@@ -3596,7 +3603,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
rc = smb2_validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset),
le32_to_cpu(rsp->OutputBufferLength),
- &rsp_iov, min_len, *data);
+ &rsp_iov, dlen ? *dlen : min_len, *data);
if (rc && allocated) {
kfree(*data);
*data = NULL;
@@ -3702,11 +3709,13 @@ SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst,
int
SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, bool watch_tree,
- u32 completion_filter)
+ u32 completion_filter, u32 max_out_data_len, char **out_data,
+ u32 *plen /* returned data len */)
{
struct cifs_ses *ses = tcon->ses;
struct TCP_Server_Info *server = cifs_pick_channel(ses);
struct smb_rqst rqst;
+ struct smb2_change_notify_rsp *smb_rsp;
struct kvec iov[1];
struct kvec rsp_iov = {NULL, 0};
int resp_buftype = CIFS_NO_BUFFER;
@@ -3722,6 +3731,9 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
memset(&rqst, 0, sizeof(struct smb_rqst));
memset(&iov, 0, sizeof(iov));
+ if (plen)
+ *plen = 0;
+
rqst.rq_iov = iov;
rqst.rq_nvec = 1;
@@ -3740,9 +3752,28 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
cifs_stats_fail_inc(tcon, SMB2_CHANGE_NOTIFY_HE);
trace_smb3_notify_err(xid, persistent_fid, tcon->tid, ses->Suid,
(u8)watch_tree, completion_filter, rc);
- } else
+ } else {
trace_smb3_notify_done(xid, persistent_fid, tcon->tid,
- ses->Suid, (u8)watch_tree, completion_filter);
+ ses->Suid, (u8)watch_tree, completion_filter);
+ /* validate that notify information is plausible */
+ if ((rsp_iov.iov_base == NULL) ||
+ (rsp_iov.iov_len < sizeof(struct smb2_change_notify_rsp)))
+ goto cnotify_exit;
+
+ smb_rsp = (struct smb2_change_notify_rsp *)rsp_iov.iov_base;
+
+ smb2_validate_iov(le16_to_cpu(smb_rsp->OutputBufferOffset),
+ le32_to_cpu(smb_rsp->OutputBufferLength), &rsp_iov,
+ sizeof(struct file_notify_information));
+
+ *out_data = kmemdup((char *)smb_rsp + le16_to_cpu(smb_rsp->OutputBufferOffset),
+ le32_to_cpu(smb_rsp->OutputBufferLength), GFP_KERNEL);
+ if (*out_data == NULL) {
+ rc = -ENOMEM;
+ goto cnotify_exit;
+ } else
+ *plen = le32_to_cpu(smb_rsp->OutputBufferLength);
+ }
cnotify_exit:
if (rqst.rq_iov)
@@ -4090,7 +4121,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
if (request_type & CHAINED_REQUEST) {
if (!(request_type & END_OF_CHAIN)) {
/* next 8-byte aligned request */
- *total_len = DIV_ROUND_UP(*total_len, 8) * 8;
+ *total_len = ALIGN(*total_len, 8);
shdr->NextCommand = cpu_to_le32(*total_len);
} else /* END_OF_CHAIN */
shdr->NextCommand = 0;
@@ -4429,7 +4460,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
wdata->bytes, wdata->result);
if (wdata->result == -ENOSPC)
pr_warn_once("Out of space writing to %s\n",
- tcon->treeName);
+ tcon->tree_name);
} else
trace_smb3_write_done(0 /* no xid */,
wdata->cfile->fid.persistent_fid,
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index f57881b8464f..1237bb86e93a 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -56,6 +56,9 @@ struct smb2_rdma_crypto_transform {
#define COMPOUND_FID 0xFFFFFFFFFFFFFFFFULL
+#define SMB2_SYMLINK_STRUCT_SIZE \
+ (sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp))
+
#define SYMLINK_ERROR_TAG 0x4c4d5953
struct smb2_symlink_err_rsp {
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 3f740f24b96a..d5d7ffb7711c 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -53,16 +53,12 @@ extern bool smb2_is_valid_oplock_break(char *buffer,
struct TCP_Server_Info *srv);
extern int smb3_handle_read_data(struct TCP_Server_Info *server,
struct mid_q_entry *mid);
-
-extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst,
- struct smb2_file_all_info *src);
extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *path,
__u32 *reparse_tag);
-extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb,
- const char *full_path, FILE_ALL_INFO *data,
- bool *adjust_tz, bool *symlink);
+int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse);
extern int smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
const char *full_path, __u64 size,
struct cifs_sb_info *cifs_sb, bool set_alloc);
@@ -95,9 +91,9 @@ extern int smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
const unsigned char *path, char *pbuf,
unsigned int *pbytes_read);
-extern int smb2_open_file(const unsigned int xid,
- struct cifs_open_parms *oparms,
- __u32 *oplock, FILE_ALL_INFO *buf);
+int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, const struct kvec *iov, char **path);
+int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock,
+ void *buf);
extern int smb2_unlock_range(struct cifsFileInfo *cfile,
struct file_lock *flock, const unsigned int xid);
extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile);
@@ -148,7 +144,8 @@ extern int SMB2_ioctl_init(struct cifs_tcon *tcon,
extern void SMB2_ioctl_free(struct smb_rqst *rqst);
extern int SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, bool watch_tree,
- u32 completion_filter);
+ u32 completion_filter, u32 max_out_data_len,
+ char **out_data, u32 *plen /* returned data len */);
extern int __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
@@ -278,9 +275,12 @@ extern int smb2_query_info_compound(const unsigned int xid,
struct kvec *rsp, int *buftype,
struct cifs_sb_info *cifs_sb);
/* query path info from the server using SMB311 POSIX extensions*/
-extern int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *sb, const char *path, struct smb311_posix_qinfo *qinf,
- bool *adjust_tx, bool *symlink);
+int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ struct cifs_open_info_data *data,
+ struct cifs_sid *owner,
+ struct cifs_sid *group,
+ bool *adjust_tz, bool *reparse);
int posix_info_parse(const void *beg, const void *end,
struct smb2_posix_info_parsed *out);
int posix_info_sid_size(const void *beg, const void *end);
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 1a5fc3314dbf..381babc1212c 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -32,19 +32,17 @@ smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
struct cifs_secmech *p = &server->secmech;
int rc;
- rc = cifs_alloc_hash("hmac(sha256)",
- &p->hmacsha256,
- &p->sdeschmacsha256);
+ rc = cifs_alloc_hash("hmac(sha256)", &p->hmacsha256);
if (rc)
goto err;
- rc = cifs_alloc_hash("cmac(aes)", &p->cmacaes, &p->sdesccmacaes);
+ rc = cifs_alloc_hash("cmac(aes)", &p->aes_cmac);
if (rc)
goto err;
return 0;
err:
- cifs_free_hash(&p->hmacsha256, &p->sdeschmacsha256);
+ cifs_free_hash(&p->hmacsha256);
return rc;
}
@@ -54,25 +52,23 @@ smb311_crypto_shash_allocate(struct TCP_Server_Info *server)
struct cifs_secmech *p = &server->secmech;
int rc = 0;
- rc = cifs_alloc_hash("hmac(sha256)",
- &p->hmacsha256,
- &p->sdeschmacsha256);
+ rc = cifs_alloc_hash("hmac(sha256)", &p->hmacsha256);
if (rc)
return rc;
- rc = cifs_alloc_hash("cmac(aes)", &p->cmacaes, &p->sdesccmacaes);
+ rc = cifs_alloc_hash("cmac(aes)", &p->aes_cmac);
if (rc)
goto err;
- rc = cifs_alloc_hash("sha512", &p->sha512, &p->sdescsha512);
+ rc = cifs_alloc_hash("sha512", &p->sha512);
if (rc)
goto err;
return 0;
err:
- cifs_free_hash(&p->cmacaes, &p->sdesccmacaes);
- cifs_free_hash(&p->hmacsha256, &p->sdeschmacsha256);
+ cifs_free_hash(&p->aes_cmac);
+ cifs_free_hash(&p->hmacsha256);
return rc;
}
@@ -81,18 +77,19 @@ static
int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key)
{
struct cifs_chan *chan;
+ struct TCP_Server_Info *pserver;
struct cifs_ses *ses = NULL;
- struct TCP_Server_Info *it = NULL;
int i;
int rc = 0;
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(it, &cifs_tcp_ses_list, tcp_ses_list) {
- list_for_each_entry(ses, &it->smb_ses_list, smb_ses_list) {
- if (ses->Suid == ses_id)
- goto found;
- }
+ /* If server is a channel, select the primary channel */
+ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+
+ list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+ if (ses->Suid == ses_id)
+ goto found;
}
cifs_server_dbg(VFS, "%s: Could not find session 0x%llx\n",
__func__, ses_id);
@@ -140,9 +137,13 @@ out:
static struct cifs_ses *
smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id)
{
+ struct TCP_Server_Info *pserver;
struct cifs_ses *ses;
- list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ /* If server is a channel, select the primary channel */
+ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+
+ list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
if (ses->Suid != ses_id)
continue;
++ses->ses_count;
@@ -219,34 +220,30 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
struct kvec *iov = rqst->rq_iov;
struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base;
struct cifs_ses *ses;
- struct shash_desc *shash;
- struct crypto_shash *hash;
- struct sdesc *sdesc = NULL;
+ struct shash_desc *shash = NULL;
struct smb_rqst drqst;
ses = smb2_find_smb_ses(server, le64_to_cpu(shdr->SessionId));
- if (!ses) {
+ if (unlikely(!ses)) {
cifs_server_dbg(VFS, "%s: Could not find session\n", __func__);
- return 0;
+ return -ENOENT;
}
memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE);
memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE);
if (allocate_crypto) {
- rc = cifs_alloc_hash("hmac(sha256)", &hash, &sdesc);
+ rc = cifs_alloc_hash("hmac(sha256)", &shash);
if (rc) {
cifs_server_dbg(VFS,
"%s: sha256 alloc failed\n", __func__);
goto out;
}
- shash = &sdesc->shash;
} else {
- hash = server->secmech.hmacsha256;
- shash = &server->secmech.sdeschmacsha256->shash;
+ shash = server->secmech.hmacsha256;
}
- rc = crypto_shash_setkey(hash, ses->auth_key.response,
+ rc = crypto_shash_setkey(shash->tfm, ses->auth_key.response,
SMB2_NTLMV2_SESSKEY_SIZE);
if (rc) {
cifs_server_dbg(VFS,
@@ -288,7 +285,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
out:
if (allocate_crypto)
- cifs_free_hash(&hash, &sdesc);
+ cifs_free_hash(&shash);
if (ses)
cifs_put_smb_ses(ses);
return rc;
@@ -315,42 +312,38 @@ static int generate_key(struct cifs_ses *ses, struct kvec label,
goto smb3signkey_ret;
}
- rc = crypto_shash_setkey(server->secmech.hmacsha256,
+ rc = crypto_shash_setkey(server->secmech.hmacsha256->tfm,
ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not set with session key\n", __func__);
goto smb3signkey_ret;
}
- rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash);
+ rc = crypto_shash_init(server->secmech.hmacsha256);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not init sign hmac\n", __func__);
goto smb3signkey_ret;
}
- rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
- i, 4);
+ rc = crypto_shash_update(server->secmech.hmacsha256, i, 4);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not update with n\n", __func__);
goto smb3signkey_ret;
}
- rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
- label.iov_base, label.iov_len);
+ rc = crypto_shash_update(server->secmech.hmacsha256, label.iov_base, label.iov_len);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not update with label\n", __func__);
goto smb3signkey_ret;
}
- rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
- &zero, 1);
+ rc = crypto_shash_update(server->secmech.hmacsha256, &zero, 1);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not update with zero\n", __func__);
goto smb3signkey_ret;
}
- rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
- context.iov_base, context.iov_len);
+ rc = crypto_shash_update(server->secmech.hmacsha256, context.iov_base, context.iov_len);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not update with context\n", __func__);
goto smb3signkey_ret;
@@ -358,19 +351,16 @@ static int generate_key(struct cifs_ses *ses, struct kvec label,
if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) ||
(server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) {
- rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
- L256, 4);
+ rc = crypto_shash_update(server->secmech.hmacsha256, L256, 4);
} else {
- rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
- L128, 4);
+ rc = crypto_shash_update(server->secmech.hmacsha256, L128, 4);
}
if (rc) {
cifs_server_dbg(VFS, "%s: Could not update with L\n", __func__);
goto smb3signkey_ret;
}
- rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash,
- hashptr);
+ rc = crypto_shash_final(server->secmech.hmacsha256, hashptr);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__);
goto smb3signkey_ret;
@@ -550,38 +540,35 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
unsigned char *sigptr = smb3_signature;
struct kvec *iov = rqst->rq_iov;
struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base;
- struct shash_desc *shash;
- struct crypto_shash *hash;
- struct sdesc *sdesc = NULL;
+ struct shash_desc *shash = NULL;
struct smb_rqst drqst;
u8 key[SMB3_SIGN_KEY_SIZE];
rc = smb2_get_sign_key(le64_to_cpu(shdr->SessionId), server, key);
- if (rc)
- return 0;
+ if (unlikely(rc)) {
+ cifs_server_dbg(VFS, "%s: Could not get signing key\n", __func__);
+ return rc;
+ }
if (allocate_crypto) {
- rc = cifs_alloc_hash("cmac(aes)", &hash, &sdesc);
+ rc = cifs_alloc_hash("cmac(aes)", &shash);
if (rc)
return rc;
-
- shash = &sdesc->shash;
} else {
- hash = server->secmech.cmacaes;
- shash = &server->secmech.sdesccmacaes->shash;
+ shash = server->secmech.aes_cmac;
}
memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE);
memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE);
- rc = crypto_shash_setkey(hash, key, SMB2_CMACAES_SIZE);
+ rc = crypto_shash_setkey(shash->tfm, key, SMB2_CMACAES_SIZE);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__);
goto out;
}
/*
- * we already allocate sdesccmacaes when we init smb3 signing key,
+ * we already allocate aes_cmac when we init smb3 signing key,
* so unlike smb2 case we do not have to check here if secmech are
* initialized
*/
@@ -617,7 +604,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
out:
if (allocate_crypto)
- cifs_free_hash(&hash, &sdesc);
+ cifs_free_hash(&shash);
return rc;
}
@@ -902,7 +889,7 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server)
{
struct crypto_aead *tfm;
- if (!server->secmech.ccmaesencrypt) {
+ if (!server->secmech.enc) {
if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) ||
(server->cipher_type == SMB2_ENCRYPTION_AES256_GCM))
tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
@@ -913,23 +900,23 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server)
__func__);
return PTR_ERR(tfm);
}
- server->secmech.ccmaesencrypt = tfm;
+ server->secmech.enc = tfm;
}
- if (!server->secmech.ccmaesdecrypt) {
+ if (!server->secmech.dec) {
if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) ||
(server->cipher_type == SMB2_ENCRYPTION_AES256_GCM))
tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
else
tfm = crypto_alloc_aead("ccm(aes)", 0, 0);
if (IS_ERR(tfm)) {
- crypto_free_aead(server->secmech.ccmaesencrypt);
- server->secmech.ccmaesencrypt = NULL;
+ crypto_free_aead(server->secmech.enc);
+ server->secmech.enc = NULL;
cifs_server_dbg(VFS, "%s: Failed to alloc decrypt aead\n",
__func__);
return PTR_ERR(tfm);
}
- server->secmech.ccmaesdecrypt = tfm;
+ server->secmech.dec = tfm;
}
return 0;
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 5fbbec22bcc8..90789aaa6567 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -90,7 +90,7 @@ int smbd_max_send_size = 1364;
int smbd_max_fragmented_recv_size = 1024 * 1024;
/* The maximum single-message size which can be received */
-int smbd_max_receive_size = 8192;
+int smbd_max_receive_size = 1364;
/* The timeout to initiate send of a keepalive message on idle */
int smbd_keep_alive_interval = 120;
@@ -99,7 +99,7 @@ int smbd_keep_alive_interval = 120;
* User configurable initial values for RDMA transport
* The actual values used may be lower and are limited to hardware capabilities
*/
-/* Default maximum number of SGEs in a RDMA write/read */
+/* Default maximum number of pages in a single RDMA write/read */
int smbd_max_frmr_depth = 2048;
/* If payload is less than this byte, use RDMA send/recv not read/write */
@@ -270,7 +270,7 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
struct smbd_request *request =
container_of(wc->wr_cqe, struct smbd_request, cqe);
- log_rdma_send(INFO, "smbd_request %p completed wc->status=%d\n",
+ log_rdma_send(INFO, "smbd_request 0x%p completed wc->status=%d\n",
request, wc->status);
if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
@@ -448,7 +448,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
struct smbd_connection *info = response->info;
int data_length = 0;
- log_rdma_recv(INFO, "response=%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%x\n",
+ log_rdma_recv(INFO, "response=0x%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%u\n",
response, response->type, wc->status, wc->opcode,
wc->byte_len, wc->pkey_index);
@@ -723,7 +723,7 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info)
send_wr.opcode = IB_WR_SEND;
send_wr.send_flags = IB_SEND_SIGNALED;
- log_rdma_send(INFO, "sge addr=%llx length=%x lkey=%x\n",
+ log_rdma_send(INFO, "sge addr=0x%llx length=%u lkey=0x%x\n",
request->sge[0].addr,
request->sge[0].length, request->sge[0].lkey);
@@ -792,7 +792,7 @@ static int smbd_post_send(struct smbd_connection *info,
for (i = 0; i < request->num_sge; i++) {
log_rdma_send(INFO,
- "rdma_request sge[%d] addr=%llu length=%u\n",
+ "rdma_request sge[%d] addr=0x%llx length=%u\n",
i, request->sge[i].addr, request->sge[i].length);
ib_dma_sync_single_for_device(
info->id->device,
@@ -1017,9 +1017,9 @@ static int smbd_post_send_data(
{
int i;
u32 data_length = 0;
- struct scatterlist sgl[SMBDIRECT_MAX_SGE];
+ struct scatterlist sgl[SMBDIRECT_MAX_SEND_SGE - 1];
- if (n_vec > SMBDIRECT_MAX_SGE) {
+ if (n_vec > SMBDIRECT_MAX_SEND_SGE - 1) {
cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec);
return -EINVAL;
}
@@ -1079,7 +1079,7 @@ static int smbd_negotiate(struct smbd_connection *info)
response->type = SMBD_NEGOTIATE_RESP;
rc = smbd_post_recv(info, response);
- log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=%llx iov.length=%x iov.lkey=%x\n",
+ log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n",
rc, response->sge.addr,
response->sge.length, response->sge.lkey);
if (rc)
@@ -1539,7 +1539,7 @@ static struct smbd_connection *_smbd_get_connection(
if (smbd_send_credit_target > info->id->device->attrs.max_cqe ||
smbd_send_credit_target > info->id->device->attrs.max_qp_wr) {
- log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n",
+ log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
smbd_send_credit_target,
info->id->device->attrs.max_cqe,
info->id->device->attrs.max_qp_wr);
@@ -1548,7 +1548,7 @@ static struct smbd_connection *_smbd_get_connection(
if (smbd_receive_credit_max > info->id->device->attrs.max_cqe ||
smbd_receive_credit_max > info->id->device->attrs.max_qp_wr) {
- log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n",
+ log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
smbd_receive_credit_max,
info->id->device->attrs.max_cqe,
info->id->device->attrs.max_qp_wr);
@@ -1562,17 +1562,15 @@ static struct smbd_connection *_smbd_get_connection(
info->max_receive_size = smbd_max_receive_size;
info->keep_alive_interval = smbd_keep_alive_interval;
- if (info->id->device->attrs.max_send_sge < SMBDIRECT_MAX_SGE) {
+ if (info->id->device->attrs.max_send_sge < SMBDIRECT_MAX_SEND_SGE ||
+ info->id->device->attrs.max_recv_sge < SMBDIRECT_MAX_RECV_SGE) {
log_rdma_event(ERR,
- "warning: device max_send_sge = %d too small\n",
- info->id->device->attrs.max_send_sge);
- log_rdma_event(ERR, "Queue Pair creation may fail\n");
- }
- if (info->id->device->attrs.max_recv_sge < SMBDIRECT_MAX_SGE) {
- log_rdma_event(ERR,
- "warning: device max_recv_sge = %d too small\n",
+ "device %.*s max_send_sge/max_recv_sge = %d/%d too small\n",
+ IB_DEVICE_NAME_MAX,
+ info->id->device->name,
+ info->id->device->attrs.max_send_sge,
info->id->device->attrs.max_recv_sge);
- log_rdma_event(ERR, "Queue Pair creation may fail\n");
+ goto config_failed;
}
info->send_cq = NULL;
@@ -1598,8 +1596,8 @@ static struct smbd_connection *_smbd_get_connection(
qp_attr.qp_context = info;
qp_attr.cap.max_send_wr = info->send_credit_target;
qp_attr.cap.max_recv_wr = info->receive_credit_max;
- qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SGE;
- qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_SGE;
+ qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SEND_SGE;
+ qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_RECV_SGE;
qp_attr.cap.max_inline_data = 0;
qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
qp_attr.qp_type = IB_QPT_RC;
@@ -1986,10 +1984,11 @@ int smbd_send(struct TCP_Server_Info *server,
int num_rqst, struct smb_rqst *rqst_array)
{
struct smbd_connection *info = server->smbd_conn;
- struct kvec vec;
+ struct kvec vecs[SMBDIRECT_MAX_SEND_SGE - 1];
int nvecs;
int size;
unsigned int buflen, remaining_data_length;
+ unsigned int offset, remaining_vec_data_length;
int start, i, j;
int max_iov_size =
info->max_send_size - sizeof(struct smbd_data_transfer);
@@ -1998,10 +1997,8 @@ int smbd_send(struct TCP_Server_Info *server,
struct smb_rqst *rqst;
int rqst_idx;
- if (info->transport_status != SMBD_CONNECTED) {
- rc = -EAGAIN;
- goto done;
- }
+ if (info->transport_status != SMBD_CONNECTED)
+ return -EAGAIN;
/*
* Add in the page array if there is one. The caller needs to set
@@ -2012,125 +2009,95 @@ int smbd_send(struct TCP_Server_Info *server,
for (i = 0; i < num_rqst; i++)
remaining_data_length += smb_rqst_len(server, &rqst_array[i]);
- if (remaining_data_length > info->max_fragmented_send_size) {
+ if (unlikely(remaining_data_length > info->max_fragmented_send_size)) {
+ /* assertion: payload never exceeds negotiated maximum */
log_write(ERR, "payload size %d > max size %d\n",
remaining_data_length, info->max_fragmented_send_size);
- rc = -EINVAL;
- goto done;
+ return -EINVAL;
}
log_write(INFO, "num_rqst=%d total length=%u\n",
num_rqst, remaining_data_length);
rqst_idx = 0;
-next_rqst:
- rqst = &rqst_array[rqst_idx];
- iov = rqst->rq_iov;
-
- cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n",
- rqst_idx, smb_rqst_len(server, rqst));
- for (i = 0; i < rqst->rq_nvec; i++)
- dump_smb(iov[i].iov_base, iov[i].iov_len);
-
-
- log_write(INFO, "rqst_idx=%d nvec=%d rqst->rq_npages=%d rq_pagesz=%d rq_tailsz=%d buflen=%lu\n",
- rqst_idx, rqst->rq_nvec, rqst->rq_npages, rqst->rq_pagesz,
- rqst->rq_tailsz, smb_rqst_len(server, rqst));
-
- start = i = 0;
- buflen = 0;
- while (true) {
- buflen += iov[i].iov_len;
- if (buflen > max_iov_size) {
- if (i > start) {
- remaining_data_length -=
- (buflen-iov[i].iov_len);
- log_write(INFO, "sending iov[] from start=%d i=%d nvecs=%d remaining_data_length=%d\n",
- start, i, i - start,
- remaining_data_length);
- rc = smbd_post_send_data(
- info, &iov[start], i-start,
- remaining_data_length);
- if (rc)
- goto done;
- } else {
- /* iov[start] is too big, break it */
- nvecs = (buflen+max_iov_size-1)/max_iov_size;
- log_write(INFO, "iov[%d] iov_base=%p buflen=%d break to %d vectors\n",
- start, iov[start].iov_base,
- buflen, nvecs);
- for (j = 0; j < nvecs; j++) {
- vec.iov_base =
- (char *)iov[start].iov_base +
- j*max_iov_size;
- vec.iov_len = max_iov_size;
- if (j == nvecs-1)
- vec.iov_len =
- buflen -
- max_iov_size*(nvecs-1);
- remaining_data_length -= vec.iov_len;
- log_write(INFO,
- "sending vec j=%d iov_base=%p iov_len=%zu remaining_data_length=%d\n",
- j, vec.iov_base, vec.iov_len,
- remaining_data_length);
- rc = smbd_post_send_data(
- info, &vec, 1,
- remaining_data_length);
- if (rc)
- goto done;
+ do {
+ rqst = &rqst_array[rqst_idx];
+ iov = rqst->rq_iov;
+
+ cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n",
+ rqst_idx, smb_rqst_len(server, rqst));
+ remaining_vec_data_length = 0;
+ for (i = 0; i < rqst->rq_nvec; i++) {
+ remaining_vec_data_length += iov[i].iov_len;
+ dump_smb(iov[i].iov_base, iov[i].iov_len);
+ }
+
+ log_write(INFO, "rqst_idx=%d nvec=%d rqst->rq_npages=%d rq_pagesz=%d rq_tailsz=%d buflen=%lu\n",
+ rqst_idx, rqst->rq_nvec,
+ rqst->rq_npages, rqst->rq_pagesz,
+ rqst->rq_tailsz, smb_rqst_len(server, rqst));
+
+ start = 0;
+ offset = 0;
+ do {
+ buflen = 0;
+ i = start;
+ j = 0;
+ while (i < rqst->rq_nvec &&
+ j < SMBDIRECT_MAX_SEND_SGE - 1 &&
+ buflen < max_iov_size) {
+
+ vecs[j].iov_base = iov[i].iov_base + offset;
+ if (buflen + iov[i].iov_len > max_iov_size) {
+ vecs[j].iov_len =
+ max_iov_size - iov[i].iov_len;
+ buflen = max_iov_size;
+ offset = vecs[j].iov_len;
+ } else {
+ vecs[j].iov_len =
+ iov[i].iov_len - offset;
+ buflen += vecs[j].iov_len;
+ offset = 0;
+ ++i;
}
- i++;
- if (i == rqst->rq_nvec)
- break;
+ ++j;
}
+
+ remaining_vec_data_length -= buflen;
+ remaining_data_length -= buflen;
+ log_write(INFO, "sending %s iov[%d] from start=%d nvecs=%d remaining_data_length=%d\n",
+ remaining_vec_data_length > 0 ?
+ "partial" : "complete",
+ rqst->rq_nvec, start, j,
+ remaining_data_length);
+
start = i;
- buflen = 0;
- } else {
- i++;
- if (i == rqst->rq_nvec) {
- /* send out all remaining vecs */
- remaining_data_length -= buflen;
- log_write(INFO, "sending iov[] from start=%d i=%d nvecs=%d remaining_data_length=%d\n",
- start, i, i - start,
+ rc = smbd_post_send_data(info, vecs, j, remaining_data_length);
+ if (rc)
+ goto done;
+ } while (remaining_vec_data_length > 0);
+
+ /* now sending pages if there are any */
+ for (i = 0; i < rqst->rq_npages; i++) {
+ rqst_page_get_length(rqst, i, &buflen, &offset);
+ nvecs = (buflen + max_iov_size - 1) / max_iov_size;
+ log_write(INFO, "sending pages buflen=%d nvecs=%d\n",
+ buflen, nvecs);
+ for (j = 0; j < nvecs; j++) {
+ size = min_t(unsigned int, max_iov_size, remaining_data_length);
+ remaining_data_length -= size;
+ log_write(INFO, "sending pages i=%d offset=%d size=%d remaining_data_length=%d\n",
+ i, j * max_iov_size + offset, size,
remaining_data_length);
- rc = smbd_post_send_data(info, &iov[start],
- i-start, remaining_data_length);
+ rc = smbd_post_send_page(
+ info, rqst->rq_pages[i],
+ j*max_iov_size + offset,
+ size, remaining_data_length);
if (rc)
goto done;
- break;
}
}
- log_write(INFO, "looping i=%d buflen=%d\n", i, buflen);
- }
-
- /* now sending pages if there are any */
- for (i = 0; i < rqst->rq_npages; i++) {
- unsigned int offset;
-
- rqst_page_get_length(rqst, i, &buflen, &offset);
- nvecs = (buflen + max_iov_size - 1) / max_iov_size;
- log_write(INFO, "sending pages buflen=%d nvecs=%d\n",
- buflen, nvecs);
- for (j = 0; j < nvecs; j++) {
- size = max_iov_size;
- if (j == nvecs-1)
- size = buflen - j*max_iov_size;
- remaining_data_length -= size;
- log_write(INFO, "sending pages i=%d offset=%d size=%d remaining_data_length=%d\n",
- i, j * max_iov_size + offset, size,
- remaining_data_length);
- rc = smbd_post_send_page(
- info, rqst->rq_pages[i],
- j*max_iov_size + offset,
- size, remaining_data_length);
- if (rc)
- goto done;
- }
- }
-
- rqst_idx++;
- if (rqst_idx < num_rqst)
- goto next_rqst;
+ } while (++rqst_idx < num_rqst);
done:
/*
diff --git a/fs/cifs/smbdirect.h b/fs/cifs/smbdirect.h
index a87fca82a796..207ef979cd51 100644
--- a/fs/cifs/smbdirect.h
+++ b/fs/cifs/smbdirect.h
@@ -91,7 +91,7 @@ struct smbd_connection {
/* Memory registrations */
/* Maximum number of RDMA read/write outstanding on this connection */
int responder_resources;
- /* Maximum number of SGEs in a RDMA write/read */
+ /* Maximum number of pages in a single RDMA write/read on this connection */
int max_frmr_depth;
/*
* If payload is less than or equal to the threshold,
@@ -225,21 +225,25 @@ struct smbd_buffer_descriptor_v1 {
__le32 length;
} __packed;
-/* Default maximum number of SGEs in a RDMA send/recv */
-#define SMBDIRECT_MAX_SGE 16
+/* Maximum number of SGEs used by smbdirect.c in any send work request */
+#define SMBDIRECT_MAX_SEND_SGE 6
+
/* The context for a SMBD request */
struct smbd_request {
struct smbd_connection *info;
struct ib_cqe cqe;
- /* the SGE entries for this packet */
- struct ib_sge sge[SMBDIRECT_MAX_SGE];
+ /* the SGE entries for this work request */
+ struct ib_sge sge[SMBDIRECT_MAX_SEND_SGE];
int num_sge;
/* SMBD packet header follows this structure */
u8 packet[];
};
+/* Maximum number of SGEs used by smbdirect.c in any receive work request */
+#define SMBDIRECT_MAX_RECV_SGE 1
+
/* The context for a SMBD response */
struct smbd_response {
struct smbd_connection *info;
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
index 6b88dc2e364f..110070ba8b04 100644
--- a/fs/cifs/trace.h
+++ b/fs/cifs/trace.h
@@ -372,6 +372,7 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter);
DECLARE_EVENT_CLASS(smb3_inf_compound_done_class,
@@ -409,6 +410,7 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done);
DECLARE_EVENT_CLASS(smb3_inf_compound_err_class,
@@ -451,6 +453,7 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err);
/*
* For logging SMB3 Status code and Command for responses which return errors
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 9a2753e21170..3851d0aaa288 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -347,7 +347,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
.iov_base = &rfc1002_marker,
.iov_len = 4
};
- iov_iter_kvec(&smb_msg.msg_iter, WRITE, &hiov, 1, 4);
+ iov_iter_kvec(&smb_msg.msg_iter, ITER_SOURCE, &hiov, 1, 4);
rc = smb_send_kvec(server, &smb_msg, &sent);
if (rc < 0)
goto unmask;
@@ -368,7 +368,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
size += iov[i].iov_len;
}
- iov_iter_kvec(&smb_msg.msg_iter, WRITE, iov, n_vec, size);
+ iov_iter_kvec(&smb_msg.msg_iter, ITER_SOURCE, iov, n_vec, size);
rc = smb_send_kvec(server, &smb_msg, &sent);
if (rc < 0)
@@ -384,7 +384,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
rqst_page_get_length(&rqst[j], i, &bvec.bv_len,
&bvec.bv_offset);
- iov_iter_bvec(&smb_msg.msg_iter, WRITE,
+ iov_iter_bvec(&smb_msg.msg_iter, ITER_SOURCE,
&bvec, 1, bvec.bv_len);
rc = smb_send_kvec(server, &smb_msg, &sent);
if (rc < 0)
@@ -753,8 +753,9 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
{
int error;
- error = wait_event_freezekillable_unsafe(server->response_q,
- midQ->mid_state != MID_REQUEST_SUBMITTED);
+ error = wait_event_state(server->response_q,
+ midQ->mid_state != MID_REQUEST_SUBMITTED,
+ (TASK_KILLABLE|TASK_FREEZABLE_UNSAFE));
if (error < 0)
return -ERESTARTSYS;
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 998fa51f9b68..5f2fb2fd2e37 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -200,32 +200,6 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
}
break;
}
-
-#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
- case XATTR_ACL_ACCESS:
-#ifdef CONFIG_CIFS_POSIX
- if (!value)
- goto out;
- if (sb->s_flags & SB_POSIXACL)
- rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
- value, (const int)size,
- ACL_TYPE_ACCESS, cifs_sb->local_nls,
- cifs_remap(cifs_sb));
-#endif /* CONFIG_CIFS_POSIX */
- break;
-
- case XATTR_ACL_DEFAULT:
-#ifdef CONFIG_CIFS_POSIX
- if (!value)
- goto out;
- if (sb->s_flags & SB_POSIXACL)
- rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
- value, (const int)size,
- ACL_TYPE_DEFAULT, cifs_sb->local_nls,
- cifs_remap(cifs_sb));
-#endif /* CONFIG_CIFS_POSIX */
- break;
-#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
}
out:
@@ -366,27 +340,6 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
}
break;
}
-#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
- case XATTR_ACL_ACCESS:
-#ifdef CONFIG_CIFS_POSIX
- if (sb->s_flags & SB_POSIXACL)
- rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
- value, size, ACL_TYPE_ACCESS,
- cifs_sb->local_nls,
- cifs_remap(cifs_sb));
-#endif /* CONFIG_CIFS_POSIX */
- break;
-
- case XATTR_ACL_DEFAULT:
-#ifdef CONFIG_CIFS_POSIX
- if (sb->s_flags & SB_POSIXACL)
- rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
- value, size, ACL_TYPE_DEFAULT,
- cifs_sb->local_nls,
- cifs_remap(cifs_sb));
-#endif /* CONFIG_CIFS_POSIX */
- break;
-#endif /* ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
}
/* We could add an additional check for streams ie
@@ -525,21 +478,6 @@ static const struct xattr_handler smb3_ntsd_full_xattr_handler = {
.set = cifs_xattr_set,
};
-
-static const struct xattr_handler cifs_posix_acl_access_xattr_handler = {
- .name = XATTR_NAME_POSIX_ACL_ACCESS,
- .flags = XATTR_ACL_ACCESS,
- .get = cifs_xattr_get,
- .set = cifs_xattr_set,
-};
-
-static const struct xattr_handler cifs_posix_acl_default_xattr_handler = {
- .name = XATTR_NAME_POSIX_ACL_DEFAULT,
- .flags = XATTR_ACL_DEFAULT,
- .get = cifs_xattr_get,
- .set = cifs_xattr_set,
-};
-
const struct xattr_handler *cifs_xattr_handlers[] = {
&cifs_user_xattr_handler,
&cifs_os2_xattr_handler,
@@ -549,7 +487,9 @@ const struct xattr_handler *cifs_xattr_handlers[] = {
&smb3_ntsd_xattr_handler, /* alias for above since avoiding "cifs" */
&cifs_cifs_ntsd_full_xattr_handler,
&smb3_ntsd_full_xattr_handler, /* alias for above since avoiding "cifs" */
- &cifs_posix_acl_access_xattr_handler,
- &cifs_posix_acl_default_xattr_handler,
+#ifdef CONFIG_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
NULL
};
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index d1f9d2632202..ec6519e1ca3b 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -316,6 +316,7 @@ static int configfs_create_dir(struct config_item *item, struct dentry *dentry,
return 0;
out_remove:
+ configfs_put(dentry->d_fsdata);
configfs_remove_dirent(dentry);
return PTR_ERR(inode);
}
@@ -382,6 +383,7 @@ int configfs_create_link(struct configfs_dirent *target, struct dentry *parent,
return 0;
out_remove:
+ configfs_put(dentry->d_fsdata);
configfs_remove_dirent(dentry);
return PTR_ERR(inode);
}
diff --git a/fs/coredump.c b/fs/coredump.c
index 9f4aae202109..de78bde2991b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -68,7 +68,10 @@ struct core_name {
static int expand_corename(struct core_name *cn, int size)
{
- char *corename = krealloc(cn->corename, size, GFP_KERNEL);
+ char *corename;
+
+ size = kmalloc_size_roundup(size);
+ corename = krealloc(cn->corename, size, GFP_KERNEL);
if (!corename)
return -ENOMEM;
@@ -76,7 +79,7 @@ static int expand_corename(struct core_name *cn, int size)
if (size > core_name_size) /* racy but harmless */
core_name_size = size;
- cn->size = ksize(corename);
+ cn->size = size;
cn->corename = corename;
return 0;
}
@@ -325,6 +328,10 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
err = cn_printf(cn, "%lu",
rlimit(RLIMIT_CORE));
break;
+ /* CPU the task ran on */
+ case 'C':
+ err = cn_printf(cn, "%d", cprm->cpu);
+ break;
default:
break;
}
@@ -354,7 +361,7 @@ static int zap_process(struct task_struct *start, int exit_code)
struct task_struct *t;
int nr = 0;
- /* ignore all signals except SIGKILL, see prepare_signal() */
+ /* Allow SIGKILL, see prepare_signal() */
start->signal->flags = SIGNAL_GROUP_EXIT;
start->signal->group_exit_code = exit_code;
start->signal->group_stop_count = 0;
@@ -402,9 +409,8 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
if (core_waiters > 0) {
struct core_thread *ptr;
- freezer_do_not_count();
- wait_for_completion(&core_state->startup);
- freezer_count();
+ wait_for_completion_state(&core_state->startup,
+ TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
/*
* Wait for all the threads to become inactive, so that
* all the thread context (extended register state, like
@@ -412,7 +418,7 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
*/
ptr = core_state->dumper.next;
while (ptr != NULL) {
- wait_task_inactive(ptr->task, 0);
+ wait_task_inactive(ptr->task, TASK_ANY);
ptr = ptr->next;
}
}
@@ -526,7 +532,6 @@ void do_coredump(const kernel_siginfo_t *siginfo)
static atomic_t core_dump_count = ATOMIC_INIT(0);
struct coredump_params cprm = {
.siginfo = siginfo,
- .regs = signal_pt_regs(),
.limit = rlimit(RLIMIT_CORE),
/*
* We must use the same mm->flags while dumping core to avoid
@@ -535,6 +540,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
*/
.mm_flags = mm->flags,
.vma_meta = NULL,
+ .cpu = raw_smp_processor_id(),
};
audit_core_dumps(siginfo->si_signo);
@@ -717,8 +723,8 @@ void do_coredump(const kernel_siginfo_t *siginfo)
* filesystem.
*/
mnt_userns = file_mnt_user_ns(cprm.file);
- if (!uid_eq(i_uid_into_mnt(mnt_userns, inode),
- current_fsuid())) {
+ if (!vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode),
+ current_fsuid())) {
pr_info_ratelimited("Core dump to %s aborted: cannot preserve file owner\n",
cn.corename);
goto close_fail;
@@ -832,6 +838,39 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr)
}
}
+static int dump_emit_page(struct coredump_params *cprm, struct page *page)
+{
+ struct bio_vec bvec = {
+ .bv_page = page,
+ .bv_offset = 0,
+ .bv_len = PAGE_SIZE,
+ };
+ struct iov_iter iter;
+ struct file *file = cprm->file;
+ loff_t pos;
+ ssize_t n;
+
+ if (cprm->to_skip) {
+ if (!__dump_skip(cprm, cprm->to_skip))
+ return 0;
+ cprm->to_skip = 0;
+ }
+ if (cprm->written + PAGE_SIZE > cprm->limit)
+ return 0;
+ if (dump_interrupted())
+ return 0;
+ pos = file->f_pos;
+ iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE);
+ n = __kernel_write_iter(cprm->file, &iter, &pos);
+ if (n != PAGE_SIZE)
+ return 0;
+ file->f_pos = pos;
+ cprm->written += PAGE_SIZE;
+ cprm->pos += PAGE_SIZE;
+
+ return 1;
+}
+
int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
{
if (cprm->to_skip) {
@@ -863,7 +902,6 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
for (addr = start; addr < start + len; addr += PAGE_SIZE) {
struct page *page;
- int stop;
/*
* To avoid having to allocate page tables for virtual address
@@ -874,10 +912,7 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
*/
page = get_dump_page(addr);
if (page) {
- void *kaddr = kmap_local_page(page);
-
- stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
- kunmap_local(kaddr);
+ int stop = !dump_emit_page(cprm, page);
put_page(page);
if (stop)
return 0;
@@ -1072,30 +1107,20 @@ whole:
return vma->vm_end - vma->vm_start;
}
-static struct vm_area_struct *first_vma(struct task_struct *tsk,
- struct vm_area_struct *gate_vma)
-{
- struct vm_area_struct *ret = tsk->mm->mmap;
-
- if (ret)
- return ret;
- return gate_vma;
-}
-
/*
* Helper function for iterating across a vma list. It ensures that the caller
* will visit `gate_vma' prior to terminating the search.
*/
-static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
+static struct vm_area_struct *coredump_next_vma(struct ma_state *mas,
+ struct vm_area_struct *vma,
struct vm_area_struct *gate_vma)
{
- struct vm_area_struct *ret;
-
- ret = this_vma->vm_next;
- if (ret)
- return ret;
- if (this_vma == gate_vma)
+ if (gate_vma && (vma == gate_vma))
return NULL;
+
+ vma = mas_next(mas, ULONG_MAX);
+ if (vma)
+ return vma;
return gate_vma;
}
@@ -1119,9 +1144,10 @@ static void free_vma_snapshot(struct coredump_params *cprm)
*/
static bool dump_vma_snapshot(struct coredump_params *cprm)
{
- struct vm_area_struct *vma, *gate_vma;
+ struct vm_area_struct *gate_vma, *vma = NULL;
struct mm_struct *mm = current->mm;
- int i;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
+ int i = 0;
/*
* Once the stack expansion code is fixed to not change VMA bounds
@@ -1141,8 +1167,7 @@ static bool dump_vma_snapshot(struct coredump_params *cprm)
return false;
}
- for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
- vma = next_vma(vma, gate_vma), i++) {
+ while ((vma = coredump_next_vma(&mas, vma, gate_vma)) != NULL) {
struct core_vma_metadata *m = cprm->vma_meta + i;
m->start = vma->vm_start;
@@ -1150,10 +1175,10 @@ static bool dump_vma_snapshot(struct coredump_params *cprm)
m->flags = vma->vm_flags;
m->dump_size = vma_dump_size(vma, cprm->mm_flags);
m->pgoff = vma->vm_pgoff;
-
m->file = vma->vm_file;
if (m->file)
get_file(m->file);
+ i++;
}
mmap_write_unlock(mm);
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 2217fe5ece6f..1b4403136d05 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -25,21 +25,25 @@
* then this function isn't applicable. This function may sleep, so it must be
* called from a workqueue rather than from the bio's bi_end_io callback.
*
- * This function sets PG_error on any pages that contain any blocks that failed
- * to be decrypted. The filesystem must not mark such pages uptodate.
+ * Return: %true on success; %false on failure. On failure, bio->bi_status is
+ * also set to an error status.
*/
-void fscrypt_decrypt_bio(struct bio *bio)
+bool fscrypt_decrypt_bio(struct bio *bio)
{
struct bio_vec *bv;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bv, bio, iter_all) {
struct page *page = bv->bv_page;
- int ret = fscrypt_decrypt_pagecache_blocks(page, bv->bv_len,
+ int err = fscrypt_decrypt_pagecache_blocks(page, bv->bv_len,
bv->bv_offset);
- if (ret)
- SetPageError(page);
+
+ if (err) {
+ bio->bi_status = errno_to_blk_status(err);
+ return false;
+ }
}
+ return true;
}
EXPORT_SYMBOL(fscrypt_decrypt_bio);
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 3afdaa084773..316a778cec0f 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -184,7 +184,7 @@ struct fscrypt_symlink_data {
struct fscrypt_prepared_key {
struct crypto_skcipher *tfm;
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
- struct fscrypt_blk_crypto_key *blk_key;
+ struct blk_crypto_key *blk_key;
#endif
};
@@ -225,7 +225,7 @@ struct fscrypt_info {
* will be NULL if the master key was found in a process-subscribed
* keyring rather than in the filesystem-level keyring.
*/
- struct key *ci_master_key;
+ struct fscrypt_master_key *ci_master_key;
/*
* Link in list of inodes that were unlocked with the master key.
@@ -344,7 +344,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
const u8 *raw_key,
const struct fscrypt_info *ci);
-void fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key);
+void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
+ struct fscrypt_prepared_key *prep_key);
/*
* Check whether the crypto transform or blk-crypto key has been allocated in
@@ -390,7 +391,8 @@ fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
}
static inline void
-fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key)
+fscrypt_destroy_inline_crypt_key(struct super_block *sb,
+ struct fscrypt_prepared_key *prep_key)
{
}
@@ -437,6 +439,34 @@ struct fscrypt_master_key_secret {
struct fscrypt_master_key {
/*
+ * Link in ->s_master_keys->key_hashtable.
+ * Only valid if ->mk_active_refs > 0.
+ */
+ struct hlist_node mk_node;
+
+ /* Semaphore that protects ->mk_secret and ->mk_users */
+ struct rw_semaphore mk_sem;
+
+ /*
+ * Active and structural reference counts. An active ref guarantees
+ * that the struct continues to exist, continues to be in the keyring
+ * ->s_master_keys, and that any embedded subkeys (e.g.
+ * ->mk_direct_keys) that have been prepared continue to exist.
+ * A structural ref only guarantees that the struct continues to exist.
+ *
+ * There is one active ref associated with ->mk_secret being present,
+ * and one active ref for each inode in ->mk_decrypted_inodes.
+ *
+ * There is one structural ref associated with the active refcount being
+ * nonzero. Finding a key in the keyring also takes a structural ref,
+ * which is then held temporarily while the key is operated on.
+ */
+ refcount_t mk_active_refs;
+ refcount_t mk_struct_refs;
+
+ struct rcu_head mk_rcu_head;
+
+ /*
* The secret key material. After FS_IOC_REMOVE_ENCRYPTION_KEY is
* executed, this is wiped and no new inodes can be unlocked with this
* key; however, there may still be inodes in ->mk_decrypted_inodes
@@ -444,7 +474,10 @@ struct fscrypt_master_key {
* FS_IOC_REMOVE_ENCRYPTION_KEY can be retried, or
* FS_IOC_ADD_ENCRYPTION_KEY can add the secret again.
*
- * Locking: protected by this master key's key->sem.
+ * While ->mk_secret is present, one ref in ->mk_active_refs is held.
+ *
+ * Locking: protected by ->mk_sem. The manipulation of ->mk_active_refs
+ * associated with this field is protected by ->mk_sem as well.
*/
struct fscrypt_master_key_secret mk_secret;
@@ -465,23 +498,13 @@ struct fscrypt_master_key {
*
* This is NULL for v1 policy keys; those can only be added by root.
*
- * Locking: in addition to this keyring's own semaphore, this is
- * protected by this master key's key->sem, so we can do atomic
- * search+insert. It can also be searched without taking any locks, but
- * in that case the returned key may have already been removed.
+ * Locking: protected by ->mk_sem. (We don't just rely on the keyrings
+ * subsystem semaphore ->mk_users->sem, as we need support for atomic
+ * search+insert along with proper synchronization with ->mk_secret.)
*/
struct key *mk_users;
/*
- * Length of ->mk_decrypted_inodes, plus one if mk_secret is present.
- * Once this goes to 0, the master key is removed from ->s_master_keys.
- * The 'struct fscrypt_master_key' will continue to live as long as the
- * 'struct key' whose payload it is, but we won't let this reference
- * count rise again.
- */
- refcount_t mk_refcount;
-
- /*
* List of inodes that were unlocked using this key. This allows the
* inodes to be evicted efficiently if the key is removed.
*/
@@ -506,10 +529,10 @@ static inline bool
is_master_key_secret_present(const struct fscrypt_master_key_secret *secret)
{
/*
- * The READ_ONCE() is only necessary for fscrypt_drop_inode() and
- * fscrypt_key_describe(). These run in atomic context, so they can't
- * take the key semaphore and thus 'secret' can change concurrently
- * which would be a data race. But they only need to know whether the
+ * The READ_ONCE() is only necessary for fscrypt_drop_inode().
+ * fscrypt_drop_inode() runs in atomic context, so it can't take the key
+ * semaphore and thus 'secret' can change concurrently which would be a
+ * data race. But fscrypt_drop_inode() only need to know whether the
* secret *was* present at the time of check, so READ_ONCE() suffices.
*/
return READ_ONCE(secret->size) != 0;
@@ -538,7 +561,12 @@ static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec)
return 0;
}
-struct key *
+void fscrypt_put_master_key(struct fscrypt_master_key *mk);
+
+void fscrypt_put_master_key_activeref(struct super_block *sb,
+ struct fscrypt_master_key *mk);
+
+struct fscrypt_master_key *
fscrypt_find_master_key(struct super_block *sb,
const struct fscrypt_key_specifier *mk_spec);
@@ -569,7 +597,8 @@ extern struct fscrypt_mode fscrypt_modes[];
int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
const u8 *raw_key, const struct fscrypt_info *ci);
-void fscrypt_destroy_prepared_key(struct fscrypt_prepared_key *prep_key);
+void fscrypt_destroy_prepared_key(struct super_block *sb,
+ struct fscrypt_prepared_key *prep_key);
int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key);
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 7c01025879b3..7b8c5a1104b5 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -5,8 +5,6 @@
* Encryption hooks for higher-level filesystem operations.
*/
-#include <linux/key.h>
-
#include "fscrypt_private.h"
/**
@@ -142,7 +140,6 @@ int fscrypt_prepare_setflags(struct inode *inode,
unsigned int oldflags, unsigned int flags)
{
struct fscrypt_info *ci;
- struct key *key;
struct fscrypt_master_key *mk;
int err;
@@ -158,14 +155,13 @@ int fscrypt_prepare_setflags(struct inode *inode,
ci = inode->i_crypt_info;
if (ci->ci_policy.version != FSCRYPT_POLICY_V2)
return -EINVAL;
- key = ci->ci_master_key;
- mk = key->payload.data[0];
- down_read(&key->sem);
+ mk = ci->ci_master_key;
+ down_read(&mk->mk_sem);
if (is_master_key_secret_present(&mk->mk_secret))
err = fscrypt_derive_dirhash_key(ci, mk);
else
err = -ENOKEY;
- up_read(&key->sem);
+ up_read(&mk->mk_sem);
return err;
}
return 0;
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index 90f3e68f166e..8bfb3ce86476 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -12,7 +12,7 @@
* provides the key and IV to use.
*/
-#include <linux/blk-crypto-profile.h>
+#include <linux/blk-crypto.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/sched/mm.h>
@@ -21,26 +21,22 @@
#include "fscrypt_private.h"
-struct fscrypt_blk_crypto_key {
- struct blk_crypto_key base;
- int num_devs;
- struct request_queue *devs[];
-};
-
-static int fscrypt_get_num_devices(struct super_block *sb)
+static struct block_device **fscrypt_get_devices(struct super_block *sb,
+ unsigned int *num_devs)
{
- if (sb->s_cop->get_num_devices)
- return sb->s_cop->get_num_devices(sb);
- return 1;
-}
+ struct block_device **devs;
-static void fscrypt_get_devices(struct super_block *sb, int num_devs,
- struct request_queue **devs)
-{
- if (num_devs == 1)
- devs[0] = bdev_get_queue(sb->s_bdev);
- else
- sb->s_cop->get_devices(sb, devs);
+ if (sb->s_cop->get_devices) {
+ devs = sb->s_cop->get_devices(sb, num_devs);
+ if (devs)
+ return devs;
+ }
+ devs = kmalloc(sizeof(*devs), GFP_KERNEL);
+ if (!devs)
+ return ERR_PTR(-ENOMEM);
+ devs[0] = sb->s_bdev;
+ *num_devs = 1;
+ return devs;
}
static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci)
@@ -74,15 +70,15 @@ static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci)
* helpful for debugging problems where the "wrong" implementation is used.
*/
static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode,
- struct request_queue **devs,
- int num_devs,
+ struct block_device **devs,
+ unsigned int num_devs,
const struct blk_crypto_config *cfg)
{
- int i;
+ unsigned int i;
for (i = 0; i < num_devs; i++) {
if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) ||
- __blk_crypto_cfg_supported(devs[i]->crypto_profile, cfg)) {
+ blk_crypto_config_supported_natively(devs[i], cfg)) {
if (!xchg(&mode->logged_blk_crypto_native, 1))
pr_info("fscrypt: %s using blk-crypto (native)\n",
mode->friendly_name);
@@ -99,9 +95,9 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
const struct inode *inode = ci->ci_inode;
struct super_block *sb = inode->i_sb;
struct blk_crypto_config crypto_cfg;
- int num_devs;
- struct request_queue **devs;
- int i;
+ struct block_device **devs;
+ unsigned int num_devs;
+ unsigned int i;
/* The file must need contents encryption, not filenames encryption */
if (!S_ISREG(inode->i_mode))
@@ -129,17 +125,16 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
return 0;
/*
- * On all the filesystem's devices, blk-crypto must support the crypto
- * configuration that the file would use.
+ * On all the filesystem's block devices, blk-crypto must support the
+ * crypto configuration that the file would use.
*/
crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode;
crypto_cfg.data_unit_size = sb->s_blocksize;
crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci);
- num_devs = fscrypt_get_num_devices(sb);
- devs = kmalloc_array(num_devs, sizeof(*devs), GFP_KERNEL);
- if (!devs)
- return -ENOMEM;
- fscrypt_get_devices(sb, num_devs, devs);
+
+ devs = fscrypt_get_devices(sb, &num_devs);
+ if (IS_ERR(devs))
+ return PTR_ERR(devs);
for (i = 0; i < num_devs; i++) {
if (!blk_crypto_config_supported(devs[i], &crypto_cfg))
@@ -162,49 +157,40 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
const struct inode *inode = ci->ci_inode;
struct super_block *sb = inode->i_sb;
enum blk_crypto_mode_num crypto_mode = ci->ci_mode->blk_crypto_mode;
- int num_devs = fscrypt_get_num_devices(sb);
- int queue_refs = 0;
- struct fscrypt_blk_crypto_key *blk_key;
+ struct blk_crypto_key *blk_key;
+ struct block_device **devs;
+ unsigned int num_devs;
+ unsigned int i;
int err;
- int i;
- blk_key = kzalloc(struct_size(blk_key, devs, num_devs), GFP_KERNEL);
+ blk_key = kmalloc(sizeof(*blk_key), GFP_KERNEL);
if (!blk_key)
return -ENOMEM;
- blk_key->num_devs = num_devs;
- fscrypt_get_devices(sb, num_devs, blk_key->devs);
-
- err = blk_crypto_init_key(&blk_key->base, raw_key, crypto_mode,
+ err = blk_crypto_init_key(blk_key, raw_key, crypto_mode,
fscrypt_get_dun_bytes(ci), sb->s_blocksize);
if (err) {
fscrypt_err(inode, "error %d initializing blk-crypto key", err);
goto fail;
}
- /*
- * We have to start using blk-crypto on all the filesystem's devices.
- * We also have to save all the request_queue's for later so that the
- * key can be evicted from them. This is needed because some keys
- * aren't destroyed until after the filesystem was already unmounted
- * (namely, the per-mode keys in struct fscrypt_master_key).
- */
+ /* Start using blk-crypto on all the filesystem's block devices. */
+ devs = fscrypt_get_devices(sb, &num_devs);
+ if (IS_ERR(devs)) {
+ err = PTR_ERR(devs);
+ goto fail;
+ }
for (i = 0; i < num_devs; i++) {
- if (!blk_get_queue(blk_key->devs[i])) {
- fscrypt_err(inode, "couldn't get request_queue");
- err = -EAGAIN;
- goto fail;
- }
- queue_refs++;
-
- err = blk_crypto_start_using_key(&blk_key->base,
- blk_key->devs[i]);
- if (err) {
- fscrypt_err(inode,
- "error %d starting to use blk-crypto", err);
- goto fail;
- }
+ err = blk_crypto_start_using_key(devs[i], blk_key);
+ if (err)
+ break;
}
+ kfree(devs);
+ if (err) {
+ fscrypt_err(inode, "error %d starting to use blk-crypto", err);
+ goto fail;
+ }
+
/*
* Pairs with the smp_load_acquire() in fscrypt_is_key_prepared().
* I.e., here we publish ->blk_key with a RELEASE barrier so that
@@ -215,24 +201,29 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
return 0;
fail:
- for (i = 0; i < queue_refs; i++)
- blk_put_queue(blk_key->devs[i]);
kfree_sensitive(blk_key);
return err;
}
-void fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key)
+void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
+ struct fscrypt_prepared_key *prep_key)
{
- struct fscrypt_blk_crypto_key *blk_key = prep_key->blk_key;
- int i;
+ struct blk_crypto_key *blk_key = prep_key->blk_key;
+ struct block_device **devs;
+ unsigned int num_devs;
+ unsigned int i;
- if (blk_key) {
- for (i = 0; i < blk_key->num_devs; i++) {
- blk_crypto_evict_key(blk_key->devs[i], &blk_key->base);
- blk_put_queue(blk_key->devs[i]);
- }
- kfree_sensitive(blk_key);
+ if (!blk_key)
+ return;
+
+ /* Evict the key from all the filesystem's block devices. */
+ devs = fscrypt_get_devices(sb, &num_devs);
+ if (!IS_ERR(devs)) {
+ for (i = 0; i < num_devs; i++)
+ blk_crypto_evict_key(devs[i], blk_key);
+ kfree(devs);
}
+ kfree_sensitive(blk_key);
}
bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode)
@@ -282,7 +273,7 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode,
ci = inode->i_crypt_info;
fscrypt_generate_dun(ci, first_lblk, dun);
- bio_crypt_set_ctx(bio, &ci->ci_enc_key.blk_key->base, dun, gfp_mask);
+ bio_crypt_set_ctx(bio, ci->ci_enc_key.blk_key, dun, gfp_mask);
}
EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx);
@@ -369,7 +360,7 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
* uses the same pointer. I.e., there's currently no need to support
* merging requests where the keys are the same but the pointers differ.
*/
- if (bc->bc_key != &inode->i_crypt_info->ci_enc_key.blk_key->base)
+ if (bc->bc_key != inode->i_crypt_info->ci_enc_key.blk_key)
return false;
fscrypt_generate_dun(inode->i_crypt_info, next_lblk, next_dun);
@@ -401,46 +392,45 @@ bool fscrypt_mergeable_bio_bh(struct bio *bio,
EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio_bh);
/**
- * fscrypt_dio_supported() - check whether a DIO (direct I/O) request is
- * supported as far as encryption is concerned
- * @iocb: the file and position the I/O is targeting
- * @iter: the I/O data segment(s)
+ * fscrypt_dio_supported() - check whether DIO (direct I/O) is supported on an
+ * inode, as far as encryption is concerned
+ * @inode: the inode in question
*
* Return: %true if there are no encryption constraints that prevent DIO from
* being supported; %false if DIO is unsupported. (Note that in the
* %true case, the filesystem might have other, non-encryption-related
- * constraints that prevent DIO from actually being supported.)
+ * constraints that prevent DIO from actually being supported. Also, on
+ * encrypted files the filesystem is still responsible for only allowing
+ * DIO when requests are filesystem-block-aligned.)
*/
-bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
+bool fscrypt_dio_supported(struct inode *inode)
{
- const struct inode *inode = file_inode(iocb->ki_filp);
- const unsigned int blocksize = i_blocksize(inode);
+ int err;
/* If the file is unencrypted, no veto from us. */
if (!fscrypt_needs_contents_encryption(inode))
return true;
- /* We only support DIO with inline crypto, not fs-layer crypto. */
- if (!fscrypt_inode_uses_inline_crypto(inode))
- return false;
-
/*
- * Since the granularity of encryption is filesystem blocks, the file
- * position and total I/O length must be aligned to the filesystem block
- * size -- not just to the block device's logical block size as is
- * traditionally the case for DIO on many filesystems.
+ * We only support DIO with inline crypto, not fs-layer crypto.
*
- * We require that the user-provided memory buffers be filesystem block
- * aligned too. It is simpler to have a single alignment value required
- * for all properties of the I/O, as is normally the case for DIO.
- * Also, allowing less aligned buffers would imply that data units could
- * cross bvecs, which would greatly complicate the I/O stack, which
- * assumes that bios can be split at any bvec boundary.
+ * To determine whether the inode is using inline crypto, we have to set
+ * up the key if it wasn't already done. This is because in the current
+ * design of fscrypt, the decision of whether to use inline crypto or
+ * not isn't made until the inode's encryption key is being set up. In
+ * the DIO read/write case, the key will always be set up already, since
+ * the file will be open. But in the case of statx(), the key might not
+ * be set up yet, as the file might not have been opened yet.
*/
- if (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), blocksize))
+ err = fscrypt_require_key(inode);
+ if (err) {
+ /*
+ * Key unavailable or couldn't be set up. This edge case isn't
+ * worth worrying about; just report that DIO is unsupported.
+ */
return false;
-
- return true;
+ }
+ return fscrypt_inode_uses_inline_crypto(inode);
}
EXPORT_SYMBOL_GPL(fscrypt_dio_supported);
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index caee9f8620dd..78dd2ff306bd 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -18,6 +18,7 @@
* information about these ioctls.
*/
+#include <asm/unaligned.h>
#include <crypto/skcipher.h>
#include <linux/key-type.h>
#include <linux/random.h>
@@ -25,6 +26,18 @@
#include "fscrypt_private.h"
+/* The master encryption keys for a filesystem (->s_master_keys) */
+struct fscrypt_keyring {
+ /*
+ * Lock that protects ->key_hashtable. It does *not* protect the
+ * fscrypt_master_key structs themselves.
+ */
+ spinlock_t lock;
+
+ /* Hash table that maps fscrypt_key_specifier to fscrypt_master_key */
+ struct hlist_head key_hashtable[128];
+};
+
static void wipe_master_key_secret(struct fscrypt_master_key_secret *secret)
{
fscrypt_destroy_hkdf(&secret->hkdf);
@@ -38,66 +51,80 @@ static void move_master_key_secret(struct fscrypt_master_key_secret *dst,
memzero_explicit(src, sizeof(*src));
}
-static void free_master_key(struct fscrypt_master_key *mk)
+static void fscrypt_free_master_key(struct rcu_head *head)
{
- size_t i;
-
- wipe_master_key_secret(&mk->mk_secret);
-
- for (i = 0; i <= FSCRYPT_MODE_MAX; i++) {
- fscrypt_destroy_prepared_key(&mk->mk_direct_keys[i]);
- fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_64_keys[i]);
- fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_32_keys[i]);
- }
-
- key_put(mk->mk_users);
+ struct fscrypt_master_key *mk =
+ container_of(head, struct fscrypt_master_key, mk_rcu_head);
+ /*
+ * The master key secret and any embedded subkeys should have already
+ * been wiped when the last active reference to the fscrypt_master_key
+ * struct was dropped; doing it here would be unnecessarily late.
+ * Nevertheless, use kfree_sensitive() in case anything was missed.
+ */
kfree_sensitive(mk);
}
-static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec)
+void fscrypt_put_master_key(struct fscrypt_master_key *mk)
{
- if (spec->__reserved)
- return false;
- return master_key_spec_len(spec) != 0;
+ if (!refcount_dec_and_test(&mk->mk_struct_refs))
+ return;
+ /*
+ * No structural references left, so free ->mk_users, and also free the
+ * fscrypt_master_key struct itself after an RCU grace period ensures
+ * that concurrent keyring lookups can no longer find it.
+ */
+ WARN_ON(refcount_read(&mk->mk_active_refs) != 0);
+ key_put(mk->mk_users);
+ mk->mk_users = NULL;
+ call_rcu(&mk->mk_rcu_head, fscrypt_free_master_key);
}
-static int fscrypt_key_instantiate(struct key *key,
- struct key_preparsed_payload *prep)
+void fscrypt_put_master_key_activeref(struct super_block *sb,
+ struct fscrypt_master_key *mk)
{
- key->payload.data[0] = (struct fscrypt_master_key *)prep->data;
- return 0;
-}
+ size_t i;
-static void fscrypt_key_destroy(struct key *key)
-{
- free_master_key(key->payload.data[0]);
-}
+ if (!refcount_dec_and_test(&mk->mk_active_refs))
+ return;
+ /*
+ * No active references left, so complete the full removal of this
+ * fscrypt_master_key struct by removing it from the keyring and
+ * destroying any subkeys embedded in it.
+ */
-static void fscrypt_key_describe(const struct key *key, struct seq_file *m)
-{
- seq_puts(m, key->description);
+ spin_lock(&sb->s_master_keys->lock);
+ hlist_del_rcu(&mk->mk_node);
+ spin_unlock(&sb->s_master_keys->lock);
- if (key_is_positive(key)) {
- const struct fscrypt_master_key *mk = key->payload.data[0];
+ /*
+ * ->mk_active_refs == 0 implies that ->mk_secret is not present and
+ * that ->mk_decrypted_inodes is empty.
+ */
+ WARN_ON(is_master_key_secret_present(&mk->mk_secret));
+ WARN_ON(!list_empty(&mk->mk_decrypted_inodes));
- if (!is_master_key_secret_present(&mk->mk_secret))
- seq_puts(m, ": secret removed");
+ for (i = 0; i <= FSCRYPT_MODE_MAX; i++) {
+ fscrypt_destroy_prepared_key(
+ sb, &mk->mk_direct_keys[i]);
+ fscrypt_destroy_prepared_key(
+ sb, &mk->mk_iv_ino_lblk_64_keys[i]);
+ fscrypt_destroy_prepared_key(
+ sb, &mk->mk_iv_ino_lblk_32_keys[i]);
}
+ memzero_explicit(&mk->mk_ino_hash_key,
+ sizeof(mk->mk_ino_hash_key));
+ mk->mk_ino_hash_key_initialized = false;
+
+ /* Drop the structural ref associated with the active refs. */
+ fscrypt_put_master_key(mk);
}
-/*
- * Type of key in ->s_master_keys. Each key of this type represents a master
- * key which has been added to the filesystem. Its payload is a
- * 'struct fscrypt_master_key'. The "." prefix in the key type name prevents
- * users from adding keys of this type via the keyrings syscalls rather than via
- * the intended method of FS_IOC_ADD_ENCRYPTION_KEY.
- */
-static struct key_type key_type_fscrypt = {
- .name = "._fscrypt",
- .instantiate = fscrypt_key_instantiate,
- .destroy = fscrypt_key_destroy,
- .describe = fscrypt_key_describe,
-};
+static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec)
+{
+ if (spec->__reserved)
+ return false;
+ return master_key_spec_len(spec) != 0;
+}
static int fscrypt_user_key_instantiate(struct key *key,
struct key_preparsed_payload *prep)
@@ -131,32 +158,6 @@ static struct key_type key_type_fscrypt_user = {
.describe = fscrypt_user_key_describe,
};
-/* Search ->s_master_keys or ->mk_users */
-static struct key *search_fscrypt_keyring(struct key *keyring,
- struct key_type *type,
- const char *description)
-{
- /*
- * We need to mark the keyring reference as "possessed" so that we
- * acquire permission to search it, via the KEY_POS_SEARCH permission.
- */
- key_ref_t keyref = make_key_ref(keyring, true /* possessed */);
-
- keyref = keyring_search(keyref, type, description, false);
- if (IS_ERR(keyref)) {
- if (PTR_ERR(keyref) == -EAGAIN || /* not found */
- PTR_ERR(keyref) == -EKEYREVOKED) /* recently invalidated */
- keyref = ERR_PTR(-ENOKEY);
- return ERR_CAST(keyref);
- }
- return key_ref_to_ptr(keyref);
-}
-
-#define FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE \
- (CONST_STRLEN("fscrypt-") + sizeof_field(struct super_block, s_id))
-
-#define FSCRYPT_MK_DESCRIPTION_SIZE (2 * FSCRYPT_KEY_IDENTIFIER_SIZE + 1)
-
#define FSCRYPT_MK_USERS_DESCRIPTION_SIZE \
(CONST_STRLEN("fscrypt-") + 2 * FSCRYPT_KEY_IDENTIFIER_SIZE + \
CONST_STRLEN("-users") + 1)
@@ -164,21 +165,6 @@ static struct key *search_fscrypt_keyring(struct key *keyring,
#define FSCRYPT_MK_USER_DESCRIPTION_SIZE \
(2 * FSCRYPT_KEY_IDENTIFIER_SIZE + CONST_STRLEN(".uid.") + 10 + 1)
-static void format_fs_keyring_description(
- char description[FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE],
- const struct super_block *sb)
-{
- sprintf(description, "fscrypt-%s", sb->s_id);
-}
-
-static void format_mk_description(
- char description[FSCRYPT_MK_DESCRIPTION_SIZE],
- const struct fscrypt_key_specifier *mk_spec)
-{
- sprintf(description, "%*phN",
- master_key_spec_len(mk_spec), (u8 *)&mk_spec->u);
-}
-
static void format_mk_users_keyring_description(
char description[FSCRYPT_MK_USERS_DESCRIPTION_SIZE],
const u8 mk_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE])
@@ -199,20 +185,15 @@ static void format_mk_user_description(
/* Create ->s_master_keys if needed. Synchronized by fscrypt_add_key_mutex. */
static int allocate_filesystem_keyring(struct super_block *sb)
{
- char description[FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE];
- struct key *keyring;
+ struct fscrypt_keyring *keyring;
if (sb->s_master_keys)
return 0;
- format_fs_keyring_description(description, sb);
- keyring = keyring_alloc(description, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
- current_cred(), KEY_POS_SEARCH |
- KEY_USR_SEARCH | KEY_USR_READ | KEY_USR_VIEW,
- KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
- if (IS_ERR(keyring))
- return PTR_ERR(keyring);
-
+ keyring = kzalloc(sizeof(*keyring), GFP_KERNEL);
+ if (!keyring)
+ return -ENOMEM;
+ spin_lock_init(&keyring->lock);
/*
* Pairs with the smp_load_acquire() in fscrypt_find_master_key().
* I.e., here we publish ->s_master_keys with a RELEASE barrier so that
@@ -222,21 +203,80 @@ static int allocate_filesystem_keyring(struct super_block *sb)
return 0;
}
-void fscrypt_sb_free(struct super_block *sb)
+/*
+ * Release all encryption keys that have been added to the filesystem, along
+ * with the keyring that contains them.
+ *
+ * This is called at unmount time. The filesystem's underlying block device(s)
+ * are still available at this time; this is important because after user file
+ * accesses have been allowed, this function may need to evict keys from the
+ * keyslots of an inline crypto engine, which requires the block device(s).
+ *
+ * This is also called when the super_block is being freed. This is needed to
+ * avoid a memory leak if mounting fails after the "test_dummy_encryption"
+ * option was processed, as in that case the unmount-time call isn't made.
+ */
+void fscrypt_destroy_keyring(struct super_block *sb)
{
- key_put(sb->s_master_keys);
+ struct fscrypt_keyring *keyring = sb->s_master_keys;
+ size_t i;
+
+ if (!keyring)
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(keyring->key_hashtable); i++) {
+ struct hlist_head *bucket = &keyring->key_hashtable[i];
+ struct fscrypt_master_key *mk;
+ struct hlist_node *tmp;
+
+ hlist_for_each_entry_safe(mk, tmp, bucket, mk_node) {
+ /*
+ * Since all inodes were already evicted, every key
+ * remaining in the keyring should have an empty inode
+ * list, and should only still be in the keyring due to
+ * the single active ref associated with ->mk_secret.
+ * There should be no structural refs beyond the one
+ * associated with the active ref.
+ */
+ WARN_ON(refcount_read(&mk->mk_active_refs) != 1);
+ WARN_ON(refcount_read(&mk->mk_struct_refs) != 1);
+ WARN_ON(!is_master_key_secret_present(&mk->mk_secret));
+ wipe_master_key_secret(&mk->mk_secret);
+ fscrypt_put_master_key_activeref(sb, mk);
+ }
+ }
+ kfree_sensitive(keyring);
sb->s_master_keys = NULL;
}
+static struct hlist_head *
+fscrypt_mk_hash_bucket(struct fscrypt_keyring *keyring,
+ const struct fscrypt_key_specifier *mk_spec)
+{
+ /*
+ * Since key specifiers should be "random" values, it is sufficient to
+ * use a trivial hash function that just takes the first several bits of
+ * the key specifier.
+ */
+ unsigned long i = get_unaligned((unsigned long *)&mk_spec->u);
+
+ return &keyring->key_hashtable[i % ARRAY_SIZE(keyring->key_hashtable)];
+}
+
/*
- * Find the specified master key in ->s_master_keys.
- * Returns ERR_PTR(-ENOKEY) if not found.
+ * Find the specified master key struct in ->s_master_keys and take a structural
+ * ref to it. The structural ref guarantees that the key struct continues to
+ * exist, but it does *not* guarantee that ->s_master_keys continues to contain
+ * the key struct. The structural ref needs to be dropped by
+ * fscrypt_put_master_key(). Returns NULL if the key struct is not found.
*/
-struct key *fscrypt_find_master_key(struct super_block *sb,
- const struct fscrypt_key_specifier *mk_spec)
+struct fscrypt_master_key *
+fscrypt_find_master_key(struct super_block *sb,
+ const struct fscrypt_key_specifier *mk_spec)
{
- struct key *keyring;
- char description[FSCRYPT_MK_DESCRIPTION_SIZE];
+ struct fscrypt_keyring *keyring;
+ struct hlist_head *bucket;
+ struct fscrypt_master_key *mk;
/*
* Pairs with the smp_store_release() in allocate_filesystem_keyring().
@@ -246,10 +286,38 @@ struct key *fscrypt_find_master_key(struct super_block *sb,
*/
keyring = smp_load_acquire(&sb->s_master_keys);
if (keyring == NULL)
- return ERR_PTR(-ENOKEY); /* No keyring yet, so no keys yet. */
-
- format_mk_description(description, mk_spec);
- return search_fscrypt_keyring(keyring, &key_type_fscrypt, description);
+ return NULL; /* No keyring yet, so no keys yet. */
+
+ bucket = fscrypt_mk_hash_bucket(keyring, mk_spec);
+ rcu_read_lock();
+ switch (mk_spec->type) {
+ case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR:
+ hlist_for_each_entry_rcu(mk, bucket, mk_node) {
+ if (mk->mk_spec.type ==
+ FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR &&
+ memcmp(mk->mk_spec.u.descriptor,
+ mk_spec->u.descriptor,
+ FSCRYPT_KEY_DESCRIPTOR_SIZE) == 0 &&
+ refcount_inc_not_zero(&mk->mk_struct_refs))
+ goto out;
+ }
+ break;
+ case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER:
+ hlist_for_each_entry_rcu(mk, bucket, mk_node) {
+ if (mk->mk_spec.type ==
+ FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER &&
+ memcmp(mk->mk_spec.u.identifier,
+ mk_spec->u.identifier,
+ FSCRYPT_KEY_IDENTIFIER_SIZE) == 0 &&
+ refcount_inc_not_zero(&mk->mk_struct_refs))
+ goto out;
+ }
+ break;
+ }
+ mk = NULL;
+out:
+ rcu_read_unlock();
+ return mk;
}
static int allocate_master_key_users_keyring(struct fscrypt_master_key *mk)
@@ -277,17 +345,30 @@ static int allocate_master_key_users_keyring(struct fscrypt_master_key *mk)
static struct key *find_master_key_user(struct fscrypt_master_key *mk)
{
char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE];
+ key_ref_t keyref;
format_mk_user_description(description, mk->mk_spec.u.identifier);
- return search_fscrypt_keyring(mk->mk_users, &key_type_fscrypt_user,
- description);
+
+ /*
+ * We need to mark the keyring reference as "possessed" so that we
+ * acquire permission to search it, via the KEY_POS_SEARCH permission.
+ */
+ keyref = keyring_search(make_key_ref(mk->mk_users, true /*possessed*/),
+ &key_type_fscrypt_user, description, false);
+ if (IS_ERR(keyref)) {
+ if (PTR_ERR(keyref) == -EAGAIN || /* not found */
+ PTR_ERR(keyref) == -EKEYREVOKED) /* recently invalidated */
+ keyref = ERR_PTR(-ENOKEY);
+ return ERR_CAST(keyref);
+ }
+ return key_ref_to_ptr(keyref);
}
/*
* Give the current user a "key" in ->mk_users. This charges the user's quota
* and marks the master key as added by the current user, so that it cannot be
- * removed by another user with the key. Either the master key's key->sem must
- * be held for write, or the master key must be still undergoing initialization.
+ * removed by another user with the key. Either ->mk_sem must be held for
+ * write, or the master key must be still undergoing initialization.
*/
static int add_master_key_user(struct fscrypt_master_key *mk)
{
@@ -309,7 +390,7 @@ static int add_master_key_user(struct fscrypt_master_key *mk)
/*
* Remove the current user's "key" from ->mk_users.
- * The master key's key->sem must be held for write.
+ * ->mk_sem must be held for write.
*
* Returns 0 if removed, -ENOKEY if not found, or another -errno code.
*/
@@ -327,63 +408,48 @@ static int remove_master_key_user(struct fscrypt_master_key *mk)
}
/*
- * Allocate a new fscrypt_master_key which contains the given secret, set it as
- * the payload of a new 'struct key' of type fscrypt, and link the 'struct key'
- * into the given keyring. Synchronized by fscrypt_add_key_mutex.
+ * Allocate a new fscrypt_master_key, transfer the given secret over to it, and
+ * insert it into sb->s_master_keys.
*/
-static int add_new_master_key(struct fscrypt_master_key_secret *secret,
- const struct fscrypt_key_specifier *mk_spec,
- struct key *keyring)
+static int add_new_master_key(struct super_block *sb,
+ struct fscrypt_master_key_secret *secret,
+ const struct fscrypt_key_specifier *mk_spec)
{
+ struct fscrypt_keyring *keyring = sb->s_master_keys;
struct fscrypt_master_key *mk;
- char description[FSCRYPT_MK_DESCRIPTION_SIZE];
- struct key *key;
int err;
mk = kzalloc(sizeof(*mk), GFP_KERNEL);
if (!mk)
return -ENOMEM;
+ init_rwsem(&mk->mk_sem);
+ refcount_set(&mk->mk_struct_refs, 1);
mk->mk_spec = *mk_spec;
- move_master_key_secret(&mk->mk_secret, secret);
-
- refcount_set(&mk->mk_refcount, 1); /* secret is present */
INIT_LIST_HEAD(&mk->mk_decrypted_inodes);
spin_lock_init(&mk->mk_decrypted_inodes_lock);
if (mk_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) {
err = allocate_master_key_users_keyring(mk);
if (err)
- goto out_free_mk;
+ goto out_put;
err = add_master_key_user(mk);
if (err)
- goto out_free_mk;
+ goto out_put;
}
- /*
- * Note that we don't charge this key to anyone's quota, since when
- * ->mk_users is in use those keys are charged instead, and otherwise
- * (when ->mk_users isn't in use) only root can add these keys.
- */
- format_mk_description(description, mk_spec);
- key = key_alloc(&key_type_fscrypt, description,
- GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(),
- KEY_POS_SEARCH | KEY_USR_SEARCH | KEY_USR_VIEW,
- KEY_ALLOC_NOT_IN_QUOTA, NULL);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto out_free_mk;
- }
- err = key_instantiate_and_link(key, mk, sizeof(*mk), keyring, NULL);
- key_put(key);
- if (err)
- goto out_free_mk;
+ move_master_key_secret(&mk->mk_secret, secret);
+ refcount_set(&mk->mk_active_refs, 1); /* ->mk_secret is present */
+ spin_lock(&keyring->lock);
+ hlist_add_head_rcu(&mk->mk_node,
+ fscrypt_mk_hash_bucket(keyring, mk_spec));
+ spin_unlock(&keyring->lock);
return 0;
-out_free_mk:
- free_master_key(mk);
+out_put:
+ fscrypt_put_master_key(mk);
return err;
}
@@ -392,42 +458,34 @@ out_free_mk:
static int add_existing_master_key(struct fscrypt_master_key *mk,
struct fscrypt_master_key_secret *secret)
{
- struct key *mk_user;
- bool rekey;
int err;
/*
* If the current user is already in ->mk_users, then there's nothing to
- * do. (Not applicable for v1 policy keys, which have NULL ->mk_users.)
+ * do. Otherwise, we need to add the user to ->mk_users. (Neither is
+ * applicable for v1 policy keys, which have NULL ->mk_users.)
*/
if (mk->mk_users) {
- mk_user = find_master_key_user(mk);
+ struct key *mk_user = find_master_key_user(mk);
+
if (mk_user != ERR_PTR(-ENOKEY)) {
if (IS_ERR(mk_user))
return PTR_ERR(mk_user);
key_put(mk_user);
return 0;
}
- }
-
- /* If we'll be re-adding ->mk_secret, try to take the reference. */
- rekey = !is_master_key_secret_present(&mk->mk_secret);
- if (rekey && !refcount_inc_not_zero(&mk->mk_refcount))
- return KEY_DEAD;
-
- /* Add the current user to ->mk_users, if applicable. */
- if (mk->mk_users) {
err = add_master_key_user(mk);
- if (err) {
- if (rekey && refcount_dec_and_test(&mk->mk_refcount))
- return KEY_DEAD;
+ if (err)
return err;
- }
}
/* Re-add the secret if needed. */
- if (rekey)
+ if (!is_master_key_secret_present(&mk->mk_secret)) {
+ if (!refcount_inc_not_zero(&mk->mk_active_refs))
+ return KEY_DEAD;
move_master_key_secret(&mk->mk_secret, secret);
+ }
+
return 0;
}
@@ -436,38 +494,36 @@ static int do_add_master_key(struct super_block *sb,
const struct fscrypt_key_specifier *mk_spec)
{
static DEFINE_MUTEX(fscrypt_add_key_mutex);
- struct key *key;
+ struct fscrypt_master_key *mk;
int err;
mutex_lock(&fscrypt_add_key_mutex); /* serialize find + link */
-retry:
- key = fscrypt_find_master_key(sb, mk_spec);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- if (err != -ENOKEY)
- goto out_unlock;
+
+ mk = fscrypt_find_master_key(sb, mk_spec);
+ if (!mk) {
/* Didn't find the key in ->s_master_keys. Add it. */
err = allocate_filesystem_keyring(sb);
- if (err)
- goto out_unlock;
- err = add_new_master_key(secret, mk_spec, sb->s_master_keys);
+ if (!err)
+ err = add_new_master_key(sb, secret, mk_spec);
} else {
/*
* Found the key in ->s_master_keys. Re-add the secret if
* needed, and add the user to ->mk_users if needed.
*/
- down_write(&key->sem);
- err = add_existing_master_key(key->payload.data[0], secret);
- up_write(&key->sem);
+ down_write(&mk->mk_sem);
+ err = add_existing_master_key(mk, secret);
+ up_write(&mk->mk_sem);
if (err == KEY_DEAD) {
- /* Key being removed or needs to be removed */
- key_invalidate(key);
- key_put(key);
- goto retry;
+ /*
+ * We found a key struct, but it's already been fully
+ * removed. Ignore the old struct and add a new one.
+ * fscrypt_add_key_mutex means we don't need to worry
+ * about concurrent adds.
+ */
+ err = add_new_master_key(sb, secret, mk_spec);
}
- key_put(key);
+ fscrypt_put_master_key(mk);
}
-out_unlock:
mutex_unlock(&fscrypt_add_key_mutex);
return err;
}
@@ -771,19 +827,19 @@ int fscrypt_verify_key_added(struct super_block *sb,
const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE])
{
struct fscrypt_key_specifier mk_spec;
- struct key *key, *mk_user;
struct fscrypt_master_key *mk;
+ struct key *mk_user;
int err;
mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
memcpy(mk_spec.u.identifier, identifier, FSCRYPT_KEY_IDENTIFIER_SIZE);
- key = fscrypt_find_master_key(sb, &mk_spec);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
+ mk = fscrypt_find_master_key(sb, &mk_spec);
+ if (!mk) {
+ err = -ENOKEY;
goto out;
}
- mk = key->payload.data[0];
+ down_read(&mk->mk_sem);
mk_user = find_master_key_user(mk);
if (IS_ERR(mk_user)) {
err = PTR_ERR(mk_user);
@@ -791,7 +847,8 @@ int fscrypt_verify_key_added(struct super_block *sb,
key_put(mk_user);
err = 0;
}
- key_put(key);
+ up_read(&mk->mk_sem);
+ fscrypt_put_master_key(mk);
out:
if (err == -ENOKEY && capable(CAP_FOWNER))
err = 0;
@@ -953,11 +1010,10 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
struct super_block *sb = file_inode(filp)->i_sb;
struct fscrypt_remove_key_arg __user *uarg = _uarg;
struct fscrypt_remove_key_arg arg;
- struct key *key;
struct fscrypt_master_key *mk;
u32 status_flags = 0;
int err;
- bool dead;
+ bool inodes_remain;
if (copy_from_user(&arg, uarg, sizeof(arg)))
return -EFAULT;
@@ -977,12 +1033,10 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
return -EACCES;
/* Find the key being removed. */
- key = fscrypt_find_master_key(sb, &arg.key_spec);
- if (IS_ERR(key))
- return PTR_ERR(key);
- mk = key->payload.data[0];
-
- down_write(&key->sem);
+ mk = fscrypt_find_master_key(sb, &arg.key_spec);
+ if (!mk)
+ return -ENOKEY;
+ down_write(&mk->mk_sem);
/* If relevant, remove current user's (or all users) claim to the key */
if (mk->mk_users && mk->mk_users->keys.nr_leaves_on_tree != 0) {
@@ -991,7 +1045,7 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
else
err = remove_master_key_user(mk);
if (err) {
- up_write(&key->sem);
+ up_write(&mk->mk_sem);
goto out_put_key;
}
if (mk->mk_users->keys.nr_leaves_on_tree != 0) {
@@ -1003,26 +1057,22 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
status_flags |=
FSCRYPT_KEY_REMOVAL_STATUS_FLAG_OTHER_USERS;
err = 0;
- up_write(&key->sem);
+ up_write(&mk->mk_sem);
goto out_put_key;
}
}
/* No user claims remaining. Go ahead and wipe the secret. */
- dead = false;
+ err = -ENOKEY;
if (is_master_key_secret_present(&mk->mk_secret)) {
wipe_master_key_secret(&mk->mk_secret);
- dead = refcount_dec_and_test(&mk->mk_refcount);
- }
- up_write(&key->sem);
- if (dead) {
- /*
- * No inodes reference the key, and we wiped the secret, so the
- * key object is free to be removed from the keyring.
- */
- key_invalidate(key);
+ fscrypt_put_master_key_activeref(sb, mk);
err = 0;
- } else {
+ }
+ inodes_remain = refcount_read(&mk->mk_active_refs) > 0;
+ up_write(&mk->mk_sem);
+
+ if (inodes_remain) {
/* Some inodes still reference this key; try to evict them. */
err = try_to_lock_encrypted_files(sb, mk);
if (err == -EBUSY) {
@@ -1038,7 +1088,7 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
* has been fully removed including all files locked.
*/
out_put_key:
- key_put(key);
+ fscrypt_put_master_key(mk);
if (err == 0)
err = put_user(status_flags, &uarg->removal_status_flags);
return err;
@@ -1085,7 +1135,6 @@ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg)
{
struct super_block *sb = file_inode(filp)->i_sb;
struct fscrypt_get_key_status_arg arg;
- struct key *key;
struct fscrypt_master_key *mk;
int err;
@@ -1102,19 +1151,18 @@ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg)
arg.user_count = 0;
memset(arg.__out_reserved, 0, sizeof(arg.__out_reserved));
- key = fscrypt_find_master_key(sb, &arg.key_spec);
- if (IS_ERR(key)) {
- if (key != ERR_PTR(-ENOKEY))
- return PTR_ERR(key);
+ mk = fscrypt_find_master_key(sb, &arg.key_spec);
+ if (!mk) {
arg.status = FSCRYPT_KEY_STATUS_ABSENT;
err = 0;
goto out;
}
- mk = key->payload.data[0];
- down_read(&key->sem);
+ down_read(&mk->mk_sem);
if (!is_master_key_secret_present(&mk->mk_secret)) {
- arg.status = FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED;
+ arg.status = refcount_read(&mk->mk_active_refs) > 0 ?
+ FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED :
+ FSCRYPT_KEY_STATUS_ABSENT /* raced with full removal */;
err = 0;
goto out_release_key;
}
@@ -1136,8 +1184,8 @@ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg)
}
err = 0;
out_release_key:
- up_read(&key->sem);
- key_put(key);
+ up_read(&mk->mk_sem);
+ fscrypt_put_master_key(mk);
out:
if (!err && copy_to_user(uarg, &arg, sizeof(arg)))
err = -EFAULT;
@@ -1149,13 +1197,9 @@ int __init fscrypt_init_keyring(void)
{
int err;
- err = register_key_type(&key_type_fscrypt);
- if (err)
- return err;
-
err = register_key_type(&key_type_fscrypt_user);
if (err)
- goto err_unregister_fscrypt;
+ return err;
err = register_key_type(&key_type_fscrypt_provisioning);
if (err)
@@ -1165,7 +1209,5 @@ int __init fscrypt_init_keyring(void)
err_unregister_fscrypt_user:
unregister_key_type(&key_type_fscrypt_user);
-err_unregister_fscrypt:
- unregister_key_type(&key_type_fscrypt);
return err;
}
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index fbc71abdabe3..94757ccd3056 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -9,7 +9,6 @@
*/
#include <crypto/skcipher.h>
-#include <linux/key.h>
#include <linux/random.h>
#include "fscrypt_private.h"
@@ -45,6 +44,21 @@ struct fscrypt_mode fscrypt_modes[] = {
.security_strength = 16,
.ivsize = 16,
},
+ [FSCRYPT_MODE_SM4_XTS] = {
+ .friendly_name = "SM4-XTS",
+ .cipher_str = "xts(sm4)",
+ .keysize = 32,
+ .security_strength = 16,
+ .ivsize = 16,
+ .blk_crypto_mode = BLK_ENCRYPTION_MODE_SM4_XTS,
+ },
+ [FSCRYPT_MODE_SM4_CTS] = {
+ .friendly_name = "SM4-CTS-CBC",
+ .cipher_str = "cts(cbc(sm4))",
+ .keysize = 16,
+ .security_strength = 16,
+ .ivsize = 16,
+ },
[FSCRYPT_MODE_ADIANTUM] = {
.friendly_name = "Adiantum",
.cipher_str = "adiantum(xchacha12,aes)",
@@ -155,10 +169,12 @@ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
}
/* Destroy a crypto transform object and/or blk-crypto key. */
-void fscrypt_destroy_prepared_key(struct fscrypt_prepared_key *prep_key)
+void fscrypt_destroy_prepared_key(struct super_block *sb,
+ struct fscrypt_prepared_key *prep_key)
{
crypto_free_skcipher(prep_key->tfm);
- fscrypt_destroy_inline_crypt_key(prep_key);
+ fscrypt_destroy_inline_crypt_key(sb, prep_key);
+ memzero_explicit(prep_key, sizeof(*prep_key));
}
/* Given a per-file encryption key, set up the file's crypto transform object */
@@ -412,20 +428,18 @@ static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk,
/*
* Find the master key, then set up the inode's actual encryption key.
*
- * If the master key is found in the filesystem-level keyring, then the
- * corresponding 'struct key' is returned in *master_key_ret with its semaphore
- * read-locked. This is needed to ensure that only one task links the
- * fscrypt_info into ->mk_decrypted_inodes (as multiple tasks may race to create
- * an fscrypt_info for the same inode), and to synchronize the master key being
- * removed with a new inode starting to use it.
+ * If the master key is found in the filesystem-level keyring, then it is
+ * returned in *mk_ret with its semaphore read-locked. This is needed to ensure
+ * that only one task links the fscrypt_info into ->mk_decrypted_inodes (as
+ * multiple tasks may race to create an fscrypt_info for the same inode), and to
+ * synchronize the master key being removed with a new inode starting to use it.
*/
static int setup_file_encryption_key(struct fscrypt_info *ci,
bool need_dirhash_key,
- struct key **master_key_ret)
+ struct fscrypt_master_key **mk_ret)
{
- struct key *key;
- struct fscrypt_master_key *mk = NULL;
struct fscrypt_key_specifier mk_spec;
+ struct fscrypt_master_key *mk;
int err;
err = fscrypt_select_encryption_impl(ci);
@@ -436,11 +450,10 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
if (err)
return err;
- key = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec);
- if (IS_ERR(key)) {
- if (key != ERR_PTR(-ENOKEY) ||
- ci->ci_policy.version != FSCRYPT_POLICY_V1)
- return PTR_ERR(key);
+ mk = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec);
+ if (!mk) {
+ if (ci->ci_policy.version != FSCRYPT_POLICY_V1)
+ return -ENOKEY;
/*
* As a legacy fallback for v1 policies, search for the key in
@@ -450,9 +463,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
*/
return fscrypt_setup_v1_file_key_via_subscribed_keyrings(ci);
}
-
- mk = key->payload.data[0];
- down_read(&key->sem);
+ down_read(&mk->mk_sem);
/* Has the secret been removed (via FS_IOC_REMOVE_ENCRYPTION_KEY)? */
if (!is_master_key_secret_present(&mk->mk_secret)) {
@@ -480,18 +491,18 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
if (err)
goto out_release_key;
- *master_key_ret = key;
+ *mk_ret = mk;
return 0;
out_release_key:
- up_read(&key->sem);
- key_put(key);
+ up_read(&mk->mk_sem);
+ fscrypt_put_master_key(mk);
return err;
}
static void put_crypt_info(struct fscrypt_info *ci)
{
- struct key *key;
+ struct fscrypt_master_key *mk;
if (!ci)
return;
@@ -499,26 +510,21 @@ static void put_crypt_info(struct fscrypt_info *ci)
if (ci->ci_direct_key)
fscrypt_put_direct_key(ci->ci_direct_key);
else if (ci->ci_owns_key)
- fscrypt_destroy_prepared_key(&ci->ci_enc_key);
-
- key = ci->ci_master_key;
- if (key) {
- struct fscrypt_master_key *mk = key->payload.data[0];
+ fscrypt_destroy_prepared_key(ci->ci_inode->i_sb,
+ &ci->ci_enc_key);
+ mk = ci->ci_master_key;
+ if (mk) {
/*
* Remove this inode from the list of inodes that were unlocked
- * with the master key.
- *
- * In addition, if we're removing the last inode from a key that
- * already had its secret removed, invalidate the key so that it
- * gets removed from ->s_master_keys.
+ * with the master key. In addition, if we're removing the last
+ * inode from a master key struct that already had its secret
+ * removed, then complete the full removal of the struct.
*/
spin_lock(&mk->mk_decrypted_inodes_lock);
list_del(&ci->ci_master_key_link);
spin_unlock(&mk->mk_decrypted_inodes_lock);
- if (refcount_dec_and_test(&mk->mk_refcount))
- key_invalidate(key);
- key_put(key);
+ fscrypt_put_master_key_activeref(ci->ci_inode->i_sb, mk);
}
memzero_explicit(ci, sizeof(*ci));
kmem_cache_free(fscrypt_info_cachep, ci);
@@ -532,7 +538,7 @@ fscrypt_setup_encryption_info(struct inode *inode,
{
struct fscrypt_info *crypt_info;
struct fscrypt_mode *mode;
- struct key *master_key = NULL;
+ struct fscrypt_master_key *mk = NULL;
int res;
res = fscrypt_initialize(inode->i_sb->s_cop->flags);
@@ -555,8 +561,7 @@ fscrypt_setup_encryption_info(struct inode *inode,
WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE);
crypt_info->ci_mode = mode;
- res = setup_file_encryption_key(crypt_info, need_dirhash_key,
- &master_key);
+ res = setup_file_encryption_key(crypt_info, need_dirhash_key, &mk);
if (res)
goto out;
@@ -571,12 +576,9 @@ fscrypt_setup_encryption_info(struct inode *inode,
* We won the race and set ->i_crypt_info to our crypt_info.
* Now link it into the master key's inode list.
*/
- if (master_key) {
- struct fscrypt_master_key *mk =
- master_key->payload.data[0];
-
- refcount_inc(&mk->mk_refcount);
- crypt_info->ci_master_key = key_get(master_key);
+ if (mk) {
+ crypt_info->ci_master_key = mk;
+ refcount_inc(&mk->mk_active_refs);
spin_lock(&mk->mk_decrypted_inodes_lock);
list_add(&crypt_info->ci_master_key_link,
&mk->mk_decrypted_inodes);
@@ -586,9 +588,9 @@ fscrypt_setup_encryption_info(struct inode *inode,
}
res = 0;
out:
- if (master_key) {
- up_read(&master_key->sem);
- key_put(master_key);
+ if (mk) {
+ up_read(&mk->mk_sem);
+ fscrypt_put_master_key(mk);
}
put_crypt_info(crypt_info);
return res;
@@ -753,7 +755,6 @@ EXPORT_SYMBOL(fscrypt_free_inode);
int fscrypt_drop_inode(struct inode *inode)
{
const struct fscrypt_info *ci = fscrypt_get_info(inode);
- const struct fscrypt_master_key *mk;
/*
* If ci is NULL, then the inode doesn't have an encryption key set up
@@ -763,7 +764,6 @@ int fscrypt_drop_inode(struct inode *inode)
*/
if (!ci || !ci->ci_master_key)
return 0;
- mk = ci->ci_master_key->payload.data[0];
/*
* With proper, non-racy use of FS_IOC_REMOVE_ENCRYPTION_KEY, all inodes
@@ -782,6 +782,6 @@ int fscrypt_drop_inode(struct inode *inode)
* then the thread removing the key will either evict the inode itself
* or will correctly detect that it wasn't evicted due to the race.
*/
- return !is_master_key_secret_present(&mk->mk_secret);
+ return !is_master_key_secret_present(&ci->ci_master_key->mk_secret);
}
EXPORT_SYMBOL_GPL(fscrypt_drop_inode);
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
index 2762c5350432..75dabd9b27f9 100644
--- a/fs/crypto/keysetup_v1.c
+++ b/fs/crypto/keysetup_v1.c
@@ -143,6 +143,7 @@ invalid:
/* Master key referenced by DIRECT_KEY policy */
struct fscrypt_direct_key {
+ struct super_block *dk_sb;
struct hlist_node dk_node;
refcount_t dk_refcount;
const struct fscrypt_mode *dk_mode;
@@ -154,7 +155,7 @@ struct fscrypt_direct_key {
static void free_direct_key(struct fscrypt_direct_key *dk)
{
if (dk) {
- fscrypt_destroy_prepared_key(&dk->dk_key);
+ fscrypt_destroy_prepared_key(dk->dk_sb, &dk->dk_key);
kfree_sensitive(dk);
}
}
@@ -231,6 +232,7 @@ fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key)
dk = kzalloc(sizeof(*dk), GFP_KERNEL);
if (!dk)
return ERR_PTR(-ENOMEM);
+ dk->dk_sb = ci->ci_inode->i_sb;
refcount_set(&dk->dk_refcount, 1);
dk->dk_mode = ci->ci_mode;
err = fscrypt_prepare_key(&dk->dk_key, raw_key, ci);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 80b8ca0f340b..893661b52376 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -61,6 +61,13 @@ fscrypt_get_dummy_policy(struct super_block *sb)
return sb->s_cop->get_dummy_policy(sb);
}
+/*
+ * Return %true if the given combination of encryption modes is supported for v1
+ * (and later) encryption policies.
+ *
+ * Do *not* add anything new here, since v1 encryption policies are deprecated.
+ * New combinations of modes should go in fscrypt_valid_enc_modes_v2() only.
+ */
static bool fscrypt_valid_enc_modes_v1(u32 contents_mode, u32 filenames_mode)
{
if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
@@ -83,6 +90,11 @@ static bool fscrypt_valid_enc_modes_v2(u32 contents_mode, u32 filenames_mode)
if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
filenames_mode == FSCRYPT_MODE_AES_256_HCTR2)
return true;
+
+ if (contents_mode == FSCRYPT_MODE_SM4_XTS &&
+ filenames_mode == FSCRYPT_MODE_SM4_CTS)
+ return true;
+
return fscrypt_valid_enc_modes_v1(contents_mode, filenames_mode);
}
@@ -744,12 +756,8 @@ int fscrypt_set_context(struct inode *inode, void *fs_data)
* delayed key setup that requires the inode number.
*/
if (ci->ci_policy.version == FSCRYPT_POLICY_V2 &&
- (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) {
- const struct fscrypt_master_key *mk =
- ci->ci_master_key->payload.data[0];
-
- fscrypt_hash_inode_number(ci, mk);
- }
+ (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32))
+ fscrypt_hash_inode_number(ci, ci->ci_master_key);
return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, fs_data);
}
@@ -833,19 +841,6 @@ bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1,
}
EXPORT_SYMBOL_GPL(fscrypt_dummy_policies_equal);
-/* Deprecated, do not use */
-int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg,
- struct fscrypt_dummy_policy *dummy_policy)
-{
- struct fs_parameter param = {
- .type = fs_value_is_string,
- .string = arg ? (char *)arg : "",
- };
- return fscrypt_parse_test_dummy_encryption(&param, dummy_policy) ?:
- fscrypt_add_test_dummy_key(sb, dummy_policy);
-}
-EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption);
-
/**
* fscrypt_show_test_dummy_encryption() - show '-o test_dummy_encryption'
* @seq: the seq_file to print the option to
diff --git a/fs/d_path.c b/fs/d_path.c
index e4e0ebad1f15..56a6ee4c6331 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -34,7 +34,7 @@ static bool prepend_char(struct prepend_buffer *p, unsigned char c)
}
/*
- * The source of the prepend data can be an optimistoc load
+ * The source of the prepend data can be an optimistic load
* of a dentry name and length. And because we don't hold any
* locks, the length and the pointer to the name may not be
* in sync if a concurrent rename happens, and the kernel
@@ -297,8 +297,7 @@ EXPORT_SYMBOL(d_path);
/*
* Helper function for dentry_operations.d_dname() members
*/
-char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
- const char *fmt, ...)
+char *dynamic_dname(char *buffer, int buflen, const char *fmt, ...)
{
va_list args;
char temp[64];
diff --git a/fs/dax.c b/fs/dax.c
index 1c6867810cbd..c48a3a93ab29 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -334,35 +334,41 @@ static unsigned long dax_end_pfn(void *entry)
for (pfn = dax_to_pfn(entry); \
pfn < dax_end_pfn(entry); pfn++)
-static inline bool dax_mapping_is_cow(struct address_space *mapping)
+static inline bool dax_page_is_shared(struct page *page)
{
- return (unsigned long)mapping == PAGE_MAPPING_DAX_COW;
+ return page->mapping == PAGE_MAPPING_DAX_SHARED;
}
/*
- * Set the page->mapping with FS_DAX_MAPPING_COW flag, increase the refcount.
+ * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the
+ * refcount.
*/
-static inline void dax_mapping_set_cow(struct page *page)
+static inline void dax_page_share_get(struct page *page)
{
- if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) {
+ if (page->mapping != PAGE_MAPPING_DAX_SHARED) {
/*
* Reset the index if the page was already mapped
* regularly before.
*/
if (page->mapping)
- page->index = 1;
- page->mapping = (void *)PAGE_MAPPING_DAX_COW;
+ page->share = 1;
+ page->mapping = PAGE_MAPPING_DAX_SHARED;
}
- page->index++;
+ page->share++;
+}
+
+static inline unsigned long dax_page_share_put(struct page *page)
+{
+ return --page->share;
}
/*
- * When it is called in dax_insert_entry(), the cow flag will indicate that
+ * When it is called in dax_insert_entry(), the shared flag will indicate that
* whether this entry is shared by multiple files. If so, set the page->mapping
- * FS_DAX_MAPPING_COW, and use page->index as refcount.
+ * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount.
*/
static void dax_associate_entry(void *entry, struct address_space *mapping,
- struct vm_area_struct *vma, unsigned long address, bool cow)
+ struct vm_area_struct *vma, unsigned long address, bool shared)
{
unsigned long size = dax_entry_size(entry), pfn, index;
int i = 0;
@@ -374,8 +380,8 @@ static void dax_associate_entry(void *entry, struct address_space *mapping,
for_each_mapped_pfn(entry, pfn) {
struct page *page = pfn_to_page(pfn);
- if (cow) {
- dax_mapping_set_cow(page);
+ if (shared) {
+ dax_page_share_get(page);
} else {
WARN_ON_ONCE(page->mapping);
page->mapping = mapping;
@@ -396,9 +402,9 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
struct page *page = pfn_to_page(pfn);
WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
- if (dax_mapping_is_cow(page->mapping)) {
- /* keep the CoW flag if this page is still shared */
- if (page->index-- > 0)
+ if (dax_page_is_shared(page)) {
+ /* keep the shared flag if this page is still shared */
+ if (dax_page_share_put(page) > 0)
continue;
} else
WARN_ON_ONCE(page->mapping && page->mapping != mapping);
@@ -840,12 +846,6 @@ static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
(iter->iomap.flags & IOMAP_F_DIRTY);
}
-static bool dax_fault_is_cow(const struct iomap_iter *iter)
-{
- return (iter->flags & IOMAP_WRITE) &&
- (iter->iomap.flags & IOMAP_F_SHARED);
-}
-
/*
* By this point grab_mapping_entry() has ensured that we have a locked entry
* of the appropriate size so we don't have to worry about downgrading PMDs to
@@ -859,13 +859,14 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
void *new_entry = dax_make_entry(pfn, flags);
- bool dirty = !dax_fault_is_synchronous(iter, vmf->vma);
- bool cow = dax_fault_is_cow(iter);
+ bool write = iter->flags & IOMAP_WRITE;
+ bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma);
+ bool shared = iter->iomap.flags & IOMAP_F_SHARED;
if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
+ if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
unsigned long index = xas->xa_index;
/* we are replacing a zero page with block mapping */
if (dax_is_pmd_entry(entry))
@@ -877,12 +878,12 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
xas_reset(xas);
xas_lock_irq(xas);
- if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+ if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
void *old;
dax_disassociate_entry(entry, mapping, false);
dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
- cow);
+ shared);
/*
* Only swap our new entry into the page cache if the current
* entry is a zero page or an empty entry. If a normal PTE or
@@ -902,7 +903,7 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
if (dirty)
xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
- if (cow)
+ if (write && shared)
xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
xas_unlock_irq(xas);
@@ -1086,7 +1087,8 @@ out:
}
/**
- * dax_iomap_cow_copy - Copy the data from source to destination before write
+ * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page
+ * by copying the data before and after the range to be written.
* @pos: address to do copy from.
* @length: size of copy operation.
* @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE)
@@ -1095,35 +1097,50 @@ out:
*
* This can be called from two places. Either during DAX write fault (page
* aligned), to copy the length size data to daddr. Or, while doing normal DAX
- * write operation, dax_iomap_actor() might call this to do the copy of either
+ * write operation, dax_iomap_iter() might call this to do the copy of either
* start or end unaligned address. In the latter case the rest of the copy of
- * aligned ranges is taken care by dax_iomap_actor() itself.
+ * aligned ranges is taken care by dax_iomap_iter() itself.
+ * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the
+ * area to make sure no old data remains.
*/
-static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size,
+static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size,
const struct iomap *srcmap, void *daddr)
{
loff_t head_off = pos & (align_size - 1);
size_t size = ALIGN(head_off + length, align_size);
loff_t end = pos + length;
loff_t pg_end = round_up(end, align_size);
+ /* copy_all is usually in page fault case */
bool copy_all = head_off == 0 && end == pg_end;
+ /* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */
+ bool zero_edge = srcmap->flags & IOMAP_F_SHARED ||
+ srcmap->type == IOMAP_UNWRITTEN;
void *saddr = 0;
int ret = 0;
- ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
- if (ret)
- return ret;
+ if (!zero_edge) {
+ ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
+ if (ret)
+ return ret;
+ }
if (copy_all) {
- ret = copy_mc_to_kernel(daddr, saddr, length);
- return ret ? -EIO : 0;
+ if (zero_edge)
+ memset(daddr, 0, size);
+ else
+ ret = copy_mc_to_kernel(daddr, saddr, length);
+ goto out;
}
/* Copy the head part of the range */
if (head_off) {
- ret = copy_mc_to_kernel(daddr, saddr, head_off);
- if (ret)
- return -EIO;
+ if (zero_edge)
+ memset(daddr, 0, head_off);
+ else {
+ ret = copy_mc_to_kernel(daddr, saddr, head_off);
+ if (ret)
+ return -EIO;
+ }
}
/* Copy the tail part of the range */
@@ -1131,12 +1148,19 @@ static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size,
loff_t tail_off = head_off + length;
loff_t tail_len = pg_end - end;
- ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off,
- tail_len);
- if (ret)
- return -EIO;
+ if (zero_edge)
+ memset(daddr + tail_off, 0, tail_len);
+ else {
+ ret = copy_mc_to_kernel(daddr + tail_off,
+ saddr + tail_off, tail_len);
+ if (ret)
+ return -EIO;
+ }
}
- return 0;
+out:
+ if (zero_edge)
+ dax_flush(srcmap->dax_dev, daddr, size);
+ return ret ? -EIO : 0;
}
/*
@@ -1221,6 +1245,58 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
}
#endif /* CONFIG_FS_DAX_PMD */
+static s64 dax_unshare_iter(struct iomap_iter *iter)
+{
+ struct iomap *iomap = &iter->iomap;
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ loff_t pos = iter->pos;
+ loff_t length = iomap_length(iter);
+ int id = 0;
+ s64 ret = 0;
+ void *daddr = NULL, *saddr = NULL;
+
+ /* don't bother with blocks that are not shared to start with */
+ if (!(iomap->flags & IOMAP_F_SHARED))
+ return length;
+ /* don't bother with holes or unwritten extents */
+ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
+ return length;
+
+ id = dax_read_lock();
+ ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL);
+ if (ret < 0)
+ goto out_unlock;
+
+ ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL);
+ if (ret < 0)
+ goto out_unlock;
+
+ ret = copy_mc_to_kernel(daddr, saddr, length);
+ if (ret)
+ ret = -EIO;
+
+out_unlock:
+ dax_read_unlock(id);
+ return ret;
+}
+
+int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
+ const struct iomap_ops *ops)
+{
+ struct iomap_iter iter = {
+ .inode = inode,
+ .pos = pos,
+ .len = len,
+ .flags = IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX,
+ };
+ int ret;
+
+ while ((ret = iomap_iter(&iter, ops)) > 0)
+ iter.processed = dax_unshare_iter(&iter);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dax_file_unshare);
+
static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
{
const struct iomap *iomap = &iter->iomap;
@@ -1235,13 +1311,10 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
if (ret < 0)
return ret;
memset(kaddr + offset, 0, size);
- if (srcmap->addr != iomap->addr) {
- ret = dax_iomap_cow_copy(pos, size, PAGE_SIZE, srcmap,
- kaddr);
- if (ret < 0)
- return ret;
- dax_flush(iomap->dax_dev, kaddr, PAGE_SIZE);
- } else
+ if (iomap->flags & IOMAP_F_SHARED)
+ ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap,
+ kaddr);
+ else
dax_flush(iomap->dax_dev, kaddr + offset, size);
return ret;
}
@@ -1258,6 +1331,15 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
return length;
+ /*
+ * invalidate the pages whose sharing state is to be changed
+ * because of CoW.
+ */
+ if (iomap->flags & IOMAP_F_SHARED)
+ invalidate_inode_pages2_range(iter->inode->i_mapping,
+ pos >> PAGE_SHIFT,
+ (pos + length - 1) >> PAGE_SHIFT);
+
do {
unsigned offset = offset_in_page(pos);
unsigned size = min_t(u64, PAGE_SIZE - offset, length);
@@ -1318,12 +1400,13 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
struct iov_iter *iter)
{
const struct iomap *iomap = &iomi->iomap;
- const struct iomap *srcmap = &iomi->srcmap;
+ const struct iomap *srcmap = iomap_iter_srcmap(iomi);
loff_t length = iomap_length(iomi);
loff_t pos = iomi->pos;
struct dax_device *dax_dev = iomap->dax_dev;
loff_t end = pos + length, done = 0;
bool write = iov_iter_rw(iter) == WRITE;
+ bool cow = write && iomap->flags & IOMAP_F_SHARED;
ssize_t ret = 0;
size_t xfer;
int id;
@@ -1350,7 +1433,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
* into page tables. We have to tear down these mappings so that data
* written by write(2) is visible in mmap.
*/
- if (iomap->flags & IOMAP_F_NEW) {
+ if (iomap->flags & IOMAP_F_NEW || cow) {
invalidate_inode_pages2_range(iomi->inode->i_mapping,
pos >> PAGE_SHIFT,
(end - 1) >> PAGE_SHIFT);
@@ -1384,10 +1467,9 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
break;
}
- if (write &&
- srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
- ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap,
- kaddr);
+ if (cow) {
+ ret = dax_iomap_copy_around(pos, length, PAGE_SIZE,
+ srcmap, kaddr);
if (ret)
break;
}
@@ -1532,7 +1614,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
struct xa_state *xas, void **entry, bool pmd)
{
const struct iomap *iomap = &iter->iomap;
- const struct iomap *srcmap = &iter->srcmap;
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
bool write = iter->flags & IOMAP_WRITE;
@@ -1563,9 +1645,8 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags);
- if (write &&
- srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
- err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr);
+ if (write && iomap->flags & IOMAP_F_SHARED) {
+ err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr);
if (err)
return dax_fault_return(err);
}
@@ -1936,15 +2017,15 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
.len = len,
.flags = IOMAP_DAX,
};
- int ret;
+ int ret, compared = 0;
- while ((ret = iomap_iter(&src_iter, ops)) > 0) {
- while ((ret = iomap_iter(&dst_iter, ops)) > 0) {
- dst_iter.processed = dax_range_compare_iter(&src_iter,
- &dst_iter, len, same);
- }
- if (ret <= 0)
- src_iter.processed = ret;
+ while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
+ (ret = iomap_iter(&dst_iter, ops)) > 0) {
+ compared = dax_range_compare_iter(&src_iter, &dst_iter, len,
+ same);
+ if (compared < 0)
+ return ret;
+ src_iter.processed = dst_iter.processed = compared;
}
return ret;
}
diff --git a/fs/dcache.c b/fs/dcache.c
index bb0c4d0038db..52e6d5fdab6b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2597,15 +2597,7 @@ EXPORT_SYMBOL(d_rehash);
static inline unsigned start_dir_add(struct inode *dir)
{
- /*
- * The caller holds a spinlock (dentry::d_lock). On !PREEMPT_RT
- * kernels spin_lock() implicitly disables preemption, but not on
- * PREEMPT_RT. So for RT it has to be done explicitly to protect
- * the sequence count write side critical section against a reader
- * or another writer preempting, which would result in a live lock.
- */
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_disable();
+ preempt_disable_nested();
for (;;) {
unsigned n = dir->i_dir_seq;
if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
@@ -2618,8 +2610,7 @@ static inline void end_dir_add(struct inode *dir, unsigned int n,
wait_queue_head_t *d_wait)
{
smp_store_release(&dir->i_dir_seq, n + 2);
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_enable();
+ preempt_enable_nested();
wake_up_all(d_wait);
}
@@ -3258,8 +3249,10 @@ void d_genocide(struct dentry *parent)
EXPORT_SYMBOL(d_genocide);
-void d_tmpfile(struct dentry *dentry, struct inode *inode)
+void d_tmpfile(struct file *file, struct inode *inode)
{
+ struct dentry *dentry = file->f_path.dentry;
+
inode_dec_link_count(inode);
BUG_ON(dentry->d_name.name != dentry->d_iname ||
!hlist_unhashed(&dentry->d_u.d_alias) ||
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 950c63fa4d0b..b54f470e0d03 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -378,8 +378,8 @@ ssize_t debugfs_attr_read(struct file *file, char __user *buf,
}
EXPORT_SYMBOL_GPL(debugfs_attr_read);
-ssize_t debugfs_attr_write(struct file *file, const char __user *buf,
- size_t len, loff_t *ppos)
+static ssize_t debugfs_attr_write_xsigned(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos, bool is_signed)
{
struct dentry *dentry = F_DENTRY(file);
ssize_t ret;
@@ -387,12 +387,28 @@ ssize_t debugfs_attr_write(struct file *file, const char __user *buf,
ret = debugfs_file_get(dentry);
if (unlikely(ret))
return ret;
- ret = simple_attr_write(file, buf, len, ppos);
+ if (is_signed)
+ ret = simple_attr_write_signed(file, buf, len, ppos);
+ else
+ ret = simple_attr_write(file, buf, len, ppos);
debugfs_file_put(dentry);
return ret;
}
+
+ssize_t debugfs_attr_write(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ return debugfs_attr_write_xsigned(file, buf, len, ppos, false);
+}
EXPORT_SYMBOL_GPL(debugfs_attr_write);
+ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ return debugfs_attr_write_xsigned(file, buf, len, ppos, true);
+}
+EXPORT_SYMBOL_GPL(debugfs_attr_write_signed);
+
static struct dentry *debugfs_create_mode_unsafe(const char *name, umode_t mode,
struct dentry *parent, void *value,
const struct file_operations *fops,
@@ -738,11 +754,11 @@ static int debugfs_atomic_t_get(void *data, u64 *val)
*val = atomic_read((atomic_t *)data);
return 0;
}
-DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get,
+DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t, debugfs_atomic_t_get,
debugfs_atomic_t_set, "%lld\n");
-DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL,
+DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_ro, debugfs_atomic_t_get, NULL,
"%lld\n");
-DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set,
+DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_wo, NULL, debugfs_atomic_t_set,
"%lld\n");
/**
@@ -1121,7 +1137,7 @@ void debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs,
}
EXPORT_SYMBOL_GPL(debugfs_print_regs32);
-static int debugfs_show_regset32(struct seq_file *s, void *data)
+static int debugfs_regset32_show(struct seq_file *s, void *data)
{
struct debugfs_regset32 *regset = s->private;
@@ -1136,17 +1152,7 @@ static int debugfs_show_regset32(struct seq_file *s, void *data)
return 0;
}
-static int debugfs_open_regset32(struct inode *inode, struct file *file)
-{
- return single_open(file, debugfs_show_regset32, inode->i_private);
-}
-
-static const struct file_operations fops_regset32 = {
- .open = debugfs_open_regset32,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(debugfs_regset32);
/**
* debugfs_create_regset32 - create a debugfs file that returns register values
@@ -1167,7 +1173,7 @@ void debugfs_create_regset32(const char *name, umode_t mode,
struct dentry *parent,
struct debugfs_regset32 *regset)
{
- debugfs_create_file(name, mode, parent, regset, &fops_regset32);
+ debugfs_create_file(name, mode, parent, regset, &debugfs_regset32_fops);
}
EXPORT_SYMBOL_GPL(debugfs_create_regset32);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 232cfdf095ae..2e8e112b1993 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -82,6 +82,8 @@ struct debugfs_mount_opts {
kuid_t uid;
kgid_t gid;
umode_t mode;
+ /* Opt_* bitfield. */
+ unsigned int opts;
};
enum {
@@ -111,6 +113,7 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts)
kgid_t gid;
char *p;
+ opts->opts = 0;
opts->mode = DEBUGFS_DEFAULT_MODE;
while ((p = strsep(&data, ",")) != NULL) {
@@ -145,24 +148,44 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts)
* but traditionally debugfs has ignored all mount options
*/
}
+
+ opts->opts |= BIT(token);
}
return 0;
}
-static int debugfs_apply_options(struct super_block *sb)
+static void _debugfs_apply_options(struct super_block *sb, bool remount)
{
struct debugfs_fs_info *fsi = sb->s_fs_info;
struct inode *inode = d_inode(sb->s_root);
struct debugfs_mount_opts *opts = &fsi->mount_opts;
- inode->i_mode &= ~S_IALLUGO;
- inode->i_mode |= opts->mode;
+ /*
+ * On remount, only reset mode/uid/gid if they were provided as mount
+ * options.
+ */
- inode->i_uid = opts->uid;
- inode->i_gid = opts->gid;
+ if (!remount || opts->opts & BIT(Opt_mode)) {
+ inode->i_mode &= ~S_IALLUGO;
+ inode->i_mode |= opts->mode;
+ }
- return 0;
+ if (!remount || opts->opts & BIT(Opt_uid))
+ inode->i_uid = opts->uid;
+
+ if (!remount || opts->opts & BIT(Opt_gid))
+ inode->i_gid = opts->gid;
+}
+
+static void debugfs_apply_options(struct super_block *sb)
+{
+ _debugfs_apply_options(sb, false);
+}
+
+static void debugfs_apply_options_remount(struct super_block *sb)
+{
+ _debugfs_apply_options(sb, true);
}
static int debugfs_remount(struct super_block *sb, int *flags, char *data)
@@ -175,7 +198,7 @@ static int debugfs_remount(struct super_block *sb, int *flags, char *data)
if (err)
goto fail;
- debugfs_apply_options(sb);
+ debugfs_apply_options_remount(sb);
fail:
return err;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index f669163d5860..03d381377ae1 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -421,8 +421,6 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
unsigned long flags;
bio->bi_private = dio;
- /* don't account direct I/O as memory stall */
- bio_clear_flag(bio, BIO_WORKINGSET);
spin_lock_irqsave(&dio->bio_lock, flags);
dio->refcount++;
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 19ef136f9e4f..26fef9945cc9 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -12,55 +12,67 @@
#include <trace/events/dlm.h>
#include "dlm_internal.h"
+#include "memory.h"
#include "lock.h"
#include "user.h"
#include "ast.h"
-static uint64_t dlm_cb_seq;
-static DEFINE_SPINLOCK(dlm_cb_seq_spin);
+void dlm_release_callback(struct kref *ref)
+{
+ struct dlm_callback *cb = container_of(ref, struct dlm_callback, ref);
+
+ dlm_free_cb(cb);
+}
+
+void dlm_callback_set_last_ptr(struct dlm_callback **from,
+ struct dlm_callback *to)
+{
+ if (*from)
+ kref_put(&(*from)->ref, dlm_release_callback);
+
+ if (to)
+ kref_get(&to->ref);
+
+ *from = to;
+}
-static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb)
+void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb)
{
- int i;
-
- log_print("last_bast %x %llu flags %x mode %d sb %d %x",
- lkb->lkb_id,
- (unsigned long long)lkb->lkb_last_bast.seq,
- lkb->lkb_last_bast.flags,
- lkb->lkb_last_bast.mode,
- lkb->lkb_last_bast.sb_status,
- lkb->lkb_last_bast.sb_flags);
-
- log_print("last_cast %x %llu flags %x mode %d sb %d %x",
- lkb->lkb_id,
- (unsigned long long)lkb->lkb_last_cast.seq,
- lkb->lkb_last_cast.flags,
- lkb->lkb_last_cast.mode,
- lkb->lkb_last_cast.sb_status,
- lkb->lkb_last_cast.sb_flags);
-
- for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
- log_print("cb %x %llu flags %x mode %d sb %d %x",
- lkb->lkb_id,
- (unsigned long long)lkb->lkb_callbacks[i].seq,
- lkb->lkb_callbacks[i].flags,
- lkb->lkb_callbacks[i].mode,
- lkb->lkb_callbacks[i].sb_status,
- lkb->lkb_callbacks[i].sb_flags);
+ struct dlm_callback *cb, *safe;
+
+ list_for_each_entry_safe(cb, safe, &lkb->lkb_callbacks, list) {
+ list_del(&cb->list);
+ kref_put(&cb->ref, dlm_release_callback);
}
+
+ lkb->lkb_flags &= ~DLM_IFL_CB_PENDING;
+
+ /* invalidate */
+ dlm_callback_set_last_ptr(&lkb->lkb_last_cast, NULL);
+ dlm_callback_set_last_ptr(&lkb->lkb_last_cb, NULL);
+ lkb->lkb_last_bast_mode = -1;
}
-int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
- int status, uint32_t sbflags, uint64_t seq)
+int dlm_enqueue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
+ int status, uint32_t sbflags)
{
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
- uint64_t prev_seq;
+ int rv = DLM_ENQUEUE_CALLBACK_SUCCESS;
+ struct dlm_callback *cb;
int prev_mode;
- int i, rv;
- for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
- if (lkb->lkb_callbacks[i].seq)
- continue;
+ if (flags & DLM_CB_BAST) {
+ /* if cb is a bast, it should be skipped if the blocking mode is
+ * compatible with the last granted mode
+ */
+ if (lkb->lkb_last_cast) {
+ if (dlm_modes_compat(mode, lkb->lkb_last_cast->mode)) {
+ log_debug(ls, "skip %x bast mode %d for cast mode %d",
+ lkb->lkb_id, mode,
+ lkb->lkb_last_cast->mode);
+ goto out;
+ }
+ }
/*
* Suppress some redundant basts here, do more on removal.
@@ -68,148 +80,95 @@ int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
* is a bast for the same mode or a more restrictive mode.
* (the addional > PR check is needed for PR/CW inversion)
*/
-
- if ((i > 0) && (flags & DLM_CB_BAST) &&
- (lkb->lkb_callbacks[i-1].flags & DLM_CB_BAST)) {
-
- prev_seq = lkb->lkb_callbacks[i-1].seq;
- prev_mode = lkb->lkb_callbacks[i-1].mode;
+ if (lkb->lkb_last_cb && lkb->lkb_last_cb->flags & DLM_CB_BAST) {
+ prev_mode = lkb->lkb_last_cb->mode;
if ((prev_mode == mode) ||
(prev_mode > mode && prev_mode > DLM_LOCK_PR)) {
-
- log_debug(ls, "skip %x add bast %llu mode %d "
- "for bast %llu mode %d",
- lkb->lkb_id,
- (unsigned long long)seq,
- mode,
- (unsigned long long)prev_seq,
- prev_mode);
- rv = 0;
+ log_debug(ls, "skip %x add bast mode %d for bast mode %d",
+ lkb->lkb_id, mode, prev_mode);
goto out;
}
}
-
- lkb->lkb_callbacks[i].seq = seq;
- lkb->lkb_callbacks[i].flags = flags;
- lkb->lkb_callbacks[i].mode = mode;
- lkb->lkb_callbacks[i].sb_status = status;
- lkb->lkb_callbacks[i].sb_flags = (sbflags & 0x000000FF);
- rv = 0;
- break;
}
- if (i == DLM_CALLBACKS_SIZE) {
- log_error(ls, "no callbacks %x %llu flags %x mode %d sb %d %x",
- lkb->lkb_id, (unsigned long long)seq,
- flags, mode, status, sbflags);
- dlm_dump_lkb_callbacks(lkb);
- rv = -1;
- goto out;
- }
- out:
- return rv;
-}
-
-int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb,
- struct dlm_callback *cb, int *resid)
-{
- int i, rv;
-
- *resid = 0;
-
- if (!lkb->lkb_callbacks[0].seq) {
- rv = -ENOENT;
+ cb = dlm_allocate_cb();
+ if (!cb) {
+ rv = DLM_ENQUEUE_CALLBACK_FAILURE;
goto out;
}
- /* oldest undelivered cb is callbacks[0] */
-
- memcpy(cb, &lkb->lkb_callbacks[0], sizeof(struct dlm_callback));
- memset(&lkb->lkb_callbacks[0], 0, sizeof(struct dlm_callback));
-
- /* shift others down */
-
- for (i = 1; i < DLM_CALLBACKS_SIZE; i++) {
- if (!lkb->lkb_callbacks[i].seq)
- break;
- memcpy(&lkb->lkb_callbacks[i-1], &lkb->lkb_callbacks[i],
- sizeof(struct dlm_callback));
- memset(&lkb->lkb_callbacks[i], 0, sizeof(struct dlm_callback));
- (*resid)++;
+ cb->flags = flags;
+ cb->mode = mode;
+ cb->sb_status = status;
+ cb->sb_flags = (sbflags & 0x000000FF);
+ kref_init(&cb->ref);
+ if (!(lkb->lkb_flags & DLM_IFL_CB_PENDING)) {
+ lkb->lkb_flags |= DLM_IFL_CB_PENDING;
+ rv = DLM_ENQUEUE_CALLBACK_NEED_SCHED;
}
+ list_add_tail(&cb->list, &lkb->lkb_callbacks);
- /* if cb is a bast, it should be skipped if the blocking mode is
- compatible with the last granted mode */
-
- if ((cb->flags & DLM_CB_BAST) && lkb->lkb_last_cast.seq) {
- if (dlm_modes_compat(cb->mode, lkb->lkb_last_cast.mode)) {
- cb->flags |= DLM_CB_SKIP;
-
- log_debug(ls, "skip %x bast %llu mode %d "
- "for cast %llu mode %d",
- lkb->lkb_id,
- (unsigned long long)cb->seq,
- cb->mode,
- (unsigned long long)lkb->lkb_last_cast.seq,
- lkb->lkb_last_cast.mode);
- rv = 0;
- goto out;
- }
- }
+ if (flags & DLM_CB_CAST)
+ dlm_callback_set_last_ptr(&lkb->lkb_last_cast, cb);
- if (cb->flags & DLM_CB_CAST) {
- memcpy(&lkb->lkb_last_cast, cb, sizeof(struct dlm_callback));
- lkb->lkb_last_cast_time = ktime_get();
- }
+ dlm_callback_set_last_ptr(&lkb->lkb_last_cb, cb);
- if (cb->flags & DLM_CB_BAST) {
- memcpy(&lkb->lkb_last_bast, cb, sizeof(struct dlm_callback));
- lkb->lkb_last_bast_time = ktime_get();
- }
- rv = 0;
out:
return rv;
}
+int dlm_dequeue_lkb_callback(struct dlm_lkb *lkb, struct dlm_callback **cb)
+{
+ /* oldest undelivered cb is callbacks first entry */
+ *cb = list_first_entry_or_null(&lkb->lkb_callbacks,
+ struct dlm_callback, list);
+ if (!*cb)
+ return DLM_DEQUEUE_CALLBACK_EMPTY;
+
+ /* remove it from callbacks so shift others down */
+ list_del(&(*cb)->list);
+ if (list_empty(&lkb->lkb_callbacks))
+ return DLM_DEQUEUE_CALLBACK_LAST;
+
+ return DLM_DEQUEUE_CALLBACK_SUCCESS;
+}
+
void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
uint32_t sbflags)
{
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
- uint64_t new_seq, prev_seq;
int rv;
- spin_lock(&dlm_cb_seq_spin);
- new_seq = ++dlm_cb_seq;
- if (!dlm_cb_seq)
- new_seq = ++dlm_cb_seq;
- spin_unlock(&dlm_cb_seq_spin);
-
if (lkb->lkb_flags & DLM_IFL_USER) {
- dlm_user_add_ast(lkb, flags, mode, status, sbflags, new_seq);
+ dlm_user_add_ast(lkb, flags, mode, status, sbflags);
return;
}
- mutex_lock(&lkb->lkb_cb_mutex);
- prev_seq = lkb->lkb_callbacks[0].seq;
-
- rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, new_seq);
- if (rv < 0)
- goto out;
-
- if (!prev_seq) {
+ spin_lock(&lkb->lkb_cb_lock);
+ rv = dlm_enqueue_lkb_callback(lkb, flags, mode, status, sbflags);
+ switch (rv) {
+ case DLM_ENQUEUE_CALLBACK_NEED_SCHED:
kref_get(&lkb->lkb_ref);
+ spin_lock(&ls->ls_cb_lock);
if (test_bit(LSFL_CB_DELAY, &ls->ls_flags)) {
- mutex_lock(&ls->ls_cb_mutex);
list_add(&lkb->lkb_cb_list, &ls->ls_cb_delay);
- mutex_unlock(&ls->ls_cb_mutex);
} else {
queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work);
}
+ spin_unlock(&ls->ls_cb_lock);
+ break;
+ case DLM_ENQUEUE_CALLBACK_FAILURE:
+ WARN_ON_ONCE(1);
+ break;
+ case DLM_ENQUEUE_CALLBACK_SUCCESS:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
}
- out:
- mutex_unlock(&lkb->lkb_cb_mutex);
+ spin_unlock(&lkb->lkb_cb_lock);
}
void dlm_callback_work(struct work_struct *work)
@@ -218,53 +177,46 @@ void dlm_callback_work(struct work_struct *work)
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
void (*castfn) (void *astparam);
void (*bastfn) (void *astparam, int mode);
- struct dlm_callback callbacks[DLM_CALLBACKS_SIZE];
- int i, rv, resid;
-
- memset(&callbacks, 0, sizeof(callbacks));
+ struct dlm_callback *cb;
+ int rv;
- mutex_lock(&lkb->lkb_cb_mutex);
- if (!lkb->lkb_callbacks[0].seq) {
- /* no callback work exists, shouldn't happen */
- log_error(ls, "dlm_callback_work %x no work", lkb->lkb_id);
- dlm_print_lkb(lkb);
- dlm_dump_lkb_callbacks(lkb);
- }
+ spin_lock(&lkb->lkb_cb_lock);
+ rv = dlm_dequeue_lkb_callback(lkb, &cb);
+ spin_unlock(&lkb->lkb_cb_lock);
- for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
- rv = dlm_rem_lkb_callback(ls, lkb, &callbacks[i], &resid);
- if (rv < 0)
- break;
- }
+ if (WARN_ON_ONCE(rv == DLM_DEQUEUE_CALLBACK_EMPTY))
+ goto out;
- if (resid) {
- /* cbs remain, loop should have removed all, shouldn't happen */
- log_error(ls, "dlm_callback_work %x resid %d", lkb->lkb_id,
- resid);
- dlm_print_lkb(lkb);
- dlm_dump_lkb_callbacks(lkb);
- }
- mutex_unlock(&lkb->lkb_cb_mutex);
+ for (;;) {
+ castfn = lkb->lkb_astfn;
+ bastfn = lkb->lkb_bastfn;
+
+ if (cb->flags & DLM_CB_BAST) {
+ trace_dlm_bast(ls, lkb, cb->mode);
+ lkb->lkb_last_bast_time = ktime_get();
+ lkb->lkb_last_bast_mode = cb->mode;
+ bastfn(lkb->lkb_astparam, cb->mode);
+ } else if (cb->flags & DLM_CB_CAST) {
+ lkb->lkb_lksb->sb_status = cb->sb_status;
+ lkb->lkb_lksb->sb_flags = cb->sb_flags;
+ trace_dlm_ast(ls, lkb);
+ lkb->lkb_last_cast_time = ktime_get();
+ castfn(lkb->lkb_astparam);
+ }
- castfn = lkb->lkb_astfn;
- bastfn = lkb->lkb_bastfn;
+ kref_put(&cb->ref, dlm_release_callback);
- for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
- if (!callbacks[i].seq)
+ spin_lock(&lkb->lkb_cb_lock);
+ rv = dlm_dequeue_lkb_callback(lkb, &cb);
+ if (rv == DLM_DEQUEUE_CALLBACK_EMPTY) {
+ lkb->lkb_flags &= ~DLM_IFL_CB_PENDING;
+ spin_unlock(&lkb->lkb_cb_lock);
break;
- if (callbacks[i].flags & DLM_CB_SKIP) {
- continue;
- } else if (callbacks[i].flags & DLM_CB_BAST) {
- trace_dlm_bast(ls, lkb, callbacks[i].mode);
- bastfn(lkb->lkb_astparam, callbacks[i].mode);
- } else if (callbacks[i].flags & DLM_CB_CAST) {
- lkb->lkb_lksb->sb_status = callbacks[i].sb_status;
- lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags;
- trace_dlm_ast(ls, lkb);
- castfn(lkb->lkb_astparam);
}
+ spin_unlock(&lkb->lkb_cb_lock);
}
+out:
/* undo kref_get from dlm_add_callback, may cause lkb to be freed */
dlm_put_lkb(lkb);
}
@@ -288,10 +240,13 @@ void dlm_callback_stop(struct dlm_ls *ls)
void dlm_callback_suspend(struct dlm_ls *ls)
{
- set_bit(LSFL_CB_DELAY, &ls->ls_flags);
+ if (ls->ls_callback_wq) {
+ spin_lock(&ls->ls_cb_lock);
+ set_bit(LSFL_CB_DELAY, &ls->ls_flags);
+ spin_unlock(&ls->ls_cb_lock);
- if (ls->ls_callback_wq)
flush_workqueue(ls->ls_callback_wq);
+ }
}
#define MAX_CB_QUEUE 25
@@ -302,13 +257,11 @@ void dlm_callback_resume(struct dlm_ls *ls)
int count = 0, sum = 0;
bool empty;
- clear_bit(LSFL_CB_DELAY, &ls->ls_flags);
-
if (!ls->ls_callback_wq)
return;
more:
- mutex_lock(&ls->ls_cb_mutex);
+ spin_lock(&ls->ls_cb_lock);
list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) {
list_del_init(&lkb->lkb_cb_list);
queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work);
@@ -317,7 +270,9 @@ more:
break;
}
empty = list_empty(&ls->ls_cb_delay);
- mutex_unlock(&ls->ls_cb_mutex);
+ if (empty)
+ clear_bit(LSFL_CB_DELAY, &ls->ls_flags);
+ spin_unlock(&ls->ls_cb_lock);
sum += count;
if (!empty) {
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index 181ad7d20c4d..880b11882495 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -11,14 +11,22 @@
#ifndef __ASTD_DOT_H__
#define __ASTD_DOT_H__
-void dlm_del_ast(struct dlm_lkb *lkb);
-int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
- int status, uint32_t sbflags, uint64_t seq);
-int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb,
- struct dlm_callback *cb, int *resid);
+#define DLM_ENQUEUE_CALLBACK_NEED_SCHED 1
+#define DLM_ENQUEUE_CALLBACK_SUCCESS 0
+#define DLM_ENQUEUE_CALLBACK_FAILURE -1
+int dlm_enqueue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
+ int status, uint32_t sbflags);
+#define DLM_DEQUEUE_CALLBACK_EMPTY 2
+#define DLM_DEQUEUE_CALLBACK_LAST 1
+#define DLM_DEQUEUE_CALLBACK_SUCCESS 0
+int dlm_dequeue_lkb_callback(struct dlm_lkb *lkb, struct dlm_callback **cb);
void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
uint32_t sbflags);
+void dlm_callback_set_last_ptr(struct dlm_callback **from,
+ struct dlm_callback *to);
+void dlm_release_callback(struct kref *ref);
+void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb);
void dlm_callback_work(struct work_struct *work);
int dlm_callback_start(struct dlm_ls *ls);
void dlm_callback_stop(struct dlm_ls *ls);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index ac8b62106ce0..20b60709eccf 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -183,7 +183,7 @@ static int dlm_check_protocol_and_dlm_running(unsigned int x)
return -EINVAL;
}
- if (dlm_allow_conn)
+ if (dlm_lowcomms_is_running())
return -EBUSY;
return 0;
@@ -194,7 +194,7 @@ static int dlm_check_zero_and_dlm_running(unsigned int x)
if (!x)
return -EINVAL;
- if (dlm_allow_conn)
+ if (dlm_lowcomms_is_running())
return -EBUSY;
return 0;
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 8fb04ebbafb5..8a0e1b1f74ad 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -246,7 +246,7 @@ static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
lkb->lkb_status,
lkb->lkb_grmode,
lkb->lkb_rqmode,
- lkb->lkb_last_bast.mode,
+ lkb->lkb_last_bast_mode,
rsb_lookup,
lkb->lkb_wait_type,
lkb->lkb_lvbseq,
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 8aca8085d24e..ab1a55337a6e 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -211,6 +211,7 @@ struct dlm_args {
#endif
#define DLM_IFL_DEADLOCK_CANCEL 0x01000000
#define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */
+#define DLM_IFL_CB_PENDING 0x04000000
/* least significant 2 bytes are message changed, they are full transmitted
* but at receive side only the 2 bytes LSB will be set.
*
@@ -222,18 +223,17 @@ struct dlm_args {
#define DLM_IFL_USER 0x00000001
#define DLM_IFL_ORPHAN 0x00000002
-#define DLM_CALLBACKS_SIZE 6
-
#define DLM_CB_CAST 0x00000001
#define DLM_CB_BAST 0x00000002
-#define DLM_CB_SKIP 0x00000004
struct dlm_callback {
- uint64_t seq;
uint32_t flags; /* DLM_CBF_ */
int sb_status; /* copy to lksb status */
uint8_t sb_flags; /* copy to lksb flags */
int8_t mode; /* rq mode of bast, gr mode of cast */
+
+ struct list_head list;
+ struct kref ref;
};
struct dlm_lkb {
@@ -268,12 +268,13 @@ struct dlm_lkb {
unsigned long lkb_timeout_cs;
#endif
- struct mutex lkb_cb_mutex;
+ spinlock_t lkb_cb_lock;
struct work_struct lkb_cb_work;
struct list_head lkb_cb_list; /* for ls_cb_delay or proc->asts */
- struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE];
- struct dlm_callback lkb_last_cast;
- struct dlm_callback lkb_last_bast;
+ struct list_head lkb_callbacks;
+ struct dlm_callback *lkb_last_cast;
+ struct dlm_callback *lkb_last_cb;
+ int lkb_last_bast_mode;
ktime_t lkb_last_cast_time; /* for debugging */
ktime_t lkb_last_bast_time; /* for debugging */
@@ -591,11 +592,7 @@ struct dlm_ls {
int ls_new_rsb_count;
struct list_head ls_new_rsb; /* new rsb structs */
- spinlock_t ls_remove_spin;
- wait_queue_head_t ls_remove_wait;
- char ls_remove_name[DLM_RESNAME_MAXLEN+1];
char *ls_remove_names[DLM_REMOVE_NAMES_MAX];
- int ls_remove_len;
int ls_remove_lens[DLM_REMOVE_NAMES_MAX];
struct list_head ls_nodes; /* current nodes in ls */
@@ -631,7 +628,7 @@ struct dlm_ls {
/* recovery related */
- struct mutex ls_cb_mutex;
+ spinlock_t ls_cb_lock;
struct list_head ls_cb_delay; /* save for queue_work later */
struct timer_list ls_timer;
struct task_struct *ls_recoverd_task;
@@ -661,7 +658,7 @@ struct dlm_ls {
spinlock_t ls_recover_idr_lock;
wait_queue_head_t ls_wait_general;
wait_queue_head_t ls_recover_lock_wait;
- struct mutex ls_clear_proc_locks;
+ spinlock_t ls_clear_proc_locks;
struct list_head ls_root_list; /* root resources */
struct rw_semaphore ls_root_sem; /* protect root_list */
@@ -670,7 +667,7 @@ struct dlm_ls {
void *ls_ops_arg;
int ls_namelen;
- char ls_name[1];
+ char ls_name[DLM_LOCKSPACE_LEN + 1];
};
/*
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index dac7eb75dba9..e1adfa5aed05 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -401,7 +401,7 @@ static int pre_rsb_struct(struct dlm_ls *ls)
unlock any spinlocks, go back and call pre_rsb_struct again.
Otherwise, take an rsb off the list and return it. */
-static int get_rsb_struct(struct dlm_ls *ls, char *name, int len,
+static int get_rsb_struct(struct dlm_ls *ls, const void *name, int len,
struct dlm_rsb **r_ret)
{
struct dlm_rsb *r;
@@ -412,7 +412,8 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len,
count = ls->ls_new_rsb_count;
spin_unlock(&ls->ls_new_rsb_spin);
log_debug(ls, "find_rsb retry %d %d %s",
- count, dlm_config.ci_new_rsb_count, name);
+ count, dlm_config.ci_new_rsb_count,
+ (const char *)name);
return -EAGAIN;
}
@@ -448,7 +449,7 @@ static int rsb_cmp(struct dlm_rsb *r, const char *name, int nlen)
return memcmp(r->res_name, maxname, DLM_RESNAME_MAXLEN);
}
-int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len,
+int dlm_search_rsb_tree(struct rb_root *tree, const void *name, int len,
struct dlm_rsb **r_ret)
{
struct rb_node *node = tree->rb_node;
@@ -546,7 +547,7 @@ static int rsb_insert(struct dlm_rsb *rsb, struct rb_root *tree)
* while that rsb has a potentially stale master.)
*/
-static int find_rsb_dir(struct dlm_ls *ls, char *name, int len,
+static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len,
uint32_t hash, uint32_t b,
int dir_nodeid, int from_nodeid,
unsigned int flags, struct dlm_rsb **r_ret)
@@ -724,7 +725,7 @@ static int find_rsb_dir(struct dlm_ls *ls, char *name, int len,
dlm_recover_locks) before we've made ourself master (in
dlm_recover_masters). */
-static int find_rsb_nodir(struct dlm_ls *ls, char *name, int len,
+static int find_rsb_nodir(struct dlm_ls *ls, const void *name, int len,
uint32_t hash, uint32_t b,
int dir_nodeid, int from_nodeid,
unsigned int flags, struct dlm_rsb **r_ret)
@@ -818,8 +819,9 @@ static int find_rsb_nodir(struct dlm_ls *ls, char *name, int len,
return error;
}
-static int find_rsb(struct dlm_ls *ls, char *name, int len, int from_nodeid,
- unsigned int flags, struct dlm_rsb **r_ret)
+static int find_rsb(struct dlm_ls *ls, const void *name, int len,
+ int from_nodeid, unsigned int flags,
+ struct dlm_rsb **r_ret)
{
uint32_t hash, b;
int dir_nodeid;
@@ -1207,6 +1209,7 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret,
if (!lkb)
return -ENOMEM;
+ lkb->lkb_last_bast_mode = -1;
lkb->lkb_nodeid = -1;
lkb->lkb_grmode = DLM_LOCK_IV;
kref_init(&lkb->lkb_ref);
@@ -1216,7 +1219,8 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret,
INIT_LIST_HEAD(&lkb->lkb_time_list);
#endif
INIT_LIST_HEAD(&lkb->lkb_cb_list);
- mutex_init(&lkb->lkb_cb_mutex);
+ INIT_LIST_HEAD(&lkb->lkb_callbacks);
+ spin_lock_init(&lkb->lkb_cb_lock);
INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work);
idr_preload(GFP_NOFS);
@@ -1585,37 +1589,6 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
return error;
}
-/* If there's an rsb for the same resource being removed, ensure
- * that the remove message is sent before the new lookup message.
- */
-
-#define DLM_WAIT_PENDING_COND(ls, r) \
- (ls->ls_remove_len && \
- !rsb_cmp(r, ls->ls_remove_name, \
- ls->ls_remove_len))
-
-static void wait_pending_remove(struct dlm_rsb *r)
-{
- struct dlm_ls *ls = r->res_ls;
- restart:
- spin_lock(&ls->ls_remove_spin);
- if (DLM_WAIT_PENDING_COND(ls, r)) {
- log_debug(ls, "delay lookup for remove dir %d %s",
- r->res_dir_nodeid, r->res_name);
- spin_unlock(&ls->ls_remove_spin);
- wait_event(ls->ls_remove_wait, !DLM_WAIT_PENDING_COND(ls, r));
- goto restart;
- }
- spin_unlock(&ls->ls_remove_spin);
-}
-
-/*
- * ls_remove_spin protects ls_remove_name and ls_remove_len which are
- * read by other threads in wait_pending_remove. ls_remove_names
- * and ls_remove_lens are only used by the scan thread, so they do
- * not need protection.
- */
-
static void shrink_bucket(struct dlm_ls *ls, int b)
{
struct rb_node *n, *next;
@@ -1697,11 +1670,6 @@ static void shrink_bucket(struct dlm_ls *ls, int b)
* list and sending the removal. Keeping this gap small is
* important to keep us (the master node) from being out of sync
* with the remote dir node for very long.
- *
- * From the time the rsb is removed from toss until just after
- * send_remove, the rsb name is saved in ls_remove_name. A new
- * lookup checks this to ensure that a new lookup message for the
- * same resource name is not sent just before the remove message.
*/
for (i = 0; i < remote_count; i++) {
@@ -1748,22 +1716,8 @@ static void shrink_bucket(struct dlm_ls *ls, int b)
}
rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
-
- /* block lookup of same name until we've sent remove */
- spin_lock(&ls->ls_remove_spin);
- ls->ls_remove_len = len;
- memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
- spin_unlock(&ls->ls_remove_spin);
- spin_unlock(&ls->ls_rsbtbl[b].lock);
-
send_remove(r);
-
- /* allow lookup of name again */
- spin_lock(&ls->ls_remove_spin);
- ls->ls_remove_len = 0;
- memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN);
- spin_unlock(&ls->ls_remove_spin);
- wake_up(&ls->ls_remove_wait);
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
dlm_free_rsb(r);
}
@@ -2714,8 +2668,6 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
return 0;
}
- wait_pending_remove(r);
-
r->res_first_lkid = lkb->lkb_id;
send_lookup(r, lkb);
return 1;
@@ -2864,17 +2816,9 @@ static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
struct dlm_args *args)
{
- int rv = -EINVAL;
+ int rv = -EBUSY;
if (args->flags & DLM_LKF_CONVERT) {
- if (lkb->lkb_flags & DLM_IFL_MSTCPY)
- goto out;
-
- if (args->flags & DLM_LKF_QUECVT &&
- !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
- goto out;
-
- rv = -EBUSY;
if (lkb->lkb_status != DLM_LKSTS_GRANTED)
goto out;
@@ -2884,6 +2828,14 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
if (is_overlap(lkb))
goto out;
+
+ rv = -EINVAL;
+ if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+ goto out;
+
+ if (args->flags & DLM_LKF_QUECVT &&
+ !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
+ goto out;
}
lkb->lkb_exflags = args->flags;
@@ -2900,11 +2852,25 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
#endif
rv = 0;
out:
- if (rv)
- log_debug(ls, "validate_lock_args %d %x %x %x %d %d %s",
+ switch (rv) {
+ case 0:
+ break;
+ case -EINVAL:
+ /* annoy the user because dlm usage is wrong */
+ WARN_ON(1);
+ log_error(ls, "%s %d %x %x %x %d %d %s", __func__,
+ rv, lkb->lkb_id, lkb->lkb_flags, args->flags,
+ lkb->lkb_status, lkb->lkb_wait_type,
+ lkb->lkb_resource->res_name);
+ break;
+ default:
+ log_debug(ls, "%s %d %x %x %x %d %d %s", __func__,
rv, lkb->lkb_id, lkb->lkb_flags, args->flags,
lkb->lkb_status, lkb->lkb_wait_type,
lkb->lkb_resource->res_name);
+ break;
+ }
+
return rv;
}
@@ -2918,23 +2884,12 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
{
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
- int rv = -EINVAL;
+ int rv = -EBUSY;
- if (lkb->lkb_flags & DLM_IFL_MSTCPY) {
- log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id);
- dlm_print_lkb(lkb);
+ /* normal unlock not allowed if there's any op in progress */
+ if (!(args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) &&
+ (lkb->lkb_wait_type || lkb->lkb_wait_count))
goto out;
- }
-
- /* an lkb may still exist even though the lock is EOL'ed due to a
- cancel, unlock or failed noqueue request; an app can't use these
- locks; return same error as if the lkid had not been found at all */
-
- if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) {
- log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id);
- rv = -ENOENT;
- goto out;
- }
/* an lkb may be waiting for an rsb lookup to complete where the
lookup was initiated by another lock */
@@ -2949,7 +2904,24 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
unhold_lkb(lkb); /* undoes create_lkb() */
}
/* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */
- rv = -EBUSY;
+ goto out;
+ }
+
+ rv = -EINVAL;
+ if (lkb->lkb_flags & DLM_IFL_MSTCPY) {
+ log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id);
+ dlm_print_lkb(lkb);
+ goto out;
+ }
+
+ /* an lkb may still exist even though the lock is EOL'ed due to a
+ * cancel, unlock or failed noqueue request; an app can't use these
+ * locks; return same error as if the lkid had not been found at all
+ */
+
+ if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) {
+ log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id);
+ rv = -ENOENT;
goto out;
}
@@ -3022,14 +2994,8 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
goto out;
}
/* add_to_waiters() will set OVERLAP_UNLOCK */
- goto out_ok;
}
- /* normal unlock not allowed if there's any op in progress */
- rv = -EBUSY;
- if (lkb->lkb_wait_type || lkb->lkb_wait_count)
- goto out;
-
out_ok:
/* an overlapping op shouldn't blow away exflags from other op */
lkb->lkb_exflags |= args->flags;
@@ -3037,11 +3003,25 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
lkb->lkb_astparam = args->astparam;
rv = 0;
out:
- if (rv)
- log_debug(ls, "validate_unlock_args %d %x %x %x %x %d %s", rv,
+ switch (rv) {
+ case 0:
+ break;
+ case -EINVAL:
+ /* annoy the user because dlm usage is wrong */
+ WARN_ON(1);
+ log_error(ls, "%s %d %x %x %x %x %d %s", __func__, rv,
+ lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags,
+ args->flags, lkb->lkb_wait_type,
+ lkb->lkb_resource->res_name);
+ break;
+ default:
+ log_debug(ls, "%s %d %x %x %x %x %d %s", __func__, rv,
lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags,
args->flags, lkb->lkb_wait_type,
lkb->lkb_resource->res_name);
+ break;
+ }
+
return rv;
}
@@ -3292,8 +3272,9 @@ static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
* request_lock(), convert_lock(), unlock_lock(), cancel_lock()
*/
-static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
- int len, struct dlm_args *args)
+static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ const void *name, int len,
+ struct dlm_args *args)
{
struct dlm_rsb *r;
int error;
@@ -3392,7 +3373,7 @@ int dlm_lock(dlm_lockspace_t *lockspace,
int mode,
struct dlm_lksb *lksb,
uint32_t flags,
- void *name,
+ const void *name,
unsigned int namelen,
uint32_t parent_lkid,
void (*ast) (void *astarg),
@@ -3438,7 +3419,7 @@ int dlm_lock(dlm_lockspace_t *lockspace,
if (error == -EINPROGRESS)
error = 0;
out_put:
- trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error);
+ trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error, true);
if (convert || error)
__put_lkb(ls, lkb);
@@ -3521,7 +3502,8 @@ int dlm_unlock(dlm_lockspace_t *lockspace,
static int _create_message(struct dlm_ls *ls, int mb_len,
int to_nodeid, int mstype,
struct dlm_message **ms_ret,
- struct dlm_mhandle **mh_ret)
+ struct dlm_mhandle **mh_ret,
+ gfp_t allocation)
{
struct dlm_message *ms;
struct dlm_mhandle *mh;
@@ -3531,7 +3513,7 @@ static int _create_message(struct dlm_ls *ls, int mb_len,
pass into midcomms_commit and a message buffer (mb) that we
write our data into */
- mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, GFP_NOFS, &mb);
+ mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, allocation, &mb);
if (!mh)
return -ENOBUFS;
@@ -3553,7 +3535,8 @@ static int _create_message(struct dlm_ls *ls, int mb_len,
static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
int to_nodeid, int mstype,
struct dlm_message **ms_ret,
- struct dlm_mhandle **mh_ret)
+ struct dlm_mhandle **mh_ret,
+ gfp_t allocation)
{
int mb_len = sizeof(struct dlm_message);
@@ -3574,15 +3557,16 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
}
return _create_message(r->res_ls, mb_len, to_nodeid, mstype,
- ms_ret, mh_ret);
+ ms_ret, mh_ret, allocation);
}
/* further lowcomms enhancements or alternate implementations may make
the return value from this function useful at some point */
-static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
+static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms,
+ const void *name, int namelen)
{
- dlm_midcomms_commit_mhandle(mh);
+ dlm_midcomms_commit_mhandle(mh, name, namelen);
return 0;
}
@@ -3623,7 +3607,7 @@ static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
case cpu_to_le32(DLM_MSG_REQUEST_REPLY):
case cpu_to_le32(DLM_MSG_CONVERT_REPLY):
case cpu_to_le32(DLM_MSG_GRANT):
- if (!lkb->lkb_lvbptr)
+ if (!lkb->lkb_lvbptr || !(lkb->lkb_exflags & DLM_LKF_VALBLK))
break;
memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
break;
@@ -3642,13 +3626,13 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
if (error)
return error;
- error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
+ error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh, GFP_NOFS);
if (error)
goto fail;
send_args(r, lkb, ms);
- error = send_message(mh, ms);
+ error = send_message(mh, ms, r->res_name, r->res_length);
if (error)
goto fail;
return 0;
@@ -3703,7 +3687,8 @@ static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
to_nodeid = lkb->lkb_nodeid;
- error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
+ error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh,
+ GFP_NOFS);
if (error)
goto out;
@@ -3711,7 +3696,7 @@ static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
ms->m_result = 0;
- error = send_message(mh, ms);
+ error = send_message(mh, ms, r->res_name, r->res_length);
out:
return error;
}
@@ -3724,7 +3709,8 @@ static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
to_nodeid = lkb->lkb_nodeid;
- error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh);
+ error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh,
+ GFP_NOFS);
if (error)
goto out;
@@ -3732,7 +3718,7 @@ static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
ms->m_bastmode = cpu_to_le32(mode);
- error = send_message(mh, ms);
+ error = send_message(mh, ms, r->res_name, r->res_length);
out:
return error;
}
@@ -3749,13 +3735,14 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
if (error)
return error;
- error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
+ error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh,
+ GFP_NOFS);
if (error)
goto fail;
send_args(r, lkb, ms);
- error = send_message(mh, ms);
+ error = send_message(mh, ms, r->res_name, r->res_length);
if (error)
goto fail;
return 0;
@@ -3773,14 +3760,15 @@ static int send_remove(struct dlm_rsb *r)
to_nodeid = dlm_dir_nodeid(r);
- error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
+ error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh,
+ GFP_ATOMIC);
if (error)
goto out;
memcpy(ms->m_extra, r->res_name, r->res_length);
ms->m_hash = cpu_to_le32(r->res_hash);
- error = send_message(mh, ms);
+ error = send_message(mh, ms, r->res_name, r->res_length);
out:
return error;
}
@@ -3794,7 +3782,7 @@ static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
to_nodeid = lkb->lkb_nodeid;
- error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
+ error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh, GFP_NOFS);
if (error)
goto out;
@@ -3802,7 +3790,7 @@ static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
ms->m_result = cpu_to_le32(to_dlm_errno(rv));
- error = send_message(mh, ms);
+ error = send_message(mh, ms, r->res_name, r->res_length);
out:
return error;
}
@@ -3835,7 +3823,8 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
struct dlm_mhandle *mh;
int error, nodeid = le32_to_cpu(ms_in->m_header.h_nodeid);
- error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
+ error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh,
+ GFP_NOFS);
if (error)
goto out;
@@ -3843,7 +3832,7 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
ms->m_result = cpu_to_le32(to_dlm_errno(rv));
ms->m_nodeid = cpu_to_le32(ret_nodeid);
- error = send_message(mh, ms);
+ error = send_message(mh, ms, ms_in->m_extra, receive_extralen(ms_in));
out:
return error;
}
@@ -4013,66 +4002,6 @@ out:
return error;
}
-static void send_repeat_remove(struct dlm_ls *ls, char *ms_name, int len)
-{
- char name[DLM_RESNAME_MAXLEN + 1];
- struct dlm_message *ms;
- struct dlm_mhandle *mh;
- struct dlm_rsb *r;
- uint32_t hash, b;
- int rv, dir_nodeid;
-
- memset(name, 0, sizeof(name));
- memcpy(name, ms_name, len);
-
- hash = jhash(name, len, 0);
- b = hash & (ls->ls_rsbtbl_size - 1);
-
- dir_nodeid = dlm_hash2nodeid(ls, hash);
-
- log_error(ls, "send_repeat_remove dir %d %s", dir_nodeid, name);
-
- spin_lock(&ls->ls_rsbtbl[b].lock);
- rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
- if (!rv) {
- spin_unlock(&ls->ls_rsbtbl[b].lock);
- log_error(ls, "repeat_remove on keep %s", name);
- return;
- }
-
- rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
- if (!rv) {
- spin_unlock(&ls->ls_rsbtbl[b].lock);
- log_error(ls, "repeat_remove on toss %s", name);
- return;
- }
-
- /* use ls->remove_name2 to avoid conflict with shrink? */
-
- spin_lock(&ls->ls_remove_spin);
- ls->ls_remove_len = len;
- memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
- spin_unlock(&ls->ls_remove_spin);
- spin_unlock(&ls->ls_rsbtbl[b].lock);
-
- rv = _create_message(ls, sizeof(struct dlm_message) + len,
- dir_nodeid, DLM_MSG_REMOVE, &ms, &mh);
- if (rv)
- goto out;
-
- memcpy(ms->m_extra, name, len);
- ms->m_hash = cpu_to_le32(hash);
-
- send_message(mh, ms);
-
-out:
- spin_lock(&ls->ls_remove_spin);
- ls->ls_remove_len = 0;
- memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN);
- spin_unlock(&ls->ls_remove_spin);
- wake_up(&ls->ls_remove_wait);
-}
-
static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
@@ -4142,25 +4071,11 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
ENOTBLK request failures when the lookup reply designating us
as master is delayed. */
- /* We could repeatedly return -EBADR here if our send_remove() is
- delayed in being sent/arriving/being processed on the dir node.
- Another node would repeatedly lookup up the master, and the dir
- node would continue returning our nodeid until our send_remove
- took effect.
-
- We send another remove message in case our previous send_remove
- was lost/ignored/missed somehow. */
-
if (error != -ENOTBLK) {
log_limit(ls, "receive_request %x from %d %d",
le32_to_cpu(ms->m_lkid), from_nodeid, error);
}
- if (namelen && error == -EBADR) {
- send_repeat_remove(ls, ms->m_extra, namelen);
- msleep(1000);
- }
-
setup_stub_lkb(ls, ms);
send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
return error;
@@ -5080,8 +4995,11 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid)
down_read(&ls->ls_recv_active);
if (hd->h_cmd == DLM_MSG)
dlm_receive_message(ls, &p->message, nodeid);
- else
+ else if (hd->h_cmd == DLM_RCOM)
dlm_receive_rcom(ls, &p->rcom, nodeid);
+ else
+ log_error(ls, "invalid h_cmd %d from %d lockspace %x",
+ hd->h_cmd, nodeid, le32_to_cpu(hd->u.h_lockspace));
up_read(&ls->ls_recv_active);
dlm_put_lockspace(ls);
@@ -5801,6 +5719,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
{
struct dlm_lkb *lkb;
struct dlm_args args;
+ bool do_put = true;
int error;
dlm_lock_recovery(ls);
@@ -5811,13 +5730,14 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
goto out;
}
+ trace_dlm_lock_start(ls, lkb, name, namelen, mode, flags);
+
if (flags & DLM_LKF_VALBLK) {
ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
if (!ua->lksb.sb_lvbptr) {
kfree(ua);
- __put_lkb(ls, lkb);
error = -ENOMEM;
- goto out;
+ goto out_put;
}
}
#ifdef CONFIG_DLM_DEPRECATED_API
@@ -5831,8 +5751,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
kfree(ua->lksb.sb_lvbptr);
ua->lksb.sb_lvbptr = NULL;
kfree(ua);
- __put_lkb(ls, lkb);
- goto out;
+ goto out_put;
}
/* After ua is attached to lkb it will be freed by dlm_free_lkb().
@@ -5851,8 +5770,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
error = 0;
fallthrough;
default:
- __put_lkb(ls, lkb);
- goto out;
+ goto out_put;
}
/* add this new lkb to the per-process list of locks */
@@ -5860,6 +5778,11 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
hold_lkb(lkb);
list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
spin_unlock(&ua->proc->locks_spin);
+ do_put = false;
+ out_put:
+ trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error, false);
+ if (do_put)
+ __put_lkb(ls, lkb);
out:
dlm_unlock_recovery(ls);
return error;
@@ -5885,6 +5808,8 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
if (error)
goto out;
+ trace_dlm_lock_start(ls, lkb, NULL, 0, mode, flags);
+
/* user can change the params on its lock when it converts it, or
add an lvb that didn't exist before */
@@ -5922,6 +5847,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
if (error == -EINPROGRESS || error == -EAGAIN || error == -EDEADLK)
error = 0;
out_put:
+ trace_dlm_lock_end(ls, lkb, NULL, 0, mode, flags, error, false);
dlm_put_lkb(lkb);
out:
dlm_unlock_recovery(ls);
@@ -6014,6 +5940,8 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
if (error)
goto out;
+ trace_dlm_unlock_start(ls, lkb, flags);
+
ua = lkb->lkb_ua;
if (lvb_in && ua->lksb.sb_lvbptr)
@@ -6042,6 +5970,7 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking);
spin_unlock(&ua->proc->locks_spin);
out_put:
+ trace_dlm_unlock_end(ls, lkb, flags, error);
dlm_put_lkb(lkb);
out:
dlm_unlock_recovery(ls);
@@ -6063,6 +5992,8 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
if (error)
goto out;
+ trace_dlm_unlock_start(ls, lkb, flags);
+
ua = lkb->lkb_ua;
if (ua_tmp->castparam)
ua->castparam = ua_tmp->castparam;
@@ -6080,6 +6011,7 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
if (error == -EBUSY)
error = 0;
out_put:
+ trace_dlm_unlock_end(ls, lkb, flags, error);
dlm_put_lkb(lkb);
out:
dlm_unlock_recovery(ls);
@@ -6101,6 +6033,8 @@ int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid)
if (error)
goto out;
+ trace_dlm_unlock_start(ls, lkb, flags);
+
ua = lkb->lkb_ua;
error = set_unlock_args(flags, ua, &args);
@@ -6129,6 +6063,7 @@ int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid)
if (error == -EBUSY)
error = 0;
out_put:
+ trace_dlm_unlock_end(ls, lkb, flags, error);
dlm_put_lkb(lkb);
out:
dlm_unlock_recovery(ls);
@@ -6184,7 +6119,7 @@ static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls,
{
struct dlm_lkb *lkb = NULL;
- mutex_lock(&ls->ls_clear_proc_locks);
+ spin_lock(&ls->ls_clear_proc_locks);
if (list_empty(&proc->locks))
goto out;
@@ -6196,7 +6131,7 @@ static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls,
else
lkb->lkb_flags |= DLM_IFL_DEAD;
out:
- mutex_unlock(&ls->ls_clear_proc_locks);
+ spin_unlock(&ls->ls_clear_proc_locks);
return lkb;
}
@@ -6233,7 +6168,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
dlm_put_lkb(lkb);
}
- mutex_lock(&ls->ls_clear_proc_locks);
+ spin_lock(&ls->ls_clear_proc_locks);
/* in-progress unlocks */
list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) {
@@ -6243,13 +6178,12 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
}
list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) {
- memset(&lkb->lkb_callbacks, 0,
- sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE);
+ dlm_purge_lkb_callbacks(lkb);
list_del_init(&lkb->lkb_cb_list);
dlm_put_lkb(lkb);
}
- mutex_unlock(&ls->ls_clear_proc_locks);
+ spin_unlock(&ls->ls_clear_proc_locks);
dlm_unlock_recovery(ls);
}
@@ -6285,8 +6219,7 @@ static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
spin_lock(&proc->asts_spin);
list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) {
- memset(&lkb->lkb_callbacks, 0,
- sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE);
+ dlm_purge_lkb_callbacks(lkb);
list_del_init(&lkb->lkb_cb_list);
dlm_put_lkb(lkb);
}
@@ -6317,13 +6250,13 @@ static int send_purge(struct dlm_ls *ls, int nodeid, int pid)
int error;
error = _create_message(ls, sizeof(struct dlm_message), nodeid,
- DLM_MSG_PURGE, &ms, &mh);
+ DLM_MSG_PURGE, &ms, &mh, GFP_NOFS);
if (error)
return error;
ms->m_nodeid = cpu_to_le32(nodeid);
ms->m_pid = cpu_to_le32(pid);
- return send_message(mh, ms);
+ return send_message(mh, ms, NULL, 0);
}
int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index a7b6474f009d..40c76b5544da 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -36,7 +36,7 @@ static inline void dlm_adjust_timeouts(struct dlm_ls *ls) { }
int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len,
unsigned int flags, int *r_nodeid, int *result);
-int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len,
+int dlm_search_rsb_tree(struct rb_root *tree, const void *name, int len,
struct dlm_rsb **r_ret);
void dlm_recover_purge(struct dlm_ls *ls);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 3972f4d86c75..d0b4e2181a5f 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -17,7 +17,6 @@
#include "recoverd.h"
#include "dir.h"
#include "midcomms.h"
-#include "lowcomms.h"
#include "config.h"
#include "memory.h"
#include "lock.h"
@@ -391,7 +390,7 @@ static int threads_start(void)
/* Thread for sending/receiving messages for all lockspace's */
error = dlm_midcomms_start();
if (error) {
- log_print("cannot start dlm lowcomms %d", error);
+ log_print("cannot start dlm midcomms %d", error);
goto scand_fail;
}
@@ -416,7 +415,7 @@ static int new_lockspace(const char *name, const char *cluster,
if (namelen > DLM_LOCKSPACE_LEN || namelen == 0)
return -EINVAL;
- if (!lvblen || (lvblen % 8))
+ if (lvblen % 8)
return -EINVAL;
if (!try_module_get(THIS_MODULE))
@@ -473,7 +472,7 @@ static int new_lockspace(const char *name, const char *cluster,
error = -ENOMEM;
- ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_NOFS);
+ ls = kzalloc(sizeof(*ls), GFP_NOFS);
if (!ls)
goto out;
memcpy(ls->ls_name, name, namelen);
@@ -524,9 +523,6 @@ static int new_lockspace(const char *name, const char *cluster,
spin_lock_init(&ls->ls_rsbtbl[i].lock);
}
- spin_lock_init(&ls->ls_remove_spin);
- init_waitqueue_head(&ls->ls_remove_wait);
-
for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) {
ls->ls_remove_names[i] = kzalloc(DLM_RESNAME_MAXLEN+1,
GFP_KERNEL);
@@ -567,7 +563,7 @@ static int new_lockspace(const char *name, const char *cluster,
init_completion(&ls->ls_recovery_done);
ls->ls_recovery_result = -1;
- mutex_init(&ls->ls_cb_mutex);
+ spin_lock_init(&ls->ls_cb_lock);
INIT_LIST_HEAD(&ls->ls_cb_delay);
ls->ls_recoverd_task = NULL;
@@ -584,7 +580,7 @@ static int new_lockspace(const char *name, const char *cluster,
atomic_set(&ls->ls_requestqueue_cnt, 0);
init_waitqueue_head(&ls->ls_requestqueue_wait);
mutex_init(&ls->ls_requestqueue_mutex);
- mutex_init(&ls->ls_clear_proc_locks);
+ spin_lock_init(&ls->ls_clear_proc_locks);
/* Due backwards compatibility with 3.1 we need to use maximum
* possible dlm message size to be sure the message will fit and
@@ -703,10 +699,11 @@ static int new_lockspace(const char *name, const char *cluster,
return error;
}
-int dlm_new_lockspace(const char *name, const char *cluster,
- uint32_t flags, int lvblen,
- const struct dlm_lockspace_ops *ops, void *ops_arg,
- int *ops_result, dlm_lockspace_t **lockspace)
+static int __dlm_new_lockspace(const char *name, const char *cluster,
+ uint32_t flags, int lvblen,
+ const struct dlm_lockspace_ops *ops,
+ void *ops_arg, int *ops_result,
+ dlm_lockspace_t **lockspace)
{
int error = 0;
@@ -725,13 +722,32 @@ int dlm_new_lockspace(const char *name, const char *cluster,
if (!ls_count) {
dlm_scand_stop();
dlm_midcomms_shutdown();
- dlm_lowcomms_stop();
+ dlm_midcomms_stop();
}
out:
mutex_unlock(&ls_lock);
return error;
}
+int dlm_new_lockspace(const char *name, const char *cluster, uint32_t flags,
+ int lvblen, const struct dlm_lockspace_ops *ops,
+ void *ops_arg, int *ops_result,
+ dlm_lockspace_t **lockspace)
+{
+ return __dlm_new_lockspace(name, cluster, flags | DLM_LSFL_FS, lvblen,
+ ops, ops_arg, ops_result, lockspace);
+}
+
+int dlm_new_user_lockspace(const char *name, const char *cluster,
+ uint32_t flags, int lvblen,
+ const struct dlm_lockspace_ops *ops,
+ void *ops_arg, int *ops_result,
+ dlm_lockspace_t **lockspace)
+{
+ return __dlm_new_lockspace(name, cluster, flags, lvblen, ops,
+ ops_arg, ops_result, lockspace);
+}
+
static int lkb_idr_is_local(int id, void *p, void *data)
{
struct dlm_lkb *lkb = p;
@@ -909,7 +925,7 @@ int dlm_release_lockspace(void *lockspace, int force)
if (!error)
ls_count--;
if (!ls_count)
- dlm_lowcomms_stop();
+ dlm_midcomms_stop();
mutex_unlock(&ls_lock);
return error;
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
index 306fc4f4ea15..03f4a4a3a871 100644
--- a/fs/dlm/lockspace.h
+++ b/fs/dlm/lockspace.h
@@ -12,6 +12,14 @@
#ifndef __LOCKSPACE_DOT_H__
#define __LOCKSPACE_DOT_H__
+/* DLM_LSFL_FS
+ * The lockspace user is in the kernel (i.e. filesystem). Enables
+ * direct bast/cast callbacks.
+ *
+ * internal lockspace flag - will be removed in future
+ */
+#define DLM_LSFL_FS 0x00000004
+
int dlm_lockspace_init(void);
void dlm_lockspace_exit(void);
struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
@@ -20,6 +28,11 @@ struct dlm_ls *dlm_find_lockspace_device(int minor);
void dlm_put_lockspace(struct dlm_ls *ls);
void dlm_stop_lockspaces(void);
void dlm_stop_lockspaces_check(void);
+int dlm_new_user_lockspace(const char *name, const char *cluster,
+ uint32_t flags, int lvblen,
+ const struct dlm_lockspace_ops *ops,
+ void *ops_arg, int *ops_result,
+ dlm_lockspace_t **lockspace);
#endif /* __LOCKSPACE_DOT_H__ */
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index a4e84e8d94c8..4450721ec83c 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -63,41 +63,49 @@
#define NEEDED_RMEM (4*1024*1024)
-/* Number of messages to send before rescheduling */
-#define MAX_SEND_MSG_COUNT 25
-#define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(10000)
-
struct connection {
struct socket *sock; /* NULL if not connected */
uint32_t nodeid; /* So we know who we are in the list */
- struct mutex sock_mutex;
+ /* this semaphore is used to allow parallel recv/send in read
+ * lock mode. When we release a sock we need to held the write lock.
+ *
+ * However this is locking code and not nice. When we remove the
+ * othercon handling we can look into other mechanism to synchronize
+ * io handling to call sock_release() at the right time.
+ */
+ struct rw_semaphore sock_lock;
unsigned long flags;
-#define CF_READ_PENDING 1
-#define CF_WRITE_PENDING 2
-#define CF_INIT_PENDING 4
+#define CF_APP_LIMITED 0
+#define CF_RECV_PENDING 1
+#define CF_SEND_PENDING 2
+#define CF_RECV_INTR 3
+#define CF_IO_STOP 4
#define CF_IS_OTHERCON 5
-#define CF_CLOSE 6
-#define CF_APP_LIMITED 7
-#define CF_CLOSING 8
-#define CF_SHUTDOWN 9
-#define CF_CONNECTED 10
-#define CF_RECONNECT 11
-#define CF_DELAY_CONNECT 12
-#define CF_EOF 13
struct list_head writequeue; /* List of outgoing writequeue_entries */
spinlock_t writequeue_lock;
- atomic_t writequeue_cnt;
int retries;
-#define MAX_CONNECT_RETRIES 3
struct hlist_node list;
+ /* due some connect()/accept() races we currently have this cross over
+ * connection attempt second connection for one node.
+ *
+ * There is a solution to avoid the race by introducing a connect
+ * rule as e.g. our_nodeid > nodeid_to_connect who is allowed to
+ * connect. Otherside can connect but will only be considered that
+ * the other side wants to have a reconnect.
+ *
+ * However changing to this behaviour will break backwards compatible.
+ * In a DLM protocol major version upgrade we should remove this!
+ */
struct connection *othercon;
- struct connection *sendcon;
- struct work_struct rwork; /* Receive workqueue */
- struct work_struct swork; /* Send workqueue */
- wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */
- unsigned char *rx_buf;
- int rx_buflen;
+ struct work_struct rwork; /* receive worker */
+ struct work_struct swork; /* send worker */
+ unsigned char rx_leftover_buf[DLM_MAX_SOCKET_BUFSIZE];
int rx_leftover;
+ int mark;
+ int addr_count;
+ int curr_addr_index;
+ struct sockaddr_storage addr[DLM_MAX_ADDR_COUNT];
+ spinlock_t addrs_lock;
struct rcu_head rcu;
};
#define sock2con(x) ((struct connection *)(x)->sk_user_data)
@@ -136,13 +144,12 @@ struct dlm_msg {
struct kref ref;
};
-struct dlm_node_addr {
- struct list_head list;
+struct processqueue_entry {
+ unsigned char *buf;
int nodeid;
- int mark;
- int addr_count;
- int curr_addr_index;
- struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
+ int buflen;
+
+ struct list_head list;
};
struct dlm_proto_ops {
@@ -157,10 +164,6 @@ struct dlm_proto_ops {
int (*listen_validate)(void);
void (*listen_sockopts)(struct socket *sock);
int (*listen_bind)(struct socket *sock);
- /* What to do to shutdown */
- void (*shutdown_action)(struct connection *con);
- /* What to do to eof check */
- bool (*eof_condition)(struct connection *con);
};
static struct listen_sock_callbacks {
@@ -170,17 +173,13 @@ static struct listen_sock_callbacks {
void (*sk_write_space)(struct sock *);
} listen_sock;
-static LIST_HEAD(dlm_node_addrs);
-static DEFINE_SPINLOCK(dlm_node_addrs_spin);
-
static struct listen_connection listen_con;
-static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
+static struct sockaddr_storage dlm_local_addr[DLM_MAX_ADDR_COUNT];
static int dlm_local_count;
-int dlm_allow_conn;
/* Work queues */
-static struct workqueue_struct *recv_workqueue;
-static struct workqueue_struct *send_workqueue;
+static struct workqueue_struct *io_workqueue;
+static struct workqueue_struct *process_workqueue;
static struct hlist_head connection_hash[CONN_HASH_SIZE];
static DEFINE_SPINLOCK(connections_lock);
@@ -188,8 +187,45 @@ DEFINE_STATIC_SRCU(connections_srcu);
static const struct dlm_proto_ops *dlm_proto_ops;
+#define DLM_IO_SUCCESS 0
+#define DLM_IO_END 1
+#define DLM_IO_EOF 2
+#define DLM_IO_RESCHED 3
+
static void process_recv_sockets(struct work_struct *work);
static void process_send_sockets(struct work_struct *work);
+static void process_dlm_messages(struct work_struct *work);
+
+static DECLARE_WORK(process_work, process_dlm_messages);
+static DEFINE_SPINLOCK(processqueue_lock);
+static bool process_dlm_messages_pending;
+static LIST_HEAD(processqueue);
+
+bool dlm_lowcomms_is_running(void)
+{
+ return !!listen_con.sock;
+}
+
+static void lowcomms_queue_swork(struct connection *con)
+{
+ assert_spin_locked(&con->writequeue_lock);
+
+ if (!test_bit(CF_IO_STOP, &con->flags) &&
+ !test_bit(CF_APP_LIMITED, &con->flags) &&
+ !test_and_set_bit(CF_SEND_PENDING, &con->flags))
+ queue_work(io_workqueue, &con->swork);
+}
+
+static void lowcomms_queue_rwork(struct connection *con)
+{
+#ifdef CONFIG_LOCKDEP
+ WARN_ON_ONCE(!lockdep_sock_is_held(con->sock->sk));
+#endif
+
+ if (!test_bit(CF_IO_STOP, &con->flags) &&
+ !test_and_set_bit(CF_RECV_PENDING, &con->flags))
+ queue_work(io_workqueue, &con->rwork);
+}
static void writequeue_entry_ctor(void *data)
{
@@ -214,15 +250,12 @@ static struct writequeue_entry *con_next_wq(struct connection *con)
{
struct writequeue_entry *e;
- if (list_empty(&con->writequeue))
- return NULL;
-
- e = list_first_entry(&con->writequeue, struct writequeue_entry,
- list);
+ e = list_first_entry_or_null(&con->writequeue, struct writequeue_entry,
+ list);
/* if len is zero nothing is to send, if there are users filling
* buffers we wait until the users are done so we can send more.
*/
- if (e->users || e->len == 0)
+ if (!e || e->users || e->len == 0)
return NULL;
return e;
@@ -240,28 +273,15 @@ static struct connection *__find_con(int nodeid, int r)
return NULL;
}
-static bool tcp_eof_condition(struct connection *con)
-{
- return atomic_read(&con->writequeue_cnt);
-}
-
-static int dlm_con_init(struct connection *con, int nodeid)
+static void dlm_con_init(struct connection *con, int nodeid)
{
- con->rx_buflen = dlm_config.ci_buffer_size;
- con->rx_buf = kmalloc(con->rx_buflen, GFP_NOFS);
- if (!con->rx_buf)
- return -ENOMEM;
-
con->nodeid = nodeid;
- mutex_init(&con->sock_mutex);
+ init_rwsem(&con->sock_lock);
INIT_LIST_HEAD(&con->writequeue);
spin_lock_init(&con->writequeue_lock);
- atomic_set(&con->writequeue_cnt, 0);
INIT_WORK(&con->swork, process_send_sockets);
INIT_WORK(&con->rwork, process_recv_sockets);
- init_waitqueue_head(&con->shutdown_wait);
-
- return 0;
+ spin_lock_init(&con->addrs_lock);
}
/*
@@ -271,7 +291,7 @@ static int dlm_con_init(struct connection *con, int nodeid)
static struct connection *nodeid2con(int nodeid, gfp_t alloc)
{
struct connection *con, *tmp;
- int r, ret;
+ int r;
r = nodeid_hash(nodeid);
con = __find_con(nodeid, r);
@@ -282,11 +302,7 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc)
if (!con)
return NULL;
- ret = dlm_con_init(con, nodeid);
- if (ret) {
- kfree(con);
- return NULL;
- }
+ dlm_con_init(con, nodeid);
spin_lock(&connections_lock);
/* Because multiple workqueues/threads calls this function it can
@@ -298,7 +314,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc)
tmp = __find_con(nodeid, r);
if (tmp) {
spin_unlock(&connections_lock);
- kfree(con->rx_buf);
kfree(con);
return tmp;
}
@@ -309,29 +324,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc)
return con;
}
-/* Loop round all connections */
-static void foreach_conn(void (*conn_func)(struct connection *c))
-{
- int i;
- struct connection *con;
-
- for (i = 0; i < CONN_HASH_SIZE; i++) {
- hlist_for_each_entry_rcu(con, &connection_hash[i], list)
- conn_func(con);
- }
-}
-
-static struct dlm_node_addr *find_node_addr(int nodeid)
-{
- struct dlm_node_addr *na;
-
- list_for_each_entry(na, &dlm_node_addrs, list) {
- if (na->nodeid == nodeid)
- return na;
- }
- return NULL;
-}
-
static int addr_compare(const struct sockaddr_storage *x,
const struct sockaddr_storage *y)
{
@@ -365,40 +357,47 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
unsigned int *mark)
{
struct sockaddr_storage sas;
- struct dlm_node_addr *na;
+ struct connection *con;
+ int idx;
if (!dlm_local_count)
return -1;
- spin_lock(&dlm_node_addrs_spin);
- na = find_node_addr(nodeid);
- if (na && na->addr_count) {
- memcpy(&sas, na->addr[na->curr_addr_index],
- sizeof(struct sockaddr_storage));
+ idx = srcu_read_lock(&connections_srcu);
+ con = nodeid2con(nodeid, 0);
+ if (!con) {
+ srcu_read_unlock(&connections_srcu, idx);
+ return -ENOENT;
+ }
- if (try_new_addr) {
- na->curr_addr_index++;
- if (na->curr_addr_index == na->addr_count)
- na->curr_addr_index = 0;
- }
+ spin_lock(&con->addrs_lock);
+ if (!con->addr_count) {
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
+ return -ENOENT;
}
- spin_unlock(&dlm_node_addrs_spin);
- if (!na)
- return -EEXIST;
+ memcpy(&sas, &con->addr[con->curr_addr_index],
+ sizeof(struct sockaddr_storage));
- if (!na->addr_count)
- return -ENOENT;
+ if (try_new_addr) {
+ con->curr_addr_index++;
+ if (con->curr_addr_index == con->addr_count)
+ con->curr_addr_index = 0;
+ }
- *mark = na->mark;
+ *mark = con->mark;
+ spin_unlock(&con->addrs_lock);
if (sas_out)
memcpy(sas_out, &sas, sizeof(struct sockaddr_storage));
- if (!sa_out)
+ if (!sa_out) {
+ srcu_read_unlock(&connections_srcu, idx);
return 0;
+ }
- if (dlm_local_addr[0]->ss_family == AF_INET) {
+ if (dlm_local_addr[0].ss_family == AF_INET) {
struct sockaddr_in *in4 = (struct sockaddr_in *) &sas;
struct sockaddr_in *ret4 = (struct sockaddr_in *) sa_out;
ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
@@ -408,43 +407,46 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
ret6->sin6_addr = in6->sin6_addr;
}
+ srcu_read_unlock(&connections_srcu, idx);
return 0;
}
static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid,
unsigned int *mark)
{
- struct dlm_node_addr *na;
- int rv = -EEXIST;
- int addr_i;
-
- spin_lock(&dlm_node_addrs_spin);
- list_for_each_entry(na, &dlm_node_addrs, list) {
- if (!na->addr_count)
- continue;
-
- for (addr_i = 0; addr_i < na->addr_count; addr_i++) {
- if (addr_compare(na->addr[addr_i], addr)) {
- *nodeid = na->nodeid;
- *mark = na->mark;
- rv = 0;
- goto unlock;
+ struct connection *con;
+ int i, idx, addr_i;
+
+ idx = srcu_read_lock(&connections_srcu);
+ for (i = 0; i < CONN_HASH_SIZE; i++) {
+ hlist_for_each_entry_rcu(con, &connection_hash[i], list) {
+ WARN_ON_ONCE(!con->addr_count);
+
+ spin_lock(&con->addrs_lock);
+ for (addr_i = 0; addr_i < con->addr_count; addr_i++) {
+ if (addr_compare(&con->addr[addr_i], addr)) {
+ *nodeid = con->nodeid;
+ *mark = con->mark;
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
+ return 0;
+ }
}
+ spin_unlock(&con->addrs_lock);
}
}
-unlock:
- spin_unlock(&dlm_node_addrs_spin);
- return rv;
+ srcu_read_unlock(&connections_srcu, idx);
+
+ return -ENOENT;
}
-/* caller need to held dlm_node_addrs_spin lock */
-static bool dlm_lowcomms_na_has_addr(const struct dlm_node_addr *na,
- const struct sockaddr_storage *addr)
+static bool dlm_lowcomms_con_has_addr(const struct connection *con,
+ const struct sockaddr_storage *addr)
{
int i;
- for (i = 0; i < na->addr_count; i++) {
- if (addr_compare(na->addr[i], addr))
+ for (i = 0; i < con->addr_count; i++) {
+ if (addr_compare(&con->addr[i], addr))
return true;
}
@@ -453,118 +455,82 @@ static bool dlm_lowcomms_na_has_addr(const struct dlm_node_addr *na,
int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len)
{
- struct sockaddr_storage *new_addr;
- struct dlm_node_addr *new_node, *na;
- bool ret;
-
- new_node = kzalloc(sizeof(struct dlm_node_addr), GFP_NOFS);
- if (!new_node)
- return -ENOMEM;
+ struct connection *con;
+ bool ret, idx;
- new_addr = kzalloc(sizeof(struct sockaddr_storage), GFP_NOFS);
- if (!new_addr) {
- kfree(new_node);
+ idx = srcu_read_lock(&connections_srcu);
+ con = nodeid2con(nodeid, GFP_NOFS);
+ if (!con) {
+ srcu_read_unlock(&connections_srcu, idx);
return -ENOMEM;
}
- memcpy(new_addr, addr, len);
-
- spin_lock(&dlm_node_addrs_spin);
- na = find_node_addr(nodeid);
- if (!na) {
- new_node->nodeid = nodeid;
- new_node->addr[0] = new_addr;
- new_node->addr_count = 1;
- new_node->mark = dlm_config.ci_mark;
- list_add(&new_node->list, &dlm_node_addrs);
- spin_unlock(&dlm_node_addrs_spin);
+ spin_lock(&con->addrs_lock);
+ if (!con->addr_count) {
+ memcpy(&con->addr[0], addr, sizeof(*addr));
+ con->addr_count = 1;
+ con->mark = dlm_config.ci_mark;
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
return 0;
}
- ret = dlm_lowcomms_na_has_addr(na, addr);
+ ret = dlm_lowcomms_con_has_addr(con, addr);
if (ret) {
- spin_unlock(&dlm_node_addrs_spin);
- kfree(new_addr);
- kfree(new_node);
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
return -EEXIST;
}
- if (na->addr_count >= DLM_MAX_ADDR_COUNT) {
- spin_unlock(&dlm_node_addrs_spin);
- kfree(new_addr);
- kfree(new_node);
+ if (con->addr_count >= DLM_MAX_ADDR_COUNT) {
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
return -ENOSPC;
}
- na->addr[na->addr_count++] = new_addr;
- spin_unlock(&dlm_node_addrs_spin);
- kfree(new_node);
+ memcpy(&con->addr[con->addr_count++], addr, sizeof(*addr));
+ srcu_read_unlock(&connections_srcu, idx);
+ spin_unlock(&con->addrs_lock);
return 0;
}
/* Data available on socket or listen socket received a connect */
static void lowcomms_data_ready(struct sock *sk)
{
- struct connection *con;
-
- con = sock2con(sk);
- if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags))
- queue_work(recv_workqueue, &con->rwork);
-}
-
-static void lowcomms_listen_data_ready(struct sock *sk)
-{
- if (!dlm_allow_conn)
- return;
+ struct connection *con = sock2con(sk);
- queue_work(recv_workqueue, &listen_con.rwork);
+ set_bit(CF_RECV_INTR, &con->flags);
+ lowcomms_queue_rwork(con);
}
static void lowcomms_write_space(struct sock *sk)
{
- struct connection *con;
-
- con = sock2con(sk);
- if (!con)
- return;
-
- if (!test_and_set_bit(CF_CONNECTED, &con->flags)) {
- log_print("connected to node %d", con->nodeid);
- queue_work(send_workqueue, &con->swork);
- return;
- }
+ struct connection *con = sock2con(sk);
clear_bit(SOCK_NOSPACE, &con->sock->flags);
+ spin_lock_bh(&con->writequeue_lock);
if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) {
con->sock->sk->sk_write_pending--;
clear_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags);
}
- queue_work(send_workqueue, &con->swork);
-}
-
-static inline void lowcomms_connect_sock(struct connection *con)
-{
- if (test_bit(CF_CLOSE, &con->flags))
- return;
- queue_work(send_workqueue, &con->swork);
- cond_resched();
+ lowcomms_queue_swork(con);
+ spin_unlock_bh(&con->writequeue_lock);
}
static void lowcomms_state_change(struct sock *sk)
{
/* SCTP layer is not calling sk_data_ready when the connection
- * is done, so we catch the signal through here. Also, it
- * doesn't switch socket state when entering shutdown, so we
- * skip the write in that case.
+ * is done, so we catch the signal through here.
*/
- if (sk->sk_shutdown) {
- if (sk->sk_shutdown == RCV_SHUTDOWN)
- lowcomms_data_ready(sk);
- } else if (sk->sk_state == TCP_ESTABLISHED) {
- lowcomms_write_space(sk);
- }
+ if (sk->sk_shutdown == RCV_SHUTDOWN)
+ lowcomms_data_ready(sk);
+}
+
+static void lowcomms_listen_data_ready(struct sock *sk)
+{
+ queue_work(io_workqueue, &listen_con.rwork);
}
int dlm_lowcomms_connect_node(int nodeid)
@@ -576,47 +542,49 @@ int dlm_lowcomms_connect_node(int nodeid)
return 0;
idx = srcu_read_lock(&connections_srcu);
- con = nodeid2con(nodeid, GFP_NOFS);
- if (!con) {
+ con = nodeid2con(nodeid, 0);
+ if (WARN_ON_ONCE(!con)) {
srcu_read_unlock(&connections_srcu, idx);
- return -ENOMEM;
+ return -ENOENT;
}
- lowcomms_connect_sock(con);
+ down_read(&con->sock_lock);
+ if (!con->sock) {
+ spin_lock_bh(&con->writequeue_lock);
+ lowcomms_queue_swork(con);
+ spin_unlock_bh(&con->writequeue_lock);
+ }
+ up_read(&con->sock_lock);
srcu_read_unlock(&connections_srcu, idx);
+ cond_resched();
return 0;
}
int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark)
{
- struct dlm_node_addr *na;
+ struct connection *con;
+ int idx;
- spin_lock(&dlm_node_addrs_spin);
- na = find_node_addr(nodeid);
- if (!na) {
- spin_unlock(&dlm_node_addrs_spin);
+ idx = srcu_read_lock(&connections_srcu);
+ con = nodeid2con(nodeid, 0);
+ if (!con) {
+ srcu_read_unlock(&connections_srcu, idx);
return -ENOENT;
}
- na->mark = mark;
- spin_unlock(&dlm_node_addrs_spin);
-
+ spin_lock(&con->addrs_lock);
+ con->mark = mark;
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
return 0;
}
static void lowcomms_error_report(struct sock *sk)
{
- struct connection *con;
- void (*orig_report)(struct sock *) = NULL;
+ struct connection *con = sock2con(sk);
struct inet_sock *inet;
- con = sock2con(sk);
- if (con == NULL)
- goto out;
-
- orig_report = listen_sock.sk_error_report;
-
inet = inet_sk(sk);
switch (sk->sk_family) {
case AF_INET:
@@ -642,66 +610,25 @@ static void lowcomms_error_report(struct sock *sk)
"invalid socket family %d set, "
"sk_err=%d/%d\n", dlm_our_nodeid(),
sk->sk_family, sk->sk_err, sk->sk_err_soft);
- goto out;
- }
-
- /* below sendcon only handling */
- if (test_bit(CF_IS_OTHERCON, &con->flags))
- con = con->sendcon;
-
- switch (sk->sk_err) {
- case ECONNREFUSED:
- set_bit(CF_DELAY_CONNECT, &con->flags);
- break;
- default:
break;
}
- if (!test_and_set_bit(CF_RECONNECT, &con->flags))
- queue_work(send_workqueue, &con->swork);
+ dlm_midcomms_unack_msg_resend(con->nodeid);
-out:
- if (orig_report)
- orig_report(sk);
+ listen_sock.sk_error_report(sk);
}
-/* Note: sk_callback_lock must be locked before calling this function. */
-static void save_listen_callbacks(struct socket *sock)
+static void restore_callbacks(struct sock *sk)
{
- struct sock *sk = sock->sk;
-
- listen_sock.sk_data_ready = sk->sk_data_ready;
- listen_sock.sk_state_change = sk->sk_state_change;
- listen_sock.sk_write_space = sk->sk_write_space;
- listen_sock.sk_error_report = sk->sk_error_report;
-}
-
-static void restore_callbacks(struct socket *sock)
-{
- struct sock *sk = sock->sk;
+#ifdef CONFIG_LOCKDEP
+ WARN_ON_ONCE(!lockdep_sock_is_held(sk));
+#endif
- lock_sock(sk);
sk->sk_user_data = NULL;
sk->sk_data_ready = listen_sock.sk_data_ready;
sk->sk_state_change = listen_sock.sk_state_change;
sk->sk_write_space = listen_sock.sk_write_space;
sk->sk_error_report = listen_sock.sk_error_report;
- release_sock(sk);
-}
-
-static void add_listen_sock(struct socket *sock, struct listen_connection *con)
-{
- struct sock *sk = sock->sk;
-
- lock_sock(sk);
- save_listen_callbacks(sock);
- con->sock = sock;
-
- sk->sk_user_data = con;
- sk->sk_allocation = GFP_NOFS;
- /* Install a data_ready callback */
- sk->sk_data_ready = lowcomms_listen_data_ready;
- release_sock(sk);
}
/* Make a socket active */
@@ -713,11 +640,12 @@ static void add_sock(struct socket *sock, struct connection *con)
con->sock = sock;
sk->sk_user_data = con;
- /* Install a data_ready callback */
sk->sk_data_ready = lowcomms_data_ready;
sk->sk_write_space = lowcomms_write_space;
- sk->sk_state_change = lowcomms_state_change;
+ if (dlm_config.ci_protocol == DLM_PROTO_SCTP)
+ sk->sk_state_change = lowcomms_state_change;
sk->sk_allocation = GFP_NOFS;
+ sk->sk_use_task_frag = false;
sk->sk_error_report = lowcomms_error_report;
release_sock(sk);
}
@@ -727,7 +655,7 @@ static void add_sock(struct socket *sock, struct connection *con)
static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
int *addr_len)
{
- saddr->ss_family = dlm_local_addr[0]->ss_family;
+ saddr->ss_family = dlm_local_addr[0].ss_family;
if (saddr->ss_family == AF_INET) {
struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
in4_addr->sin_port = cpu_to_be16(port);
@@ -773,43 +701,67 @@ static void free_entry(struct writequeue_entry *e)
}
list_del(&e->list);
- atomic_dec(&e->con->writequeue_cnt);
kref_put(&e->ref, dlm_page_release);
}
static void dlm_close_sock(struct socket **sock)
{
- if (*sock) {
- restore_callbacks(*sock);
- sock_release(*sock);
- *sock = NULL;
+ lock_sock((*sock)->sk);
+ restore_callbacks((*sock)->sk);
+ release_sock((*sock)->sk);
+
+ sock_release(*sock);
+ *sock = NULL;
+}
+
+static void allow_connection_io(struct connection *con)
+{
+ if (con->othercon)
+ clear_bit(CF_IO_STOP, &con->othercon->flags);
+ clear_bit(CF_IO_STOP, &con->flags);
+}
+
+static void stop_connection_io(struct connection *con)
+{
+ if (con->othercon)
+ stop_connection_io(con->othercon);
+
+ down_write(&con->sock_lock);
+ if (con->sock) {
+ lock_sock(con->sock->sk);
+ restore_callbacks(con->sock->sk);
+
+ spin_lock_bh(&con->writequeue_lock);
+ set_bit(CF_IO_STOP, &con->flags);
+ spin_unlock_bh(&con->writequeue_lock);
+ release_sock(con->sock->sk);
+ } else {
+ spin_lock_bh(&con->writequeue_lock);
+ set_bit(CF_IO_STOP, &con->flags);
+ spin_unlock_bh(&con->writequeue_lock);
}
+ up_write(&con->sock_lock);
+
+ cancel_work_sync(&con->swork);
+ cancel_work_sync(&con->rwork);
}
/* Close a remote connection and tidy up */
-static void close_connection(struct connection *con, bool and_other,
- bool tx, bool rx)
+static void close_connection(struct connection *con, bool and_other)
{
- bool closing = test_and_set_bit(CF_CLOSING, &con->flags);
struct writequeue_entry *e;
- if (tx && !closing && cancel_work_sync(&con->swork)) {
- log_print("canceled swork for node %d", con->nodeid);
- clear_bit(CF_WRITE_PENDING, &con->flags);
- }
- if (rx && !closing && cancel_work_sync(&con->rwork)) {
- log_print("canceled rwork for node %d", con->nodeid);
- clear_bit(CF_READ_PENDING, &con->flags);
+ if (con->othercon && and_other)
+ close_connection(con->othercon, false);
+
+ down_write(&con->sock_lock);
+ if (!con->sock) {
+ up_write(&con->sock_lock);
+ return;
}
- mutex_lock(&con->sock_mutex);
dlm_close_sock(&con->sock);
- if (con->othercon && and_other) {
- /* Will only re-enter once. */
- close_connection(con->othercon, false, tx, rx);
- }
-
/* if we send a writequeue entry only a half way, we drop the
* whole entry because reconnection and that we not start of the
* middle of a msg which will confuse the other end.
@@ -821,200 +773,209 @@ static void close_connection(struct connection *con, bool and_other,
* our policy is to start on a clean state when disconnects, we don't
* know what's send/received on transport layer in this case.
*/
- spin_lock(&con->writequeue_lock);
+ spin_lock_bh(&con->writequeue_lock);
if (!list_empty(&con->writequeue)) {
e = list_first_entry(&con->writequeue, struct writequeue_entry,
list);
if (e->dirty)
free_entry(e);
}
- spin_unlock(&con->writequeue_lock);
+ spin_unlock_bh(&con->writequeue_lock);
con->rx_leftover = 0;
con->retries = 0;
clear_bit(CF_APP_LIMITED, &con->flags);
- clear_bit(CF_CONNECTED, &con->flags);
- clear_bit(CF_DELAY_CONNECT, &con->flags);
- clear_bit(CF_RECONNECT, &con->flags);
- clear_bit(CF_EOF, &con->flags);
- mutex_unlock(&con->sock_mutex);
- clear_bit(CF_CLOSING, &con->flags);
+ clear_bit(CF_RECV_PENDING, &con->flags);
+ clear_bit(CF_SEND_PENDING, &con->flags);
+ up_write(&con->sock_lock);
}
-static void shutdown_connection(struct connection *con)
+static struct processqueue_entry *new_processqueue_entry(int nodeid,
+ int buflen)
{
- int ret;
-
- flush_work(&con->swork);
+ struct processqueue_entry *pentry;
- mutex_lock(&con->sock_mutex);
- /* nothing to shutdown */
- if (!con->sock) {
- mutex_unlock(&con->sock_mutex);
- return;
- }
+ pentry = kmalloc(sizeof(*pentry), GFP_NOFS);
+ if (!pentry)
+ return NULL;
- set_bit(CF_SHUTDOWN, &con->flags);
- ret = kernel_sock_shutdown(con->sock, SHUT_WR);
- mutex_unlock(&con->sock_mutex);
- if (ret) {
- log_print("Connection %p failed to shutdown: %d will force close",
- con, ret);
- goto force_close;
- } else {
- ret = wait_event_timeout(con->shutdown_wait,
- !test_bit(CF_SHUTDOWN, &con->flags),
- DLM_SHUTDOWN_WAIT_TIMEOUT);
- if (ret == 0) {
- log_print("Connection %p shutdown timed out, will force close",
- con);
- goto force_close;
- }
+ pentry->buf = kmalloc(buflen, GFP_NOFS);
+ if (!pentry->buf) {
+ kfree(pentry);
+ return NULL;
}
- return;
+ pentry->nodeid = nodeid;
+ return pentry;
+}
-force_close:
- clear_bit(CF_SHUTDOWN, &con->flags);
- close_connection(con, false, true, true);
+static void free_processqueue_entry(struct processqueue_entry *pentry)
+{
+ kfree(pentry->buf);
+ kfree(pentry);
}
-static void dlm_tcp_shutdown(struct connection *con)
+struct dlm_processed_nodes {
+ int nodeid;
+
+ struct list_head list;
+};
+
+static void add_processed_node(int nodeid, struct list_head *processed_nodes)
{
- if (con->othercon)
- shutdown_connection(con->othercon);
- shutdown_connection(con);
+ struct dlm_processed_nodes *n;
+
+ list_for_each_entry(n, processed_nodes, list) {
+ /* we already remembered this node */
+ if (n->nodeid == nodeid)
+ return;
+ }
+
+ /* if it's fails in worst case we simple don't send an ack back.
+ * We try it next time.
+ */
+ n = kmalloc(sizeof(*n), GFP_NOFS);
+ if (!n)
+ return;
+
+ n->nodeid = nodeid;
+ list_add(&n->list, processed_nodes);
}
-static int con_realloc_receive_buf(struct connection *con, int newlen)
+static void process_dlm_messages(struct work_struct *work)
{
- unsigned char *newbuf;
+ struct dlm_processed_nodes *n, *n_tmp;
+ struct processqueue_entry *pentry;
+ LIST_HEAD(processed_nodes);
- newbuf = kmalloc(newlen, GFP_NOFS);
- if (!newbuf)
- return -ENOMEM;
+ spin_lock(&processqueue_lock);
+ pentry = list_first_entry_or_null(&processqueue,
+ struct processqueue_entry, list);
+ if (WARN_ON_ONCE(!pentry)) {
+ spin_unlock(&processqueue_lock);
+ return;
+ }
+
+ list_del(&pentry->list);
+ spin_unlock(&processqueue_lock);
- /* copy any leftover from last receive */
- if (con->rx_leftover)
- memmove(newbuf, con->rx_buf, con->rx_leftover);
+ for (;;) {
+ dlm_process_incoming_buffer(pentry->nodeid, pentry->buf,
+ pentry->buflen);
+ add_processed_node(pentry->nodeid, &processed_nodes);
+ free_processqueue_entry(pentry);
+
+ spin_lock(&processqueue_lock);
+ pentry = list_first_entry_or_null(&processqueue,
+ struct processqueue_entry, list);
+ if (!pentry) {
+ process_dlm_messages_pending = false;
+ spin_unlock(&processqueue_lock);
+ break;
+ }
- /* swap to new buffer space */
- kfree(con->rx_buf);
- con->rx_buflen = newlen;
- con->rx_buf = newbuf;
+ list_del(&pentry->list);
+ spin_unlock(&processqueue_lock);
+ }
- return 0;
+ /* send ack back after we processed couple of messages */
+ list_for_each_entry_safe(n, n_tmp, &processed_nodes, list) {
+ list_del(&n->list);
+ dlm_midcomms_receive_done(n->nodeid);
+ kfree(n);
+ }
}
/* Data received from remote end */
-static int receive_from_sock(struct connection *con)
+static int receive_from_sock(struct connection *con, int buflen)
{
+ struct processqueue_entry *pentry;
+ int ret, buflen_real;
struct msghdr msg;
struct kvec iov;
- int ret, buflen;
-
- mutex_lock(&con->sock_mutex);
- if (con->sock == NULL) {
- ret = -EAGAIN;
- goto out_close;
- }
+ pentry = new_processqueue_entry(con->nodeid, buflen);
+ if (!pentry)
+ return DLM_IO_RESCHED;
- /* realloc if we get new buffer size to read out */
- buflen = dlm_config.ci_buffer_size;
- if (con->rx_buflen != buflen && con->rx_leftover <= buflen) {
- ret = con_realloc_receive_buf(con, buflen);
- if (ret < 0)
- goto out_resched;
- }
+ memcpy(pentry->buf, con->rx_leftover_buf, con->rx_leftover);
- for (;;) {
- /* calculate new buffer parameter regarding last receive and
- * possible leftover bytes
- */
- iov.iov_base = con->rx_buf + con->rx_leftover;
- iov.iov_len = con->rx_buflen - con->rx_leftover;
-
- memset(&msg, 0, sizeof(msg));
- msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
- ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len,
- msg.msg_flags);
- trace_dlm_recv(con->nodeid, ret);
- if (ret == -EAGAIN)
- break;
- else if (ret <= 0)
- goto out_close;
-
- /* new buflen according readed bytes and leftover from last receive */
- buflen = ret + con->rx_leftover;
- ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen);
- if (ret < 0)
- goto out_close;
-
- /* calculate leftover bytes from process and put it into begin of
- * the receive buffer, so next receive we have the full message
- * at the start address of the receive buffer.
- */
- con->rx_leftover = buflen - ret;
- if (con->rx_leftover) {
- memmove(con->rx_buf, con->rx_buf + ret,
- con->rx_leftover);
+ /* calculate new buffer parameter regarding last receive and
+ * possible leftover bytes
+ */
+ iov.iov_base = pentry->buf + con->rx_leftover;
+ iov.iov_len = buflen - con->rx_leftover;
+
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
+ clear_bit(CF_RECV_INTR, &con->flags);
+again:
+ ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len,
+ msg.msg_flags);
+ trace_dlm_recv(con->nodeid, ret);
+ if (ret == -EAGAIN) {
+ lock_sock(con->sock->sk);
+ if (test_and_clear_bit(CF_RECV_INTR, &con->flags)) {
+ release_sock(con->sock->sk);
+ goto again;
}
+
+ clear_bit(CF_RECV_PENDING, &con->flags);
+ release_sock(con->sock->sk);
+ free_processqueue_entry(pentry);
+ return DLM_IO_END;
+ } else if (ret == 0) {
+ /* close will clear CF_RECV_PENDING */
+ free_processqueue_entry(pentry);
+ return DLM_IO_EOF;
+ } else if (ret < 0) {
+ free_processqueue_entry(pentry);
+ return ret;
}
- dlm_midcomms_receive_done(con->nodeid);
- mutex_unlock(&con->sock_mutex);
- return 0;
+ /* new buflen according readed bytes and leftover from last receive */
+ buflen_real = ret + con->rx_leftover;
+ ret = dlm_validate_incoming_buffer(con->nodeid, pentry->buf,
+ buflen_real);
+ if (ret < 0) {
+ free_processqueue_entry(pentry);
+ return ret;
+ }
-out_resched:
- if (!test_and_set_bit(CF_READ_PENDING, &con->flags))
- queue_work(recv_workqueue, &con->rwork);
- mutex_unlock(&con->sock_mutex);
- return -EAGAIN;
-
-out_close:
- if (ret == 0) {
- log_print("connection %p got EOF from %d",
- con, con->nodeid);
-
- if (dlm_proto_ops->eof_condition &&
- dlm_proto_ops->eof_condition(con)) {
- set_bit(CF_EOF, &con->flags);
- mutex_unlock(&con->sock_mutex);
- } else {
- mutex_unlock(&con->sock_mutex);
- close_connection(con, false, true, false);
+ pentry->buflen = ret;
- /* handling for tcp shutdown */
- clear_bit(CF_SHUTDOWN, &con->flags);
- wake_up(&con->shutdown_wait);
- }
+ /* calculate leftover bytes from process and put it into begin of
+ * the receive buffer, so next receive we have the full message
+ * at the start address of the receive buffer.
+ */
+ con->rx_leftover = buflen_real - ret;
+ memmove(con->rx_leftover_buf, pentry->buf + ret,
+ con->rx_leftover);
- /* signal to breaking receive worker */
- ret = -1;
- } else {
- mutex_unlock(&con->sock_mutex);
+ spin_lock(&processqueue_lock);
+ list_add_tail(&pentry->list, &processqueue);
+ if (!process_dlm_messages_pending) {
+ process_dlm_messages_pending = true;
+ queue_work(process_workqueue, &process_work);
}
- return ret;
+ spin_unlock(&processqueue_lock);
+
+ return DLM_IO_SUCCESS;
}
/* Listening socket is busy, accept a connection */
-static int accept_from_sock(struct listen_connection *con)
+static int accept_from_sock(void)
{
- int result;
struct sockaddr_storage peeraddr;
- struct socket *newsock;
- int len, idx;
- int nodeid;
+ int len, idx, result, nodeid;
struct connection *newcon;
- struct connection *addcon;
+ struct socket *newsock;
unsigned int mark;
- if (!con->sock)
- return -ENOTCONN;
-
- result = kernel_accept(con->sock, &newsock, O_NONBLOCK);
- if (result < 0)
+ result = kernel_accept(listen_con.sock, &newsock, O_NONBLOCK);
+ if (result == -EAGAIN)
+ return DLM_IO_END;
+ else if (result < 0)
goto accept_err;
/* Get the connected socket's peer */
@@ -1062,16 +1023,16 @@ static int accept_from_sock(struct listen_connection *con)
* In this case we store the incoming one in "othercon"
*/
idx = srcu_read_lock(&connections_srcu);
- newcon = nodeid2con(nodeid, GFP_NOFS);
- if (!newcon) {
+ newcon = nodeid2con(nodeid, 0);
+ if (WARN_ON_ONCE(!newcon)) {
srcu_read_unlock(&connections_srcu, idx);
- result = -ENOMEM;
+ result = -ENOENT;
goto accept_err;
}
sock_set_mark(newsock->sk, mark);
- mutex_lock(&newcon->sock_mutex);
+ down_write(&newcon->sock_lock);
if (newcon->sock) {
struct connection *othercon = newcon->othercon;
@@ -1079,63 +1040,50 @@ static int accept_from_sock(struct listen_connection *con)
othercon = kzalloc(sizeof(*othercon), GFP_NOFS);
if (!othercon) {
log_print("failed to allocate incoming socket");
- mutex_unlock(&newcon->sock_mutex);
+ up_write(&newcon->sock_lock);
srcu_read_unlock(&connections_srcu, idx);
result = -ENOMEM;
goto accept_err;
}
- result = dlm_con_init(othercon, nodeid);
- if (result < 0) {
- kfree(othercon);
- mutex_unlock(&newcon->sock_mutex);
- srcu_read_unlock(&connections_srcu, idx);
- goto accept_err;
- }
-
- lockdep_set_subclass(&othercon->sock_mutex, 1);
- set_bit(CF_IS_OTHERCON, &othercon->flags);
+ dlm_con_init(othercon, nodeid);
+ lockdep_set_subclass(&othercon->sock_lock, 1);
newcon->othercon = othercon;
- othercon->sendcon = newcon;
+ set_bit(CF_IS_OTHERCON, &othercon->flags);
} else {
/* close other sock con if we have something new */
- close_connection(othercon, false, true, false);
+ close_connection(othercon, false);
}
- mutex_lock(&othercon->sock_mutex);
+ down_write(&othercon->sock_lock);
add_sock(newsock, othercon);
- addcon = othercon;
- mutex_unlock(&othercon->sock_mutex);
+
+ /* check if we receved something while adding */
+ lock_sock(othercon->sock->sk);
+ lowcomms_queue_rwork(othercon);
+ release_sock(othercon->sock->sk);
+ up_write(&othercon->sock_lock);
}
else {
/* accept copies the sk after we've saved the callbacks, so we
don't want to save them a second time or comm errors will
result in calling sk_error_report recursively. */
add_sock(newsock, newcon);
- addcon = newcon;
- }
-
- set_bit(CF_CONNECTED, &addcon->flags);
- mutex_unlock(&newcon->sock_mutex);
-
- /*
- * Add it to the active queue in case we got data
- * between processing the accept adding the socket
- * to the read_sockets list
- */
- if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags))
- queue_work(recv_workqueue, &addcon->rwork);
+ /* check if we receved something while adding */
+ lock_sock(newcon->sock->sk);
+ lowcomms_queue_rwork(newcon);
+ release_sock(newcon->sock->sk);
+ }
+ up_write(&newcon->sock_lock);
srcu_read_unlock(&connections_srcu, idx);
- return 0;
+ return DLM_IO_SUCCESS;
accept_err:
if (newsock)
sock_release(newsock);
- if (result != -EAGAIN)
- log_print("error accepting connection from node: %d", result);
return result;
}
@@ -1167,7 +1115,7 @@ static int sctp_bind_addrs(struct socket *sock, uint16_t port)
int i, addr_len, result = 0;
for (i = 0; i < dlm_local_count; i++) {
- memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
+ memcpy(&localaddr, &dlm_local_addr[i], sizeof(localaddr));
make_sockaddr(&localaddr, port, &addr_len);
if (!i)
@@ -1187,7 +1135,7 @@ static int sctp_bind_addrs(struct socket *sock, uint16_t port)
/* Get local addresses */
static void init_local(void)
{
- struct sockaddr_storage sas, *addr;
+ struct sockaddr_storage sas;
int i;
dlm_local_count = 0;
@@ -1195,21 +1143,10 @@ static void init_local(void)
if (dlm_our_addr(&sas, i))
break;
- addr = kmemdup(&sas, sizeof(*addr), GFP_NOFS);
- if (!addr)
- break;
- dlm_local_addr[dlm_local_count++] = addr;
+ memcpy(&dlm_local_addr[dlm_local_count++], &sas, sizeof(sas));
}
}
-static void deinit_local(void)
-{
- int i;
-
- for (i = 0; i < dlm_local_count; i++)
- kfree(dlm_local_addr[i]);
-}
-
static struct writequeue_entry *new_writequeue_entry(struct connection *con)
{
struct writequeue_entry *entry;
@@ -1240,7 +1177,7 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len,
{
struct writequeue_entry *e;
- spin_lock(&con->writequeue_lock);
+ spin_lock_bh(&con->writequeue_lock);
if (!list_empty(&con->writequeue)) {
e = list_last_entry(&con->writequeue, struct writequeue_entry, list);
if (DLM_WQ_REMAIN_BYTES(e) >= len) {
@@ -1263,14 +1200,13 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len,
kref_get(&e->ref);
*ppc = page_address(e->page);
e->end += len;
- atomic_inc(&con->writequeue_cnt);
if (cb)
cb(data);
list_add_tail(&e->list, &con->writequeue);
out:
- spin_unlock(&con->writequeue_lock);
+ spin_unlock_bh(&con->writequeue_lock);
return e;
};
@@ -1319,13 +1255,13 @@ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
len < sizeof(struct dlm_header)) {
BUILD_BUG_ON(PAGE_SIZE < DLM_MAX_SOCKET_BUFSIZE);
log_print("failed to allocate a buffer of size %d", len);
- WARN_ON(1);
+ WARN_ON_ONCE(1);
return NULL;
}
idx = srcu_read_lock(&connections_srcu);
- con = nodeid2con(nodeid, allocation);
- if (!con) {
+ con = nodeid2con(nodeid, 0);
+ if (WARN_ON_ONCE(!con)) {
srcu_read_unlock(&connections_srcu, idx);
return NULL;
}
@@ -1336,6 +1272,8 @@ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
return NULL;
}
+ /* for dlm_lowcomms_commit_msg() */
+ kref_get(&msg->ref);
/* we assume if successful commit must called */
msg->idx = idx;
return msg;
@@ -1348,7 +1286,7 @@ static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg)
struct connection *con = e->con;
int users;
- spin_lock(&con->writequeue_lock);
+ spin_lock_bh(&con->writequeue_lock);
kref_get(&msg->ref);
list_add(&msg->list, &e->msgs);
@@ -1357,13 +1295,11 @@ static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg)
goto out;
e->len = DLM_WQ_LENGTH_BYTES(e);
- spin_unlock(&con->writequeue_lock);
- queue_work(send_workqueue, &con->swork);
- return;
+ lowcomms_queue_swork(con);
out:
- spin_unlock(&con->writequeue_lock);
+ spin_unlock_bh(&con->writequeue_lock);
return;
}
@@ -1375,6 +1311,8 @@ void dlm_lowcomms_commit_msg(struct dlm_msg *msg)
{
_dlm_lowcomms_commit_msg(msg);
srcu_read_unlock(&connections_srcu, msg->idx);
+ /* because dlm_lowcomms_new_msg() */
+ kref_put(&msg->ref, dlm_msg_release);
}
#endif
@@ -1383,7 +1321,7 @@ void dlm_lowcomms_put_msg(struct dlm_msg *msg)
kref_put(&msg->ref, dlm_msg_release);
}
-/* does not held connections_srcu, usage workqueue only */
+/* does not held connections_srcu, usage lowcomms_error_report only */
int dlm_lowcomms_resend_msg(struct dlm_msg *msg)
{
struct dlm_msg *msg_resend;
@@ -1409,90 +1347,79 @@ int dlm_lowcomms_resend_msg(struct dlm_msg *msg)
}
/* Send a message */
-static void send_to_sock(struct connection *con)
+static int send_to_sock(struct connection *con)
{
const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
struct writequeue_entry *e;
int len, offset, ret;
- int count = 0;
- mutex_lock(&con->sock_mutex);
- if (con->sock == NULL)
- goto out_connect;
+ spin_lock_bh(&con->writequeue_lock);
+ e = con_next_wq(con);
+ if (!e) {
+ clear_bit(CF_SEND_PENDING, &con->flags);
+ spin_unlock_bh(&con->writequeue_lock);
+ return DLM_IO_END;
+ }
- spin_lock(&con->writequeue_lock);
- for (;;) {
- e = con_next_wq(con);
- if (!e)
- break;
+ len = e->len;
+ offset = e->offset;
+ WARN_ON_ONCE(len == 0 && e->users == 0);
+ spin_unlock_bh(&con->writequeue_lock);
- len = e->len;
- offset = e->offset;
- BUG_ON(len == 0 && e->users == 0);
- spin_unlock(&con->writequeue_lock);
-
- ret = kernel_sendpage(con->sock, e->page, offset, len,
- msg_flags);
- trace_dlm_send(con->nodeid, ret);
- if (ret == -EAGAIN || ret == 0) {
- if (ret == -EAGAIN &&
- test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) &&
- !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
- /* Notify TCP that we're limited by the
- * application window size.
- */
- set_bit(SOCK_NOSPACE, &con->sock->flags);
- con->sock->sk->sk_write_pending++;
- }
- cond_resched();
- goto out;
- } else if (ret < 0)
- goto out;
-
- /* Don't starve people filling buffers */
- if (++count >= MAX_SEND_MSG_COUNT) {
- cond_resched();
- count = 0;
+ ret = kernel_sendpage(con->sock, e->page, offset, len,
+ msg_flags);
+ trace_dlm_send(con->nodeid, ret);
+ if (ret == -EAGAIN || ret == 0) {
+ lock_sock(con->sock->sk);
+ spin_lock_bh(&con->writequeue_lock);
+ if (test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) &&
+ !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
+ /* Notify TCP that we're limited by the
+ * application window size.
+ */
+ set_bit(SOCK_NOSPACE, &con->sock->sk->sk_socket->flags);
+ con->sock->sk->sk_write_pending++;
+
+ clear_bit(CF_SEND_PENDING, &con->flags);
+ spin_unlock_bh(&con->writequeue_lock);
+ release_sock(con->sock->sk);
+
+ /* wait for write_space() event */
+ return DLM_IO_END;
}
+ spin_unlock_bh(&con->writequeue_lock);
+ release_sock(con->sock->sk);
- spin_lock(&con->writequeue_lock);
- writequeue_entry_complete(e, ret);
- }
- spin_unlock(&con->writequeue_lock);
-
- /* close if we got EOF */
- if (test_and_clear_bit(CF_EOF, &con->flags)) {
- mutex_unlock(&con->sock_mutex);
- close_connection(con, false, false, true);
-
- /* handling for tcp shutdown */
- clear_bit(CF_SHUTDOWN, &con->flags);
- wake_up(&con->shutdown_wait);
- } else {
- mutex_unlock(&con->sock_mutex);
+ return DLM_IO_RESCHED;
+ } else if (ret < 0) {
+ return ret;
}
- return;
-
-out:
- mutex_unlock(&con->sock_mutex);
- return;
+ spin_lock_bh(&con->writequeue_lock);
+ writequeue_entry_complete(e, ret);
+ spin_unlock_bh(&con->writequeue_lock);
-out_connect:
- mutex_unlock(&con->sock_mutex);
- queue_work(send_workqueue, &con->swork);
- cond_resched();
+ return DLM_IO_SUCCESS;
}
static void clean_one_writequeue(struct connection *con)
{
struct writequeue_entry *e, *safe;
- spin_lock(&con->writequeue_lock);
+ spin_lock_bh(&con->writequeue_lock);
list_for_each_entry_safe(e, safe, &con->writequeue, list) {
free_entry(e);
}
- spin_unlock(&con->writequeue_lock);
+ spin_unlock_bh(&con->writequeue_lock);
+}
+
+static void connection_release(struct rcu_head *rcu)
+{
+ struct connection *con = container_of(rcu, struct connection, rcu);
+
+ WARN_ON_ONCE(!list_empty(&con->writequeue));
+ WARN_ON_ONCE(con->sock);
+ kfree(con);
}
/* Called from recovery when it knows that a node has
@@ -1500,286 +1427,311 @@ static void clean_one_writequeue(struct connection *con)
int dlm_lowcomms_close(int nodeid)
{
struct connection *con;
- struct dlm_node_addr *na;
int idx;
log_print("closing connection to node %d", nodeid);
+
idx = srcu_read_lock(&connections_srcu);
con = nodeid2con(nodeid, 0);
- if (con) {
- set_bit(CF_CLOSE, &con->flags);
- close_connection(con, true, true, true);
- clean_one_writequeue(con);
+ if (WARN_ON_ONCE(!con)) {
+ srcu_read_unlock(&connections_srcu, idx);
+ return -ENOENT;
+ }
+
+ stop_connection_io(con);
+ log_print("io handling for node: %d stopped", nodeid);
+ close_connection(con, true);
+
+ spin_lock(&connections_lock);
+ hlist_del_rcu(&con->list);
+ spin_unlock(&connections_lock);
+
+ clean_one_writequeue(con);
+ call_srcu(&connections_srcu, &con->rcu, connection_release);
+ if (con->othercon) {
+ clean_one_writequeue(con->othercon);
if (con->othercon)
- clean_one_writequeue(con->othercon);
+ call_srcu(&connections_srcu, &con->othercon->rcu, connection_release);
}
srcu_read_unlock(&connections_srcu, idx);
- spin_lock(&dlm_node_addrs_spin);
- na = find_node_addr(nodeid);
- if (na) {
- list_del(&na->list);
- while (na->addr_count--)
- kfree(na->addr[na->addr_count]);
- kfree(na);
- }
- spin_unlock(&dlm_node_addrs_spin);
+ /* for debugging we print when we are done to compare with other
+ * messages in between. This function need to be correctly synchronized
+ * with io handling
+ */
+ log_print("closing connection to node %d done", nodeid);
return 0;
}
-/* Receive workqueue function */
+/* Receive worker function */
static void process_recv_sockets(struct work_struct *work)
{
struct connection *con = container_of(work, struct connection, rwork);
+ int ret, buflen;
+
+ down_read(&con->sock_lock);
+ if (!con->sock) {
+ up_read(&con->sock_lock);
+ return;
+ }
+
+ buflen = READ_ONCE(dlm_config.ci_buffer_size);
+ do {
+ ret = receive_from_sock(con, buflen);
+ } while (ret == DLM_IO_SUCCESS);
+ up_read(&con->sock_lock);
- clear_bit(CF_READ_PENDING, &con->flags);
- receive_from_sock(con);
+ switch (ret) {
+ case DLM_IO_END:
+ /* CF_RECV_PENDING cleared */
+ break;
+ case DLM_IO_EOF:
+ close_connection(con, false);
+ /* CF_RECV_PENDING cleared */
+ break;
+ case DLM_IO_RESCHED:
+ cond_resched();
+ queue_work(io_workqueue, &con->rwork);
+ /* CF_RECV_PENDING not cleared */
+ break;
+ default:
+ if (ret < 0) {
+ if (test_bit(CF_IS_OTHERCON, &con->flags)) {
+ close_connection(con, false);
+ } else {
+ spin_lock_bh(&con->writequeue_lock);
+ lowcomms_queue_swork(con);
+ spin_unlock_bh(&con->writequeue_lock);
+ }
+
+ /* CF_RECV_PENDING cleared for othercon
+ * we trigger send queue if not already done
+ * and process_send_sockets will handle it
+ */
+ break;
+ }
+
+ WARN_ON_ONCE(1);
+ break;
+ }
}
static void process_listen_recv_socket(struct work_struct *work)
{
- accept_from_sock(&listen_con);
+ int ret;
+
+ if (WARN_ON_ONCE(!listen_con.sock))
+ return;
+
+ do {
+ ret = accept_from_sock();
+ } while (ret == DLM_IO_SUCCESS);
+
+ if (ret < 0)
+ log_print("critical error accepting connection: %d", ret);
}
-static void dlm_connect(struct connection *con)
+static int dlm_connect(struct connection *con)
{
struct sockaddr_storage addr;
int result, addr_len;
struct socket *sock;
unsigned int mark;
- /* Some odd races can cause double-connects, ignore them */
- if (con->retries++ > MAX_CONNECT_RETRIES)
- return;
-
- if (con->sock) {
- log_print("node %d already connected.", con->nodeid);
- return;
- }
-
memset(&addr, 0, sizeof(addr));
result = nodeid_to_addr(con->nodeid, &addr, NULL,
dlm_proto_ops->try_new_addr, &mark);
if (result < 0) {
log_print("no address for nodeid %d", con->nodeid);
- return;
+ return result;
}
/* Create a socket to communicate with */
- result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+ result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family,
SOCK_STREAM, dlm_proto_ops->proto, &sock);
if (result < 0)
- goto socket_err;
+ return result;
sock_set_mark(sock->sk, mark);
dlm_proto_ops->sockopts(sock);
- add_sock(sock, con);
-
result = dlm_proto_ops->bind(sock);
- if (result < 0)
- goto add_sock_err;
+ if (result < 0) {
+ sock_release(sock);
+ return result;
+ }
+
+ add_sock(sock, con);
log_print_ratelimited("connecting to %d", con->nodeid);
make_sockaddr(&addr, dlm_config.ci_tcp_port, &addr_len);
result = dlm_proto_ops->connect(con, sock, (struct sockaddr *)&addr,
addr_len);
- if (result < 0)
- goto add_sock_err;
-
- return;
-
-add_sock_err:
- dlm_close_sock(&con->sock);
+ switch (result) {
+ case -EINPROGRESS:
+ /* not an error */
+ fallthrough;
+ case 0:
+ break;
+ default:
+ if (result < 0)
+ dlm_close_sock(&con->sock);
-socket_err:
- /*
- * Some errors are fatal and this list might need adjusting. For other
- * errors we try again until the max number of retries is reached.
- */
- if (result != -EHOSTUNREACH &&
- result != -ENETUNREACH &&
- result != -ENETDOWN &&
- result != -EINVAL &&
- result != -EPROTONOSUPPORT) {
- log_print("connect %d try %d error %d", con->nodeid,
- con->retries, result);
- msleep(1000);
- lowcomms_connect_sock(con);
+ break;
}
+
+ return result;
}
-/* Send workqueue function */
+/* Send worker function */
static void process_send_sockets(struct work_struct *work)
{
struct connection *con = container_of(work, struct connection, swork);
+ int ret;
- WARN_ON(test_bit(CF_IS_OTHERCON, &con->flags));
-
- clear_bit(CF_WRITE_PENDING, &con->flags);
+ WARN_ON_ONCE(test_bit(CF_IS_OTHERCON, &con->flags));
- if (test_and_clear_bit(CF_RECONNECT, &con->flags)) {
- close_connection(con, false, false, true);
- dlm_midcomms_unack_msg_resend(con->nodeid);
+ down_read(&con->sock_lock);
+ if (!con->sock) {
+ up_read(&con->sock_lock);
+ down_write(&con->sock_lock);
+ if (!con->sock) {
+ ret = dlm_connect(con);
+ switch (ret) {
+ case 0:
+ break;
+ case -EINPROGRESS:
+ /* avoid spamming resched on connection
+ * we might can switch to a state_change
+ * event based mechanism if established
+ */
+ msleep(100);
+ break;
+ default:
+ /* CF_SEND_PENDING not cleared */
+ up_write(&con->sock_lock);
+ log_print("connect to node %d try %d error %d",
+ con->nodeid, con->retries++, ret);
+ msleep(1000);
+ /* For now we try forever to reconnect. In
+ * future we should send a event to cluster
+ * manager to fence itself after certain amount
+ * of retries.
+ */
+ queue_work(io_workqueue, &con->swork);
+ return;
+ }
+ }
+ downgrade_write(&con->sock_lock);
}
- if (con->sock == NULL) {
- if (test_and_clear_bit(CF_DELAY_CONNECT, &con->flags))
- msleep(1000);
+ do {
+ ret = send_to_sock(con);
+ } while (ret == DLM_IO_SUCCESS);
+ up_read(&con->sock_lock);
- mutex_lock(&con->sock_mutex);
- dlm_connect(con);
- mutex_unlock(&con->sock_mutex);
- }
+ switch (ret) {
+ case DLM_IO_END:
+ /* CF_SEND_PENDING cleared */
+ break;
+ case DLM_IO_RESCHED:
+ /* CF_SEND_PENDING not cleared */
+ cond_resched();
+ queue_work(io_workqueue, &con->swork);
+ break;
+ default:
+ if (ret < 0) {
+ close_connection(con, false);
+
+ /* CF_SEND_PENDING cleared */
+ spin_lock_bh(&con->writequeue_lock);
+ lowcomms_queue_swork(con);
+ spin_unlock_bh(&con->writequeue_lock);
+ break;
+ }
- if (!list_empty(&con->writequeue))
- send_to_sock(con);
+ WARN_ON_ONCE(1);
+ break;
+ }
}
static void work_stop(void)
{
- if (recv_workqueue) {
- destroy_workqueue(recv_workqueue);
- recv_workqueue = NULL;
+ if (io_workqueue) {
+ destroy_workqueue(io_workqueue);
+ io_workqueue = NULL;
}
- if (send_workqueue) {
- destroy_workqueue(send_workqueue);
- send_workqueue = NULL;
+ if (process_workqueue) {
+ destroy_workqueue(process_workqueue);
+ process_workqueue = NULL;
}
}
static int work_start(void)
{
- recv_workqueue = alloc_ordered_workqueue("dlm_recv", WQ_MEM_RECLAIM);
- if (!recv_workqueue) {
- log_print("can't start dlm_recv");
+ io_workqueue = alloc_workqueue("dlm_io", WQ_HIGHPRI | WQ_MEM_RECLAIM,
+ 0);
+ if (!io_workqueue) {
+ log_print("can't start dlm_io");
return -ENOMEM;
}
- send_workqueue = alloc_ordered_workqueue("dlm_send", WQ_MEM_RECLAIM);
- if (!send_workqueue) {
- log_print("can't start dlm_send");
- destroy_workqueue(recv_workqueue);
- recv_workqueue = NULL;
+ /* ordered dlm message process queue,
+ * should be converted to a tasklet
+ */
+ process_workqueue = alloc_ordered_workqueue("dlm_process",
+ WQ_HIGHPRI | WQ_MEM_RECLAIM);
+ if (!process_workqueue) {
+ log_print("can't start dlm_process");
+ destroy_workqueue(io_workqueue);
+ io_workqueue = NULL;
return -ENOMEM;
}
return 0;
}
-static void shutdown_conn(struct connection *con)
-{
- if (dlm_proto_ops->shutdown_action)
- dlm_proto_ops->shutdown_action(con);
-}
-
void dlm_lowcomms_shutdown(void)
{
- int idx;
-
- /* Set all the flags to prevent any
- * socket activity.
- */
- dlm_allow_conn = 0;
-
- if (recv_workqueue)
- flush_workqueue(recv_workqueue);
- if (send_workqueue)
- flush_workqueue(send_workqueue);
+ /* stop lowcomms_listen_data_ready calls */
+ lock_sock(listen_con.sock->sk);
+ listen_con.sock->sk->sk_data_ready = listen_sock.sk_data_ready;
+ release_sock(listen_con.sock->sk);
+ cancel_work_sync(&listen_con.rwork);
dlm_close_sock(&listen_con.sock);
- idx = srcu_read_lock(&connections_srcu);
- foreach_conn(shutdown_conn);
- srcu_read_unlock(&connections_srcu, idx);
-}
-
-static void _stop_conn(struct connection *con, bool and_other)
-{
- mutex_lock(&con->sock_mutex);
- set_bit(CF_CLOSE, &con->flags);
- set_bit(CF_READ_PENDING, &con->flags);
- set_bit(CF_WRITE_PENDING, &con->flags);
- if (con->sock && con->sock->sk) {
- lock_sock(con->sock->sk);
- con->sock->sk->sk_user_data = NULL;
- release_sock(con->sock->sk);
- }
- if (con->othercon && and_other)
- _stop_conn(con->othercon, false);
- mutex_unlock(&con->sock_mutex);
-}
-
-static void stop_conn(struct connection *con)
-{
- _stop_conn(con, true);
+ flush_workqueue(process_workqueue);
}
-static void connection_release(struct rcu_head *rcu)
+void dlm_lowcomms_shutdown_node(int nodeid, bool force)
{
- struct connection *con = container_of(rcu, struct connection, rcu);
-
- kfree(con->rx_buf);
- kfree(con);
-}
+ struct connection *con;
+ int idx;
-static void free_conn(struct connection *con)
-{
- close_connection(con, true, true, true);
- spin_lock(&connections_lock);
- hlist_del_rcu(&con->list);
- spin_unlock(&connections_lock);
- if (con->othercon) {
- clean_one_writequeue(con->othercon);
- call_srcu(&connections_srcu, &con->othercon->rcu,
- connection_release);
+ idx = srcu_read_lock(&connections_srcu);
+ con = nodeid2con(nodeid, 0);
+ if (WARN_ON_ONCE(!con)) {
+ srcu_read_unlock(&connections_srcu, idx);
+ return;
}
- clean_one_writequeue(con);
- call_srcu(&connections_srcu, &con->rcu, connection_release);
-}
-static void work_flush(void)
-{
- int ok;
- int i;
- struct connection *con;
-
- do {
- ok = 1;
- foreach_conn(stop_conn);
- if (recv_workqueue)
- flush_workqueue(recv_workqueue);
- if (send_workqueue)
- flush_workqueue(send_workqueue);
- for (i = 0; i < CONN_HASH_SIZE && ok; i++) {
- hlist_for_each_entry_rcu(con, &connection_hash[i],
- list) {
- ok &= test_bit(CF_READ_PENDING, &con->flags);
- ok &= test_bit(CF_WRITE_PENDING, &con->flags);
- if (con->othercon) {
- ok &= test_bit(CF_READ_PENDING,
- &con->othercon->flags);
- ok &= test_bit(CF_WRITE_PENDING,
- &con->othercon->flags);
- }
- }
- }
- } while (!ok);
+ flush_work(&con->swork);
+ stop_connection_io(con);
+ WARN_ON_ONCE(!force && !list_empty(&con->writequeue));
+ close_connection(con, true);
+ clean_one_writequeue(con);
+ if (con->othercon)
+ clean_one_writequeue(con->othercon);
+ allow_connection_io(con);
+ srcu_read_unlock(&connections_srcu, idx);
}
void dlm_lowcomms_stop(void)
{
- int idx;
-
- idx = srcu_read_lock(&connections_srcu);
- work_flush();
- foreach_conn(free_conn);
- srcu_read_unlock(&connections_srcu, idx);
work_stop();
- deinit_local();
-
dlm_proto_ops = NULL;
}
@@ -1795,7 +1747,7 @@ static int dlm_listen_for_all(void)
if (result < 0)
return result;
- result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+ result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family,
SOCK_STREAM, dlm_proto_ops->proto, &sock);
if (result < 0) {
log_print("Can't create comms socket: %d", result);
@@ -1809,14 +1761,23 @@ static int dlm_listen_for_all(void)
if (result < 0)
goto out;
- save_listen_callbacks(sock);
- add_listen_sock(sock, &listen_con);
+ lock_sock(sock->sk);
+ listen_sock.sk_data_ready = sock->sk->sk_data_ready;
+ listen_sock.sk_write_space = sock->sk->sk_write_space;
+ listen_sock.sk_error_report = sock->sk->sk_error_report;
+ listen_sock.sk_state_change = sock->sk->sk_state_change;
+
+ listen_con.sock = sock;
+
+ sock->sk->sk_allocation = GFP_NOFS;
+ sock->sk->sk_use_task_frag = false;
+ sock->sk->sk_data_ready = lowcomms_listen_data_ready;
+ release_sock(sock->sk);
- INIT_WORK(&listen_con.rwork, process_listen_recv_socket);
result = sock->ops->listen(sock, 5);
if (result < 0) {
dlm_close_sock(&listen_con.sock);
- goto out;
+ return result;
}
return 0;
@@ -1834,7 +1795,7 @@ static int dlm_tcp_bind(struct socket *sock)
/* Bind to our cluster-known address connecting to avoid
* routing problems.
*/
- memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr));
+ memcpy(&src_addr, &dlm_local_addr[0], sizeof(src_addr));
make_sockaddr(&src_addr, 0, &addr_len);
result = sock->ops->bind(sock, (struct sockaddr *)&src_addr,
@@ -1850,17 +1811,7 @@ static int dlm_tcp_bind(struct socket *sock)
static int dlm_tcp_connect(struct connection *con, struct socket *sock,
struct sockaddr *addr, int addr_len)
{
- int ret;
-
- ret = sock->ops->connect(sock, addr, addr_len, O_NONBLOCK);
- switch (ret) {
- case -EINPROGRESS:
- fallthrough;
- case 0:
- return 0;
- }
-
- return ret;
+ return sock->ops->connect(sock, addr, addr_len, O_NONBLOCK);
}
static int dlm_tcp_listen_validate(void)
@@ -1891,8 +1842,8 @@ static int dlm_tcp_listen_bind(struct socket *sock)
int addr_len;
/* Bind to our port */
- make_sockaddr(dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len);
- return sock->ops->bind(sock, (struct sockaddr *)dlm_local_addr[0],
+ make_sockaddr(&dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len);
+ return sock->ops->bind(sock, (struct sockaddr *)&dlm_local_addr[0],
addr_len);
}
@@ -1905,8 +1856,6 @@ static const struct dlm_proto_ops dlm_tcp_ops = {
.listen_validate = dlm_tcp_listen_validate,
.listen_sockopts = dlm_tcp_listen_sockopts,
.listen_bind = dlm_tcp_listen_bind,
- .shutdown_action = dlm_tcp_shutdown,
- .eof_condition = tcp_eof_condition,
};
static int dlm_sctp_bind(struct socket *sock)
@@ -1927,13 +1876,7 @@ static int dlm_sctp_connect(struct connection *con, struct socket *sock,
sock_set_sndtimeo(sock->sk, 5);
ret = sock->ops->connect(sock, addr, addr_len, 0);
sock_set_sndtimeo(sock->sk, 0);
- if (ret < 0)
- return ret;
-
- if (!test_and_set_bit(CF_CONNECTED, &con->flags))
- log_print("connected to node %d", con->nodeid);
-
- return 0;
+ return ret;
}
static int dlm_sctp_listen_validate(void)
@@ -1973,11 +1916,7 @@ static const struct dlm_proto_ops dlm_sctp_ops = {
int dlm_lowcomms_start(void)
{
- int error = -EINVAL;
- int i;
-
- for (i = 0; i < CONN_HASH_SIZE; i++)
- INIT_HLIST_HEAD(&connection_hash[i]);
+ int error;
init_local();
if (!dlm_local_count) {
@@ -1986,13 +1925,9 @@ int dlm_lowcomms_start(void)
goto fail;
}
- INIT_WORK(&listen_con.rwork, process_listen_recv_socket);
-
error = work_start();
if (error)
- goto fail_local;
-
- dlm_allow_conn = 1;
+ goto fail;
/* Start listening */
switch (dlm_config.ci_protocol) {
@@ -2018,25 +1953,38 @@ int dlm_lowcomms_start(void)
fail_listen:
dlm_proto_ops = NULL;
fail_proto_ops:
- dlm_allow_conn = 0;
- dlm_close_sock(&listen_con.sock);
work_stop();
-fail_local:
- deinit_local();
fail:
return error;
}
+void dlm_lowcomms_init(void)
+{
+ int i;
+
+ for (i = 0; i < CONN_HASH_SIZE; i++)
+ INIT_HLIST_HEAD(&connection_hash[i]);
+
+ INIT_WORK(&listen_con.rwork, process_listen_recv_socket);
+}
+
void dlm_lowcomms_exit(void)
{
- struct dlm_node_addr *na, *safe;
+ struct connection *con;
+ int i, idx;
- spin_lock(&dlm_node_addrs_spin);
- list_for_each_entry_safe(na, safe, &dlm_node_addrs, list) {
- list_del(&na->list);
- while (na->addr_count--)
- kfree(na->addr[na->addr_count]);
- kfree(na);
+ idx = srcu_read_lock(&connections_srcu);
+ for (i = 0; i < CONN_HASH_SIZE; i++) {
+ hlist_for_each_entry_rcu(con, &connection_hash[i], list) {
+ spin_lock(&connections_lock);
+ hlist_del_rcu(&con->list);
+ spin_unlock(&connections_lock);
+
+ if (con->othercon)
+ call_srcu(&connections_srcu, &con->othercon->rcu,
+ connection_release);
+ call_srcu(&connections_srcu, &con->rcu, connection_release);
+ }
}
- spin_unlock(&dlm_node_addrs_spin);
+ srcu_read_unlock(&connections_srcu, idx);
}
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
index 29369feea991..3e8dca66183b 100644
--- a/fs/dlm/lowcomms.h
+++ b/fs/dlm/lowcomms.h
@@ -29,12 +29,14 @@ static inline int nodeid_hash(int nodeid)
return nodeid & (CONN_HASH_SIZE-1);
}
-/* switch to check if dlm is running */
-extern int dlm_allow_conn;
+/* check if dlm is running */
+bool dlm_lowcomms_is_running(void);
int dlm_lowcomms_start(void);
void dlm_lowcomms_shutdown(void);
+void dlm_lowcomms_shutdown_node(int nodeid, bool force);
void dlm_lowcomms_stop(void);
+void dlm_lowcomms_init(void);
void dlm_lowcomms_exit(void);
int dlm_lowcomms_close(int nodeid);
struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index 1c5be4b70ac1..a77338be3237 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -17,7 +17,7 @@
#include "user.h"
#include "memory.h"
#include "config.h"
-#include "lowcomms.h"
+#include "midcomms.h"
#define CREATE_TRACE_POINTS
#include <trace/events/dlm.h>
@@ -30,6 +30,8 @@ static int __init init_dlm(void)
if (error)
goto out;
+ dlm_midcomms_init();
+
error = dlm_lockspace_init();
if (error)
goto out_mem;
@@ -66,6 +68,7 @@ static int __init init_dlm(void)
out_lockspace:
dlm_lockspace_exit();
out_mem:
+ dlm_midcomms_exit();
dlm_memory_exit();
out:
return error;
@@ -79,7 +82,7 @@ static void __exit exit_dlm(void)
dlm_config_exit();
dlm_memory_exit();
dlm_lockspace_exit();
- dlm_lowcomms_exit();
+ dlm_midcomms_exit();
dlm_unregister_debugfs();
}
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 2af2ccfe43a9..923c01a8a0aa 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -573,7 +573,10 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
node = &rv->nodes[i];
if (dlm_is_member(ls, node->nodeid))
continue;
- dlm_add_member(ls, node);
+ error = dlm_add_member(ls, node);
+ if (error)
+ return error;
+
log_rinfo(ls, "add member %d", node->nodeid);
}
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index ce35c3c19aeb..eb7a08641fcf 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -14,12 +14,14 @@
#include "lowcomms.h"
#include "config.h"
#include "memory.h"
+#include "ast.h"
static struct kmem_cache *writequeue_cache;
static struct kmem_cache *mhandle_cache;
static struct kmem_cache *msg_cache;
static struct kmem_cache *lkb_cache;
static struct kmem_cache *rsb_cache;
+static struct kmem_cache *cb_cache;
int __init dlm_memory_init(void)
@@ -46,8 +48,16 @@ int __init dlm_memory_init(void)
if (!rsb_cache)
goto rsb;
+ cb_cache = kmem_cache_create("dlm_cb", sizeof(struct dlm_callback),
+ __alignof__(struct dlm_callback), 0,
+ NULL);
+ if (!rsb_cache)
+ goto cb;
+
return 0;
+cb:
+ kmem_cache_destroy(rsb_cache);
rsb:
kmem_cache_destroy(msg_cache);
msg:
@@ -67,6 +77,7 @@ void dlm_memory_exit(void)
kmem_cache_destroy(msg_cache);
kmem_cache_destroy(lkb_cache);
kmem_cache_destroy(rsb_cache);
+ kmem_cache_destroy(cb_cache);
}
char *dlm_allocate_lvb(struct dlm_ls *ls)
@@ -115,12 +126,17 @@ void dlm_free_lkb(struct dlm_lkb *lkb)
kfree(ua);
}
}
+
+ /* drop references if they are set */
+ dlm_callback_set_last_ptr(&lkb->lkb_last_cast, NULL);
+ dlm_callback_set_last_ptr(&lkb->lkb_last_cb, NULL);
+
kmem_cache_free(lkb_cache, lkb);
}
-struct dlm_mhandle *dlm_allocate_mhandle(void)
+struct dlm_mhandle *dlm_allocate_mhandle(gfp_t allocation)
{
- return kmem_cache_alloc(mhandle_cache, GFP_NOFS);
+ return kmem_cache_alloc(mhandle_cache, allocation);
}
void dlm_free_mhandle(struct dlm_mhandle *mhandle)
@@ -147,3 +163,13 @@ void dlm_free_msg(struct dlm_msg *msg)
{
kmem_cache_free(msg_cache, msg);
}
+
+struct dlm_callback *dlm_allocate_cb(void)
+{
+ return kmem_cache_alloc(cb_cache, GFP_ATOMIC);
+}
+
+void dlm_free_cb(struct dlm_callback *cb)
+{
+ kmem_cache_free(cb_cache, cb);
+}
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
index 7bd3f1a391ca..6b29563d24f7 100644
--- a/fs/dlm/memory.h
+++ b/fs/dlm/memory.h
@@ -20,12 +20,14 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls);
void dlm_free_lkb(struct dlm_lkb *l);
char *dlm_allocate_lvb(struct dlm_ls *ls);
void dlm_free_lvb(char *l);
-struct dlm_mhandle *dlm_allocate_mhandle(void);
+struct dlm_mhandle *dlm_allocate_mhandle(gfp_t allocation);
void dlm_free_mhandle(struct dlm_mhandle *mhandle);
struct writequeue_entry *dlm_allocate_writequeue(void);
void dlm_free_writequeue(struct writequeue_entry *writequeue);
struct dlm_msg *dlm_allocate_msg(gfp_t allocation);
void dlm_free_msg(struct dlm_msg *msg);
+struct dlm_callback *dlm_allocate_cb(void);
+void dlm_free_cb(struct dlm_callback *cb);
#endif /* __MEMORY_DOT_H__ */
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 6489bc22ad61..fc015a6abe17 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -132,6 +132,7 @@
*/
#define DLM_DEBUG_FENCE_TERMINATION 0
+#include <trace/events/dlm.h>
#include <net/tcp.h>
#include "dlm_internal.h"
@@ -194,7 +195,7 @@ struct midcomms_node {
};
struct dlm_mhandle {
- const struct dlm_header *inner_hd;
+ const union dlm_packet *inner_p;
struct midcomms_node *node;
struct dlm_opts *opts;
struct dlm_msg *msg;
@@ -305,11 +306,11 @@ static void dlm_send_queue_flush(struct midcomms_node *node)
pr_debug("flush midcomms send queue of node %d\n", node->nodeid);
rcu_read_lock();
- spin_lock(&node->send_queue_lock);
+ spin_lock_bh(&node->send_queue_lock);
list_for_each_entry_rcu(mh, &node->send_queue, list) {
dlm_mhandle_delete(node, mh);
}
- spin_unlock(&node->send_queue_lock);
+ spin_unlock_bh(&node->send_queue_lock);
rcu_read_unlock();
}
@@ -415,7 +416,7 @@ static int dlm_send_fin(struct midcomms_node *node,
m_header->h_cmd = DLM_FIN;
pr_debug("sending fin msg to node %d\n", node->nodeid);
- dlm_midcomms_commit_mhandle(mh);
+ dlm_midcomms_commit_mhandle(mh, NULL, 0);
set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags);
return 0;
@@ -436,7 +437,7 @@ static void dlm_receive_ack(struct midcomms_node *node, uint32_t seq)
}
}
- spin_lock(&node->send_queue_lock);
+ spin_lock_bh(&node->send_queue_lock);
list_for_each_entry_rcu(mh, &node->send_queue, list) {
if (before(mh->seq, seq)) {
dlm_mhandle_delete(node, mh);
@@ -445,7 +446,7 @@ static void dlm_receive_ack(struct midcomms_node *node, uint32_t seq)
break;
}
}
- spin_unlock(&node->send_queue_lock);
+ spin_unlock_bh(&node->send_queue_lock);
rcu_read_unlock();
}
@@ -468,12 +469,26 @@ static void dlm_pas_fin_ack_rcv(struct midcomms_node *node)
spin_unlock(&node->state_lock);
log_print("%s: unexpected state: %d\n",
__func__, node->state);
- WARN_ON(1);
+ WARN_ON_ONCE(1);
return;
}
spin_unlock(&node->state_lock);
}
+static void dlm_receive_buffer_3_2_trace(uint32_t seq, union dlm_packet *p)
+{
+ switch (p->header.h_cmd) {
+ case DLM_MSG:
+ trace_dlm_recv_message(dlm_our_nodeid(), seq, &p->message);
+ break;
+ case DLM_RCOM:
+ trace_dlm_recv_rcom(dlm_our_nodeid(), seq, &p->rcom);
+ break;
+ default:
+ break;
+ }
+}
+
static void dlm_midcomms_receive_buffer(union dlm_packet *p,
struct midcomms_node *node,
uint32_t seq)
@@ -525,7 +540,7 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p,
spin_unlock(&node->state_lock);
log_print("%s: unexpected state: %d\n",
__func__, node->state);
- WARN_ON(1);
+ WARN_ON_ONCE(1);
return;
}
spin_unlock(&node->state_lock);
@@ -533,7 +548,8 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p,
set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags);
break;
default:
- WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags));
+ WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags));
+ dlm_receive_buffer_3_2_trace(seq, p);
dlm_receive_buffer(p, node->nodeid);
set_bit(DLM_NODE_ULP_DELIVERED, &node->flags);
break;
@@ -754,7 +770,7 @@ static void dlm_midcomms_receive_buffer_3_2(union dlm_packet *p, int nodeid)
goto out;
}
- WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags));
+ WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags));
dlm_receive_buffer(p, nodeid);
break;
case DLM_OPTS:
@@ -874,12 +890,7 @@ static void dlm_midcomms_receive_buffer_3_1(union dlm_packet *p, int nodeid)
dlm_receive_buffer(p, nodeid);
}
-/*
- * Called from the low-level comms layer to process a buffer of
- * commands.
- */
-
-int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
+int dlm_validate_incoming_buffer(int nodeid, unsigned char *buf, int len)
{
const unsigned char *ptr = buf;
const struct dlm_header *hd;
@@ -914,6 +925,32 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
if (msglen > len)
break;
+ ret += msglen;
+ len -= msglen;
+ ptr += msglen;
+ }
+
+ return ret;
+}
+
+/*
+ * Called from the low-level comms layer to process a buffer of
+ * commands.
+ */
+int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
+{
+ const unsigned char *ptr = buf;
+ const struct dlm_header *hd;
+ uint16_t msglen;
+ int ret = 0;
+
+ while (len >= sizeof(struct dlm_header)) {
+ hd = (struct dlm_header *)ptr;
+
+ msglen = le16_to_cpu(hd->h_length);
+ if (msglen > len)
+ break;
+
switch (hd->h_version) {
case cpu_to_le32(DLM_VERSION_3_1):
dlm_midcomms_receive_buffer_3_1((union dlm_packet *)ptr, nodeid);
@@ -1030,9 +1067,9 @@ static void midcomms_new_msg_cb(void *data)
atomic_inc(&mh->node->send_queue_cnt);
- spin_lock(&mh->node->send_queue_lock);
+ spin_lock_bh(&mh->node->send_queue_lock);
list_add_tail_rcu(&mh->list, &mh->node->send_queue);
- spin_unlock(&mh->node->send_queue_lock);
+ spin_unlock_bh(&mh->node->send_queue_lock);
mh->seq = mh->node->seq_send++;
}
@@ -1055,7 +1092,7 @@ static struct dlm_msg *dlm_midcomms_get_msg_3_2(struct dlm_mhandle *mh, int node
dlm_fill_opts_header(opts, len, mh->seq);
*ppc += sizeof(*opts);
- mh->inner_hd = (const struct dlm_header *)*ppc;
+ mh->inner_p = (const union dlm_packet *)*ppc;
return msg;
}
@@ -1079,9 +1116,9 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
}
/* this is a bug, however we going on and hope it will be resolved */
- WARN_ON(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags));
+ WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags));
- mh = dlm_allocate_mhandle();
+ mh = dlm_allocate_mhandle(allocation);
if (!mh)
goto err;
@@ -1111,7 +1148,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
break;
default:
dlm_free_mhandle(mh);
- WARN_ON(1);
+ WARN_ON_ONCE(1);
goto err;
}
@@ -1130,11 +1167,32 @@ err:
}
#endif
-static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh)
+static void dlm_midcomms_commit_msg_3_2_trace(const struct dlm_mhandle *mh,
+ const void *name, int namelen)
+{
+ switch (mh->inner_p->header.h_cmd) {
+ case DLM_MSG:
+ trace_dlm_send_message(mh->node->nodeid, mh->seq,
+ &mh->inner_p->message,
+ name, namelen);
+ break;
+ case DLM_RCOM:
+ trace_dlm_send_rcom(mh->node->nodeid, mh->seq,
+ &mh->inner_p->rcom);
+ break;
+ default:
+ /* nothing to trace */
+ break;
+ }
+}
+
+static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh,
+ const void *name, int namelen)
{
/* nexthdr chain for fast lookup */
- mh->opts->o_nextcmd = mh->inner_hd->h_cmd;
+ mh->opts->o_nextcmd = mh->inner_p->header.h_cmd;
mh->committed = true;
+ dlm_midcomms_commit_msg_3_2_trace(mh, name, namelen);
dlm_lowcomms_commit_msg(mh->msg);
}
@@ -1142,8 +1200,10 @@ static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh)
* dlm_midcomms_get_mhandle
*/
#ifndef __CHECKER__
-void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh)
+void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh,
+ const void *name, int namelen)
{
+
switch (mh->node->version) {
case DLM_VERSION_3_1:
srcu_read_unlock(&nodes_srcu, mh->idx);
@@ -1154,12 +1214,12 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh)
dlm_free_mhandle(mh);
break;
case DLM_VERSION_3_2:
- dlm_midcomms_commit_msg_3_2(mh);
+ dlm_midcomms_commit_msg_3_2(mh, name, namelen);
srcu_read_unlock(&nodes_srcu, mh->idx);
break;
default:
srcu_read_unlock(&nodes_srcu, mh->idx);
- WARN_ON(1);
+ WARN_ON_ONCE(1);
break;
}
}
@@ -1167,12 +1227,27 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh)
int dlm_midcomms_start(void)
{
+ return dlm_lowcomms_start();
+}
+
+void dlm_midcomms_stop(void)
+{
+ dlm_lowcomms_stop();
+}
+
+void dlm_midcomms_init(void)
+{
int i;
for (i = 0; i < CONN_HASH_SIZE; i++)
INIT_HLIST_HEAD(&node_hash[i]);
- return dlm_lowcomms_start();
+ dlm_lowcomms_init();
+}
+
+void dlm_midcomms_exit(void)
+{
+ dlm_lowcomms_exit();
}
static void dlm_act_fin_ack_rcv(struct midcomms_node *node)
@@ -1201,7 +1276,7 @@ static void dlm_act_fin_ack_rcv(struct midcomms_node *node)
spin_unlock(&node->state_lock);
log_print("%s: unexpected state: %d\n",
__func__, node->state);
- WARN_ON(1);
+ WARN_ON_ONCE(1);
return;
}
spin_unlock(&node->state_lock);
@@ -1319,7 +1394,7 @@ static void midcomms_node_release(struct rcu_head *rcu)
{
struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu);
- WARN_ON(atomic_read(&node->send_queue_cnt));
+ WARN_ON_ONCE(atomic_read(&node->send_queue_cnt));
kfree(node);
}
@@ -1372,11 +1447,13 @@ static void midcomms_shutdown(struct midcomms_node *node)
pr_debug("active shutdown timed out for node %d with state %s\n",
node->nodeid, dlm_state_str(node->state));
midcomms_node_reset(node);
+ dlm_lowcomms_shutdown_node(node->nodeid, true);
return;
}
pr_debug("active shutdown done for node %d with state %s\n",
node->nodeid, dlm_state_str(node->state));
+ dlm_lowcomms_shutdown_node(node->nodeid, false);
}
void dlm_midcomms_shutdown(void)
@@ -1384,6 +1461,8 @@ void dlm_midcomms_shutdown(void)
struct midcomms_node *node;
int i, idx;
+ dlm_lowcomms_shutdown();
+
mutex_lock(&close_lock);
idx = srcu_read_lock(&nodes_srcu);
for (i = 0; i < CONN_HASH_SIZE; i++) {
@@ -1401,8 +1480,6 @@ void dlm_midcomms_shutdown(void)
}
srcu_read_unlock(&nodes_srcu, idx);
mutex_unlock(&close_lock);
-
- dlm_lowcomms_shutdown();
}
int dlm_midcomms_close(int nodeid)
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
index 82bcd9661922..bea1cee4279c 100644
--- a/fs/dlm/midcomms.h
+++ b/fs/dlm/midcomms.h
@@ -14,12 +14,17 @@
struct midcomms_node;
+int dlm_validate_incoming_buffer(int nodeid, unsigned char *buf, int len);
int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen);
struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
gfp_t allocation, char **ppc);
-void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh);
+void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, const void *name,
+ int namelen);
int dlm_midcomms_close(int nodeid);
int dlm_midcomms_start(void);
+void dlm_midcomms_stop(void);
+void dlm_midcomms_init(void);
+void dlm_midcomms_exit(void);
void dlm_midcomms_shutdown(void);
void dlm_midcomms_add_member(int nodeid);
void dlm_midcomms_remove_member(int nodeid);
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 67f68d48d60c..4de4b8651c6c 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -75,6 +75,7 @@ static struct genl_family family __ro_after_init = {
.version = DLM_GENL_VERSION,
.small_ops = dlm_nl_ops,
.n_small_ops = ARRAY_SIZE(dlm_nl_ops),
+ .resv_start_op = DLM_CMD_HELLO + 1,
.module = THIS_MODULE,
};
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index f19860315043..b76d52e2f6bd 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -91,7 +91,7 @@ static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type,
static void send_rcom(struct dlm_mhandle *mh, struct dlm_rcom *rc)
{
- dlm_midcomms_commit_mhandle(mh);
+ dlm_midcomms_commit_mhandle(mh, NULL, 0);
}
static void send_rcom_stateless(struct dlm_msg *msg, struct dlm_rcom *rc)
@@ -516,7 +516,7 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
rf = (struct rcom_config *) rc->rc_buf;
rf->rf_lvblen = cpu_to_le32(~0U);
- dlm_midcomms_commit_mhandle(mh);
+ dlm_midcomms_commit_mhandle(mh, NULL, 0);
return 0;
}
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index 036a9a0078f6..8be2893ad15b 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -44,7 +44,8 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
e->recover_seq = ls->ls_recover_seq & 0xFFFFFFFF;
e->nodeid = nodeid;
- memcpy(&e->request, ms, le16_to_cpu(ms->m_header.h_length));
+ memcpy(&e->request, ms, sizeof(*ms));
+ memcpy(&e->request.m_extra, ms->m_extra, length);
atomic_inc(&ls->ls_requestqueue_cnt);
mutex_lock(&ls->ls_requestqueue_mutex);
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 99e8f0744513..35129505ddda 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -16,6 +16,8 @@
#include <linux/slab.h>
#include <linux/sched/signal.h>
+#include <trace/events/dlm.h>
+
#include "dlm_internal.h"
#include "lockspace.h"
#include "lock.h"
@@ -23,6 +25,7 @@
#include "user.h"
#include "ast.h"
#include "config.h"
+#include "memory.h"
static const char name_prefix[] = "dlm";
static const struct file_operations device_fops;
@@ -173,7 +176,7 @@ static int lkb_is_endoflife(int mode, int status)
being removed and then remove that lkb from the orphans list and free it */
void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
- int status, uint32_t sbflags, uint64_t seq)
+ int status, uint32_t sbflags)
{
struct dlm_ls *ls;
struct dlm_user_args *ua;
@@ -184,7 +187,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
return;
ls = lkb->lkb_resource->res_ls;
- mutex_lock(&ls->ls_clear_proc_locks);
+ spin_lock(&ls->ls_clear_proc_locks);
/* If ORPHAN/DEAD flag is set, it means the process is dead so an ast
can't be delivered. For ORPHAN's, dlm_clear_proc_locks() freed
@@ -207,16 +210,22 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
spin_lock(&proc->asts_spin);
- rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq);
- if (rv < 0) {
+ rv = dlm_enqueue_lkb_callback(lkb, flags, mode, status, sbflags);
+ switch (rv) {
+ case DLM_ENQUEUE_CALLBACK_FAILURE:
spin_unlock(&proc->asts_spin);
+ WARN_ON_ONCE(1);
goto out;
- }
-
- if (list_empty(&lkb->lkb_cb_list)) {
+ case DLM_ENQUEUE_CALLBACK_NEED_SCHED:
kref_get(&lkb->lkb_ref);
list_add_tail(&lkb->lkb_cb_list, &proc->asts);
wake_up_interruptible(&proc->wait);
+ break;
+ case DLM_ENQUEUE_CALLBACK_SUCCESS:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
}
spin_unlock(&proc->asts_spin);
@@ -230,7 +239,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
spin_unlock(&proc->locks_spin);
}
out:
- mutex_unlock(&ls->ls_clear_proc_locks);
+ spin_unlock(&ls->ls_clear_proc_locks);
}
static int device_user_lock(struct dlm_user_proc *proc,
@@ -421,9 +430,9 @@ static int device_create_lockspace(struct dlm_lspace_params *params)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- error = dlm_new_lockspace(params->name, dlm_config.ci_cluster_name, params->flags,
- DLM_USER_LVB_LEN, NULL, NULL, NULL,
- &lockspace);
+ error = dlm_new_user_lockspace(params->name, dlm_config.ci_cluster_name,
+ params->flags, DLM_USER_LVB_LEN, NULL,
+ NULL, NULL, &lockspace);
if (error)
return error;
@@ -798,8 +807,8 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
struct dlm_user_proc *proc = file->private_data;
struct dlm_lkb *lkb;
DECLARE_WAITQUEUE(wait, current);
- struct dlm_callback cb;
- int rv, resid, copy_lvb = 0;
+ struct dlm_callback *cb;
+ int rv, copy_lvb = 0;
int old_mode, new_mode;
if (count == sizeof(struct dlm_device_version)) {
@@ -855,50 +864,58 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
without removing lkb_cb_list; so empty lkb_cb_list is always
consistent with empty lkb_callbacks */
- lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_cb_list);
+ lkb = list_first_entry(&proc->asts, struct dlm_lkb, lkb_cb_list);
/* rem_lkb_callback sets a new lkb_last_cast */
- old_mode = lkb->lkb_last_cast.mode;
+ old_mode = lkb->lkb_last_cast->mode;
- rv = dlm_rem_lkb_callback(lkb->lkb_resource->res_ls, lkb, &cb, &resid);
- if (rv < 0) {
+ rv = dlm_dequeue_lkb_callback(lkb, &cb);
+ switch (rv) {
+ case DLM_DEQUEUE_CALLBACK_EMPTY:
/* this shouldn't happen; lkb should have been removed from
- list when resid was zero */
+ * list when last item was dequeued
+ */
log_print("dlm_rem_lkb_callback empty %x", lkb->lkb_id);
list_del_init(&lkb->lkb_cb_list);
spin_unlock(&proc->asts_spin);
/* removes ref for proc->asts, may cause lkb to be freed */
dlm_put_lkb(lkb);
+ WARN_ON_ONCE(1);
goto try_another;
- }
- if (!resid)
+ case DLM_DEQUEUE_CALLBACK_LAST:
list_del_init(&lkb->lkb_cb_list);
- spin_unlock(&proc->asts_spin);
-
- if (cb.flags & DLM_CB_SKIP) {
- /* removes ref for proc->asts, may cause lkb to be freed */
- if (!resid)
- dlm_put_lkb(lkb);
- goto try_another;
+ lkb->lkb_flags &= ~DLM_IFL_CB_PENDING;
+ break;
+ case DLM_DEQUEUE_CALLBACK_SUCCESS:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
}
+ spin_unlock(&proc->asts_spin);
- if (cb.flags & DLM_CB_CAST) {
- new_mode = cb.mode;
+ if (cb->flags & DLM_CB_BAST) {
+ trace_dlm_bast(lkb->lkb_resource->res_ls, lkb, cb->mode);
+ } else if (cb->flags & DLM_CB_CAST) {
+ new_mode = cb->mode;
- if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr &&
+ if (!cb->sb_status && lkb->lkb_lksb->sb_lvbptr &&
dlm_lvb_operations[old_mode + 1][new_mode + 1])
copy_lvb = 1;
- lkb->lkb_lksb->sb_status = cb.sb_status;
- lkb->lkb_lksb->sb_flags = cb.sb_flags;
+ lkb->lkb_lksb->sb_status = cb->sb_status;
+ lkb->lkb_lksb->sb_flags = cb->sb_flags;
+ trace_dlm_ast(lkb->lkb_resource->res_ls, lkb);
}
rv = copy_result_to_user(lkb->lkb_ua,
test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
- cb.flags, cb.mode, copy_lvb, buf, count);
+ cb->flags, cb->mode, copy_lvb, buf, count);
+
+ kref_put(&cb->ref, dlm_release_callback);
/* removes ref for proc->asts, may cause lkb to be freed */
- if (!resid)
+ if (rv == DLM_DEQUEUE_CALLBACK_LAST)
dlm_put_lkb(lkb);
return rv;
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index 6b9bce6b96e0..33059452d79e 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -7,7 +7,7 @@
#define __USER_DOT_H__
void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
- int status, uint32_t sbflags, uint64_t seq);
+ int status, uint32_t sbflags);
int dlm_user_init(void);
void dlm_user_exit(void);
int dlm_device_deregister(struct dlm_ls *ls);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 5f2b49e13731..f2ed0c0266cb 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -506,7 +506,7 @@ ecryptfs_dentry_to_lower(struct dentry *dentry)
return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry;
}
-static inline struct path *
+static inline const struct path *
ecryptfs_dentry_to_lower_path(struct dentry *dentry)
{
return &((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path;
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 18d5b91cb573..268b74499c28 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -33,7 +33,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
struct iov_iter *to)
{
ssize_t rc;
- struct path *path;
+ const struct path *path;
struct file *file = iocb->ki_filp;
rc = generic_file_read_iter(iocb, to);
@@ -53,7 +53,7 @@ struct ecryptfs_getdents_callback {
};
/* Inspired by generic filldir in fs/readdir.c */
-static int
+static bool
ecryptfs_filldir(struct dir_context *ctx, const char *lower_name,
int lower_namelen, loff_t offset, u64 ino, unsigned int d_type)
{
@@ -61,18 +61,19 @@ ecryptfs_filldir(struct dir_context *ctx, const char *lower_name,
container_of(ctx, struct ecryptfs_getdents_callback, ctx);
size_t name_size;
char *name;
- int rc;
+ int err;
+ bool res;
buf->filldir_called++;
- rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size,
- buf->sb, lower_name,
- lower_namelen);
- if (rc) {
- if (rc != -EINVAL) {
+ err = ecryptfs_decode_and_decrypt_filename(&name, &name_size,
+ buf->sb, lower_name,
+ lower_namelen);
+ if (err) {
+ if (err != -EINVAL) {
ecryptfs_printk(KERN_DEBUG,
"%s: Error attempting to decode and decrypt filename [%s]; rc = [%d]\n",
- __func__, lower_name, rc);
- return rc;
+ __func__, lower_name, err);
+ return false;
}
/* Mask -EINVAL errors as these are most likely due a plaintext
@@ -81,16 +82,15 @@ ecryptfs_filldir(struct dir_context *ctx, const char *lower_name,
* the "lost+found" dentry in the root directory of an Ext4
* filesystem.
*/
- return 0;
+ return true;
}
buf->caller->pos = buf->ctx.pos;
- rc = !dir_emit(buf->caller, name, name_size, ino, d_type);
+ res = dir_emit(buf->caller, name, name_size, ino, d_type);
kfree(name);
- if (!rc)
+ if (res)
buf->entries_written++;
-
- return rc;
+ return res;
}
/**
@@ -111,14 +111,8 @@ static int ecryptfs_readdir(struct file *file, struct dir_context *ctx)
lower_file = ecryptfs_file_to_lower(file);
rc = iterate_dir(lower_file, &buf.ctx);
ctx->pos = buf.ctx.pos;
- if (rc < 0)
- goto out;
- if (buf.filldir_called && !buf.entries_written)
- goto out;
- if (rc >= 0)
- fsstack_copy_attr_atime(inode,
- file_inode(lower_file));
-out:
+ if (rc >= 0 && (buf.entries_written || !buf.filldir_called))
+ fsstack_copy_attr_atime(inode, file_inode(lower_file));
return rc;
}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 16d50dface59..f3cd00fac9c3 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -18,6 +18,8 @@
#include <linux/fs_stack.h>
#include <linux/slab.h>
#include <linux/xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
#include <linux/fileattr.h>
#include <asm/unaligned.h>
#include "ecryptfs_kernel.h"
@@ -317,7 +319,7 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry,
struct dentry *lower_dentry)
{
- struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent);
+ const struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent);
struct inode *inode, *lower_inode;
struct ecryptfs_dentry_info *dentry_info;
int rc = 0;
@@ -1120,6 +1122,28 @@ static int ecryptfs_fileattr_set(struct user_namespace *mnt_userns,
return rc;
}
+static struct posix_acl *ecryptfs_get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, int type)
+{
+ return vfs_get_acl(mnt_userns, ecryptfs_dentry_to_lower(dentry),
+ posix_acl_xattr_name(type));
+}
+
+static int ecryptfs_set_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct posix_acl *acl,
+ int type)
+{
+ int rc;
+ struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ struct inode *lower_inode = d_inode(lower_dentry);
+
+ rc = vfs_set_acl(&init_user_ns, lower_dentry,
+ posix_acl_xattr_name(type), acl);
+ if (!rc)
+ fsstack_copy_attr_all(d_inode(dentry), lower_inode);
+ return rc;
+}
+
const struct inode_operations ecryptfs_symlink_iops = {
.get_link = ecryptfs_get_link,
.permission = ecryptfs_permission,
@@ -1143,6 +1167,8 @@ const struct inode_operations ecryptfs_dir_iops = {
.listxattr = ecryptfs_listxattr,
.fileattr_get = ecryptfs_fileattr_get,
.fileattr_set = ecryptfs_fileattr_set,
+ .get_acl = ecryptfs_get_acl,
+ .set_acl = ecryptfs_set_acl,
};
const struct inode_operations ecryptfs_main_iops = {
@@ -1152,6 +1178,8 @@ const struct inode_operations ecryptfs_main_iops = {
.listxattr = ecryptfs_listxattr,
.fileattr_get = ecryptfs_fileattr_get,
.fileattr_set = ecryptfs_fileattr_set,
+ .get_acl = ecryptfs_get_acl,
+ .set_acl = ecryptfs_set_acl,
};
static int ecryptfs_xattr_get(const struct xattr_handler *handler,
@@ -1182,6 +1210,10 @@ static const struct xattr_handler ecryptfs_xattr_handler = {
};
const struct xattr_handler *ecryptfs_xattr_handlers[] = {
+#ifdef CONFIG_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
&ecryptfs_xattr_handler,
NULL
};
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 2dd23a82e0de..2dc927ba067f 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -105,7 +105,7 @@ static int ecryptfs_init_lower_file(struct dentry *dentry,
struct file **lower_file)
{
const struct cred *cred = current_cred();
- struct path *path = ecryptfs_dentry_to_lower_path(dentry);
+ const struct path *path = ecryptfs_dentry_to_lower_path(dentry);
int rc;
rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt,
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 939e5e242b98..617f3ad2485e 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -91,6 +91,10 @@ static int efivarfs_create(struct user_namespace *mnt_userns, struct inode *dir,
err = guid_parse(dentry->d_name.name + namelen + 1, &var->var.VendorGuid);
if (err)
goto out;
+ if (guid_equal(&var->var.VendorGuid, &LINUX_EFI_RANDOM_SEED_TABLE_GUID)) {
+ err = -EPERM;
+ goto out;
+ }
if (efivar_variable_is_removable(var->var.VendorGuid,
dentry->d_name.name, namelen))
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 6780fc81cc11..07e82e246666 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -116,6 +116,9 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
int err = -ENOMEM;
bool is_removable = false;
+ if (guid_equal(&vendor, &LINUX_EFI_RANDOM_SEED_TABLE_GUID))
+ return 0;
+
entry = kzalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
return err;
diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c
index a0ef63cfcecb..9e4f47808bd5 100644
--- a/fs/efivarfs/vars.c
+++ b/fs/efivarfs/vars.c
@@ -651,22 +651,6 @@ int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes,
if (err)
return err;
- /*
- * Ensure that the available space hasn't shrunk below the safe level
- */
- status = check_var_size(attributes, *size + ucs2_strsize(name, 1024));
- if (status != EFI_SUCCESS) {
- if (status != EFI_UNSUPPORTED) {
- err = efi_status_to_err(status);
- goto out;
- }
-
- if (*size > 65536) {
- err = -ENOSPC;
- goto out;
- }
- }
-
status = efivar_set_variable_locked(name, vendor, attributes, *size,
data, false);
if (status != EFI_SUCCESS) {
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index fe8ac0e163f7..f57f921683d7 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -13,9 +13,7 @@
void erofs_unmap_metabuf(struct erofs_buf *buf)
{
if (buf->kmap_type == EROFS_KMAP)
- kunmap(buf->page);
- else if (buf->kmap_type == EROFS_KMAP_ATOMIC)
- kunmap_atomic(buf->base);
+ kunmap_local(buf->base);
buf->base = NULL;
buf->kmap_type = EROFS_NO_KMAP;
}
@@ -54,9 +52,7 @@ void *erofs_bread(struct erofs_buf *buf, struct inode *inode,
}
if (buf->kmap_type == EROFS_NO_KMAP) {
if (type == EROFS_KMAP)
- buf->base = kmap(page);
- else if (type == EROFS_KMAP_ATOMIC)
- buf->base = kmap_atomic(page);
+ buf->base = kmap_local_page(page);
buf->kmap_type = type;
} else if (buf->kmap_type != type) {
DBG_BUGON(1);
@@ -403,6 +399,8 @@ const struct address_space_operations erofs_raw_access_aops = {
.readahead = erofs_readahead,
.bmap = erofs_bmap,
.direct_IO = noop_direct_IO,
+ .release_folio = iomap_release_folio,
+ .invalidate_folio = iomap_invalidate_folio,
};
#ifdef CONFIG_FS_DAX
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 2d55569f96ac..51b7ac7166d9 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -317,52 +317,61 @@ dstmap_out:
return ret;
}
-static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq,
- struct page **pagepool)
+static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
{
- const unsigned int nrpages_out =
+ const unsigned int inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
+ const unsigned int outpages =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
const unsigned int righthalf = min_t(unsigned int, rq->outputsize,
PAGE_SIZE - rq->pageofs_out);
const unsigned int lefthalf = rq->outputsize - righthalf;
+ const unsigned int interlaced_offset =
+ rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out;
unsigned char *src, *dst;
- if (nrpages_out > 2) {
+ if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) {
DBG_BUGON(1);
- return -EIO;
+ return -EFSCORRUPTED;
}
if (rq->out[0] == *rq->in) {
- DBG_BUGON(nrpages_out != 1);
+ DBG_BUGON(rq->pageofs_out);
return 0;
}
- src = kmap_atomic(*rq->in) + rq->pageofs_in;
+ src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in;
if (rq->out[0]) {
- dst = kmap_atomic(rq->out[0]);
- memcpy(dst + rq->pageofs_out, src, righthalf);
- kunmap_atomic(dst);
+ dst = kmap_local_page(rq->out[0]);
+ memcpy(dst + rq->pageofs_out, src + interlaced_offset,
+ righthalf);
+ kunmap_local(dst);
}
- if (nrpages_out == 2) {
- DBG_BUGON(!rq->out[1]);
- if (rq->out[1] == *rq->in) {
+ if (outpages > inpages) {
+ DBG_BUGON(!rq->out[outpages - 1]);
+ if (rq->out[outpages - 1] != rq->in[inpages - 1]) {
+ dst = kmap_local_page(rq->out[outpages - 1]);
+ memcpy(dst, interlaced_offset ? src :
+ (src + righthalf), lefthalf);
+ kunmap_local(dst);
+ } else if (!interlaced_offset) {
memmove(src, src + righthalf, lefthalf);
- } else {
- dst = kmap_atomic(rq->out[1]);
- memcpy(dst, src + righthalf, lefthalf);
- kunmap_atomic(dst);
}
}
- kunmap_atomic(src);
+ kunmap_local(src);
return 0;
}
static struct z_erofs_decompressor decompressors[] = {
[Z_EROFS_COMPRESSION_SHIFTED] = {
- .decompress = z_erofs_shifted_transform,
+ .decompress = z_erofs_transform_plain,
.name = "shifted"
},
+ [Z_EROFS_COMPRESSION_INTERLACED] = {
+ .decompress = z_erofs_transform_plain,
+ .name = "interlaced"
+ },
[Z_EROFS_COMPRESSION_LZ4] = {
.decompress = z_erofs_lz4_decompress,
.name = "lz4"
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index 5e59b3f523eb..091fd5adf818 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -217,6 +217,9 @@ again:
strm->buf.out_size = min_t(u32, outlen,
PAGE_SIZE - pageofs);
outlen -= strm->buf.out_size;
+ if (!rq->out[no] && rq->fillgaps) /* deduped */
+ rq->out[no] = erofs_allocpage(pagepool,
+ GFP_KERNEL | __GFP_NOFAIL);
if (rq->out[no])
strm->buf.out = kmap(rq->out[no]) + pageofs;
pageofs = 0;
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 2b48373f690b..dbcd24371002 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -25,6 +25,8 @@
#define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008
#define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 0x00000008
#define EROFS_FEATURE_INCOMPAT_ZTAILPACKING 0x00000010
+#define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020
+#define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020
#define EROFS_ALL_FEATURE_INCOMPAT \
(EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \
EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
@@ -32,7 +34,9 @@
EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \
EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \
EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \
- EROFS_FEATURE_INCOMPAT_ZTAILPACKING)
+ EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \
+ EROFS_FEATURE_INCOMPAT_FRAGMENTS | \
+ EROFS_FEATURE_INCOMPAT_DEDUPE)
#define EROFS_SB_EXTSLOT_SIZE 16
@@ -71,7 +75,9 @@ struct erofs_super_block {
} __packed u1;
__le16 extra_devices; /* # of devices besides the primary device */
__le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */
- __u8 reserved2[38];
+ __u8 reserved[6];
+ __le64 packed_nid; /* nid of the special packed inode */
+ __u8 reserved2[24];
};
/*
@@ -295,16 +301,27 @@ struct z_erofs_lzma_cfgs {
* bit 1 : HEAD1 big pcluster (0 - off; 1 - on)
* bit 2 : HEAD2 big pcluster (0 - off; 1 - on)
* bit 3 : tailpacking inline pcluster (0 - off; 1 - on)
+ * bit 4 : interlaced plain pcluster (0 - off; 1 - on)
+ * bit 5 : fragment pcluster (0 - off; 1 - on)
*/
#define Z_EROFS_ADVISE_COMPACTED_2B 0x0001
#define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002
#define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004
#define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008
+#define Z_EROFS_ADVISE_INTERLACED_PCLUSTER 0x0010
+#define Z_EROFS_ADVISE_FRAGMENT_PCLUSTER 0x0020
+#define Z_EROFS_FRAGMENT_INODE_BIT 7
struct z_erofs_map_header {
- __le16 h_reserved1;
- /* indicates the encoded size of tailpacking data */
- __le16 h_idata_size;
+ union {
+ /* fragment data offset in the packed inode */
+ __le32 h_fragmentoff;
+ struct {
+ __le16 h_reserved1;
+ /* indicates the encoded size of tailpacking data */
+ __le16 h_idata_size;
+ };
+ };
__le16 h_advise;
/*
* bit 0-3 : algorithm type of head 1 (logical cluster type 01);
@@ -313,7 +330,8 @@ struct z_erofs_map_header {
__u8 h_algorithmtype;
/*
* bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096;
- * bit 3-7 : reserved.
+ * bit 3-6 : reserved;
+ * bit 7 : move the whole file into packed inode or not.
*/
__u8 h_clusterbits;
};
@@ -355,6 +373,9 @@ enum {
#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2
#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0
+/* (noncompact only, HEAD) This pcluster refers to partial decompressed data */
+#define Z_EROFS_VLE_DI_PARTIAL_REF (1 << 15)
+
/*
* D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the
* compressed block count of a compressed extent (in logical clusters, aka.
@@ -402,6 +423,10 @@ struct erofs_dirent {
/* check the EROFS on-disk layout strictly at compile time */
static inline void erofs_check_ondisk_layout_definitions(void)
{
+ const __le64 fmh = *(__le64 *)&(struct z_erofs_map_header) {
+ .h_clusterbits = 1 << Z_EROFS_FRAGMENT_INODE_BIT
+ };
+
BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128);
BUILD_BUG_ON(sizeof(struct erofs_inode_compact) != 32);
BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64);
@@ -419,6 +444,9 @@ static inline void erofs_check_ondisk_layout_definitions(void)
BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) <
Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1);
+ /* exclude old compiler versions like gcc 7.5.0 */
+ BUILD_BUG_ON(__builtin_constant_p(fmh) ?
+ fmh != cpu_to_le64(1ULL << 63) : 0);
}
#endif
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index b5fd9d71e67f..014e20962376 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -1,429 +1,440 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2022, Alibaba Cloud
+ * Copyright (C) 2022, Bytedance Inc. All rights reserved.
*/
#include <linux/fscache.h>
#include "internal.h"
-static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space *mapping,
+static DEFINE_MUTEX(erofs_domain_list_lock);
+static DEFINE_MUTEX(erofs_domain_cookies_lock);
+static LIST_HEAD(erofs_domain_list);
+static struct vfsmount *erofs_pseudo_mnt;
+
+struct erofs_fscache_request {
+ struct erofs_fscache_request *primary;
+ struct netfs_cache_resources cache_resources;
+ struct address_space *mapping; /* The mapping being accessed */
+ loff_t start; /* Start position */
+ size_t len; /* Length of the request */
+ size_t submitted; /* Length of submitted */
+ short error; /* 0 or error that occurred */
+ refcount_t ref;
+};
+
+static struct erofs_fscache_request *erofs_fscache_req_alloc(struct address_space *mapping,
loff_t start, size_t len)
{
- struct netfs_io_request *rreq;
+ struct erofs_fscache_request *req;
- rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL);
- if (!rreq)
+ req = kzalloc(sizeof(struct erofs_fscache_request), GFP_KERNEL);
+ if (!req)
return ERR_PTR(-ENOMEM);
- rreq->start = start;
- rreq->len = len;
- rreq->mapping = mapping;
- rreq->inode = mapping->host;
- INIT_LIST_HEAD(&rreq->subrequests);
- refcount_set(&rreq->ref, 1);
- return rreq;
-}
+ req->mapping = mapping;
+ req->start = start;
+ req->len = len;
+ refcount_set(&req->ref, 1);
-static void erofs_fscache_put_request(struct netfs_io_request *rreq)
-{
- if (!refcount_dec_and_test(&rreq->ref))
- return;
- if (rreq->cache_resources.ops)
- rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
- kfree(rreq);
+ return req;
}
-static void erofs_fscache_put_subrequest(struct netfs_io_subrequest *subreq)
+static struct erofs_fscache_request *erofs_fscache_req_chain(struct erofs_fscache_request *primary,
+ size_t len)
{
- if (!refcount_dec_and_test(&subreq->ref))
- return;
- erofs_fscache_put_request(subreq->rreq);
- kfree(subreq);
-}
+ struct erofs_fscache_request *req;
-static void erofs_fscache_clear_subrequests(struct netfs_io_request *rreq)
-{
- struct netfs_io_subrequest *subreq;
+ /* use primary request for the first submission */
+ if (!primary->submitted) {
+ refcount_inc(&primary->ref);
+ return primary;
+ }
- while (!list_empty(&rreq->subrequests)) {
- subreq = list_first_entry(&rreq->subrequests,
- struct netfs_io_subrequest, rreq_link);
- list_del(&subreq->rreq_link);
- erofs_fscache_put_subrequest(subreq);
+ req = erofs_fscache_req_alloc(primary->mapping,
+ primary->start + primary->submitted, len);
+ if (!IS_ERR(req)) {
+ req->primary = primary;
+ refcount_inc(&primary->ref);
}
+ return req;
}
-static void erofs_fscache_rreq_unlock_folios(struct netfs_io_request *rreq)
+static void erofs_fscache_req_complete(struct erofs_fscache_request *req)
{
- struct netfs_io_subrequest *subreq;
struct folio *folio;
- unsigned int iopos = 0;
- pgoff_t start_page = rreq->start / PAGE_SIZE;
- pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
- bool subreq_failed = false;
-
- XA_STATE(xas, &rreq->mapping->i_pages, start_page);
+ bool failed = req->error;
+ pgoff_t start_page = req->start / PAGE_SIZE;
+ pgoff_t last_page = ((req->start + req->len) / PAGE_SIZE) - 1;
- subreq = list_first_entry(&rreq->subrequests,
- struct netfs_io_subrequest, rreq_link);
- subreq_failed = (subreq->error < 0);
+ XA_STATE(xas, &req->mapping->i_pages, start_page);
rcu_read_lock();
xas_for_each(&xas, folio, last_page) {
- unsigned int pgpos =
- (folio_index(folio) - start_page) * PAGE_SIZE;
- unsigned int pgend = pgpos + folio_size(folio);
- bool pg_failed = false;
-
- for (;;) {
- if (!subreq) {
- pg_failed = true;
- break;
- }
-
- pg_failed |= subreq_failed;
- if (pgend < iopos + subreq->len)
- break;
-
- iopos += subreq->len;
- if (!list_is_last(&subreq->rreq_link,
- &rreq->subrequests)) {
- subreq = list_next_entry(subreq, rreq_link);
- subreq_failed = (subreq->error < 0);
- } else {
- subreq = NULL;
- subreq_failed = false;
- }
- if (pgend == iopos)
- break;
- }
-
- if (!pg_failed)
+ if (xas_retry(&xas, folio))
+ continue;
+ if (!failed)
folio_mark_uptodate(folio);
-
folio_unlock(folio);
}
rcu_read_unlock();
}
-static void erofs_fscache_rreq_complete(struct netfs_io_request *rreq)
+static void erofs_fscache_req_put(struct erofs_fscache_request *req)
{
- erofs_fscache_rreq_unlock_folios(rreq);
- erofs_fscache_clear_subrequests(rreq);
- erofs_fscache_put_request(rreq);
+ if (refcount_dec_and_test(&req->ref)) {
+ if (req->cache_resources.ops)
+ req->cache_resources.ops->end_operation(&req->cache_resources);
+ if (!req->primary)
+ erofs_fscache_req_complete(req);
+ else
+ erofs_fscache_req_put(req->primary);
+ kfree(req);
+ }
}
-static void erofc_fscache_subreq_complete(void *priv,
+static void erofs_fscache_subreq_complete(void *priv,
ssize_t transferred_or_error, bool was_async)
{
- struct netfs_io_subrequest *subreq = priv;
- struct netfs_io_request *rreq = subreq->rreq;
-
- if (IS_ERR_VALUE(transferred_or_error))
- subreq->error = transferred_or_error;
-
- if (atomic_dec_and_test(&rreq->nr_outstanding))
- erofs_fscache_rreq_complete(rreq);
+ struct erofs_fscache_request *req = priv;
- erofs_fscache_put_subrequest(subreq);
+ if (IS_ERR_VALUE(transferred_or_error)) {
+ if (req->primary)
+ req->primary->error = transferred_or_error;
+ else
+ req->error = transferred_or_error;
+ }
+ erofs_fscache_req_put(req);
}
/*
- * Read data from fscache and fill the read data into page cache described by
- * @rreq, which shall be both aligned with PAGE_SIZE. @pstart describes
- * the start physical address in the cache file.
+ * Read data from fscache (cookie, pstart, len), and fill the read data into
+ * page cache described by (req->mapping, lstart, len). @pstart describeis the
+ * start physical address in the cache file.
*/
static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie,
- struct netfs_io_request *rreq, loff_t pstart)
+ struct erofs_fscache_request *req, loff_t pstart, size_t len)
{
enum netfs_io_source source;
- struct super_block *sb = rreq->mapping->host->i_sb;
- struct netfs_io_subrequest *subreq;
- struct netfs_cache_resources *cres = &rreq->cache_resources;
+ struct super_block *sb = req->mapping->host->i_sb;
+ struct netfs_cache_resources *cres = &req->cache_resources;
struct iov_iter iter;
- loff_t start = rreq->start;
- size_t len = rreq->len;
+ loff_t lstart = req->start + req->submitted;
size_t done = 0;
int ret;
- atomic_set(&rreq->nr_outstanding, 1);
+ DBG_BUGON(len > req->len - req->submitted);
ret = fscache_begin_read_operation(cres, cookie);
if (ret)
- goto out;
+ return ret;
while (done < len) {
- subreq = kzalloc(sizeof(struct netfs_io_subrequest),
- GFP_KERNEL);
- if (subreq) {
- INIT_LIST_HEAD(&subreq->rreq_link);
- refcount_set(&subreq->ref, 2);
- subreq->rreq = rreq;
- refcount_inc(&rreq->ref);
- } else {
- ret = -ENOMEM;
- goto out;
- }
+ loff_t sstart = pstart + done;
+ size_t slen = len - done;
+ unsigned long flags = 1 << NETFS_SREQ_ONDEMAND;
- subreq->start = pstart + done;
- subreq->len = len - done;
- subreq->flags = 1 << NETFS_SREQ_ONDEMAND;
-
- list_add_tail(&subreq->rreq_link, &rreq->subrequests);
-
- source = cres->ops->prepare_read(subreq, LLONG_MAX);
- if (WARN_ON(subreq->len == 0))
+ source = cres->ops->prepare_ondemand_read(cres,
+ sstart, &slen, LLONG_MAX, &flags, 0);
+ if (WARN_ON(slen == 0))
source = NETFS_INVALID_READ;
if (source != NETFS_READ_FROM_CACHE) {
- erofs_err(sb, "failed to fscache prepare_read (source %d)",
- source);
- ret = -EIO;
- subreq->error = ret;
- erofs_fscache_put_subrequest(subreq);
- goto out;
+ erofs_err(sb, "failed to fscache prepare_read (source %d)", source);
+ return -EIO;
}
- atomic_inc(&rreq->nr_outstanding);
+ refcount_inc(&req->ref);
+ iov_iter_xarray(&iter, ITER_DEST, &req->mapping->i_pages,
+ lstart + done, slen);
- iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages,
- start + done, subreq->len);
-
- ret = fscache_read(cres, subreq->start, &iter,
- NETFS_READ_HOLE_FAIL,
- erofc_fscache_subreq_complete, subreq);
+ ret = fscache_read(cres, sstart, &iter, NETFS_READ_HOLE_FAIL,
+ erofs_fscache_subreq_complete, req);
if (ret == -EIOCBQUEUED)
ret = 0;
if (ret) {
erofs_err(sb, "failed to fscache_read (ret %d)", ret);
- goto out;
+ return ret;
}
- done += subreq->len;
+ done += slen;
}
-out:
- if (atomic_dec_and_test(&rreq->nr_outstanding))
- erofs_fscache_rreq_complete(rreq);
-
- return ret;
+ DBG_BUGON(done != len);
+ return 0;
}
static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
{
int ret;
struct super_block *sb = folio_mapping(folio)->host->i_sb;
- struct netfs_io_request *rreq;
+ struct erofs_fscache_request *req;
struct erofs_map_dev mdev = {
.m_deviceid = 0,
.m_pa = folio_pos(folio),
};
ret = erofs_map_dev(sb, &mdev);
- if (ret)
- goto out;
+ if (ret) {
+ folio_unlock(folio);
+ return ret;
+ }
- rreq = erofs_fscache_alloc_request(folio_mapping(folio),
+ req = erofs_fscache_req_alloc(folio_mapping(folio),
folio_pos(folio), folio_size(folio));
- if (IS_ERR(rreq)) {
- ret = PTR_ERR(rreq);
- goto out;
+ if (IS_ERR(req)) {
+ folio_unlock(folio);
+ return PTR_ERR(req);
}
- return erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
- rreq, mdev.m_pa);
-out:
- folio_unlock(folio);
+ ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
+ req, mdev.m_pa, folio_size(folio));
+ if (ret)
+ req->error = ret;
+
+ erofs_fscache_req_put(req);
return ret;
}
-static int erofs_fscache_read_folio_inline(struct folio *folio,
- struct erofs_map_blocks *map)
+static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary)
{
- struct super_block *sb = folio_mapping(folio)->host->i_sb;
- struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- erofs_blk_t blknr;
- size_t offset, len;
- void *src, *dst;
-
- /* For tail packing layout, the offset may be non-zero. */
- offset = erofs_blkoff(map->m_pa);
- blknr = erofs_blknr(map->m_pa);
- len = map->m_llen;
-
- src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP);
- if (IS_ERR(src))
- return PTR_ERR(src);
-
- dst = kmap_local_folio(folio, 0);
- memcpy(dst, src + offset, len);
- memset(dst + len, 0, PAGE_SIZE - len);
- kunmap_local(dst);
-
- erofs_put_metabuf(&buf);
- return 0;
-}
-
-static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
-{
- struct inode *inode = folio_mapping(folio)->host;
+ struct address_space *mapping = primary->mapping;
+ struct inode *inode = mapping->host;
struct super_block *sb = inode->i_sb;
+ struct erofs_fscache_request *req;
struct erofs_map_blocks map;
struct erofs_map_dev mdev;
- struct netfs_io_request *rreq;
- erofs_off_t pos;
- loff_t pstart;
+ struct iov_iter iter;
+ loff_t pos = primary->start + primary->submitted;
+ size_t count;
int ret;
- DBG_BUGON(folio_size(folio) != EROFS_BLKSIZ);
-
- pos = folio_pos(folio);
map.m_la = pos;
-
ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
if (ret)
- goto out_unlock;
+ return ret;
- if (!(map.m_flags & EROFS_MAP_MAPPED)) {
- folio_zero_range(folio, 0, folio_size(folio));
- goto out_uptodate;
+ if (map.m_flags & EROFS_MAP_META) {
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ erofs_blk_t blknr;
+ size_t offset, size;
+ void *src;
+
+ /* For tail packing layout, the offset may be non-zero. */
+ offset = erofs_blkoff(map.m_pa);
+ blknr = erofs_blknr(map.m_pa);
+ size = map.m_llen;
+
+ src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP);
+ if (IS_ERR(src))
+ return PTR_ERR(src);
+
+ iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, PAGE_SIZE);
+ if (copy_to_iter(src + offset, size, &iter) != size) {
+ erofs_put_metabuf(&buf);
+ return -EFAULT;
+ }
+ iov_iter_zero(PAGE_SIZE - size, &iter);
+ erofs_put_metabuf(&buf);
+ primary->submitted += PAGE_SIZE;
+ return 0;
}
- if (map.m_flags & EROFS_MAP_META) {
- ret = erofs_fscache_read_folio_inline(folio, &map);
- goto out_uptodate;
+ count = primary->len - primary->submitted;
+ if (!(map.m_flags & EROFS_MAP_MAPPED)) {
+ iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, count);
+ iov_iter_zero(count, &iter);
+ primary->submitted += count;
+ return 0;
}
+ count = min_t(size_t, map.m_llen - (pos - map.m_la), count);
+ DBG_BUGON(!count || count % PAGE_SIZE);
+
mdev = (struct erofs_map_dev) {
.m_deviceid = map.m_deviceid,
.m_pa = map.m_pa,
};
-
ret = erofs_map_dev(sb, &mdev);
if (ret)
- goto out_unlock;
+ return ret;
+ req = erofs_fscache_req_chain(primary, count);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
- rreq = erofs_fscache_alloc_request(folio_mapping(folio),
- folio_pos(folio), folio_size(folio));
- if (IS_ERR(rreq)) {
- ret = PTR_ERR(rreq);
- goto out_unlock;
- }
+ ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
+ req, mdev.m_pa + (pos - map.m_la), count);
+ erofs_fscache_req_put(req);
+ primary->submitted += count;
+ return ret;
+}
+
+static int erofs_fscache_data_read(struct erofs_fscache_request *req)
+{
+ int ret;
- pstart = mdev.m_pa + (pos - map.m_la);
- return erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
- rreq, pstart);
+ do {
+ ret = erofs_fscache_data_read_slice(req);
+ if (ret)
+ req->error = ret;
+ } while (!ret && req->submitted < req->len);
-out_uptodate:
- if (!ret)
- folio_mark_uptodate(folio);
-out_unlock:
- folio_unlock(folio);
return ret;
}
-static void erofs_fscache_advance_folios(struct readahead_control *rac,
- size_t len, bool unlock)
+static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
{
- while (len) {
- struct folio *folio = readahead_folio(rac);
- len -= folio_size(folio);
- if (unlock) {
- folio_mark_uptodate(folio);
- folio_unlock(folio);
- }
+ struct erofs_fscache_request *req;
+ int ret;
+
+ req = erofs_fscache_req_alloc(folio_mapping(folio),
+ folio_pos(folio), folio_size(folio));
+ if (IS_ERR(req)) {
+ folio_unlock(folio);
+ return PTR_ERR(req);
}
+
+ ret = erofs_fscache_data_read(req);
+ erofs_fscache_req_put(req);
+ return ret;
}
static void erofs_fscache_readahead(struct readahead_control *rac)
{
- struct inode *inode = rac->mapping->host;
- struct super_block *sb = inode->i_sb;
- size_t len, count, done = 0;
- erofs_off_t pos;
- loff_t start, offset;
- int ret;
+ struct erofs_fscache_request *req;
if (!readahead_count(rac))
return;
- start = readahead_pos(rac);
- len = readahead_length(rac);
+ req = erofs_fscache_req_alloc(rac->mapping,
+ readahead_pos(rac), readahead_length(rac));
+ if (IS_ERR(req))
+ return;
- do {
- struct erofs_map_blocks map;
- struct erofs_map_dev mdev;
- struct netfs_io_request *rreq;
+ /* The request completion will drop refs on the folios. */
+ while (readahead_folio(rac))
+ ;
- pos = start + done;
- map.m_la = pos;
+ erofs_fscache_data_read(req);
+ erofs_fscache_req_put(req);
+}
- ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
- if (ret)
- return;
+static const struct address_space_operations erofs_fscache_meta_aops = {
+ .read_folio = erofs_fscache_meta_read_folio,
+};
- offset = start + done;
- count = min_t(size_t, map.m_llen - (pos - map.m_la),
- len - done);
+const struct address_space_operations erofs_fscache_access_aops = {
+ .read_folio = erofs_fscache_read_folio,
+ .readahead = erofs_fscache_readahead,
+};
- if (!(map.m_flags & EROFS_MAP_MAPPED)) {
- struct iov_iter iter;
+static void erofs_fscache_domain_put(struct erofs_domain *domain)
+{
+ if (!domain)
+ return;
+ mutex_lock(&erofs_domain_list_lock);
+ if (refcount_dec_and_test(&domain->ref)) {
+ list_del(&domain->list);
+ if (list_empty(&erofs_domain_list)) {
+ kern_unmount(erofs_pseudo_mnt);
+ erofs_pseudo_mnt = NULL;
+ }
+ mutex_unlock(&erofs_domain_list_lock);
+ fscache_relinquish_volume(domain->volume, NULL, false);
+ kfree(domain->domain_id);
+ kfree(domain);
+ return;
+ }
+ mutex_unlock(&erofs_domain_list_lock);
+}
- iov_iter_xarray(&iter, READ, &rac->mapping->i_pages,
- offset, count);
- iov_iter_zero(count, &iter);
+static int erofs_fscache_register_volume(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ char *domain_id = sbi->domain_id;
+ struct fscache_volume *volume;
+ char *name;
+ int ret = 0;
- erofs_fscache_advance_folios(rac, count, true);
- ret = count;
- continue;
- }
+ name = kasprintf(GFP_KERNEL, "erofs,%s",
+ domain_id ? domain_id : sbi->fsid);
+ if (!name)
+ return -ENOMEM;
- if (map.m_flags & EROFS_MAP_META) {
- struct folio *folio = readahead_folio(rac);
+ volume = fscache_acquire_volume(name, NULL, NULL, 0);
+ if (IS_ERR_OR_NULL(volume)) {
+ erofs_err(sb, "failed to register volume for %s", name);
+ ret = volume ? PTR_ERR(volume) : -EOPNOTSUPP;
+ volume = NULL;
+ }
- ret = erofs_fscache_read_folio_inline(folio, &map);
- if (!ret) {
- folio_mark_uptodate(folio);
- ret = folio_size(folio);
- }
+ sbi->volume = volume;
+ kfree(name);
+ return ret;
+}
- folio_unlock(folio);
- continue;
- }
+static int erofs_fscache_init_domain(struct super_block *sb)
+{
+ int err;
+ struct erofs_domain *domain;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
- mdev = (struct erofs_map_dev) {
- .m_deviceid = map.m_deviceid,
- .m_pa = map.m_pa,
- };
- ret = erofs_map_dev(sb, &mdev);
- if (ret)
- return;
+ domain = kzalloc(sizeof(struct erofs_domain), GFP_KERNEL);
+ if (!domain)
+ return -ENOMEM;
- rreq = erofs_fscache_alloc_request(rac->mapping, offset, count);
- if (IS_ERR(rreq))
- return;
- /*
- * Drop the ref of folios here. Unlock them in
- * rreq_unlock_folios() when rreq complete.
- */
- erofs_fscache_advance_folios(rac, count, false);
- ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
- rreq, mdev.m_pa + (pos - map.m_la));
- if (!ret)
- ret = count;
- } while (ret > 0 && ((done += ret) < len));
+ domain->domain_id = kstrdup(sbi->domain_id, GFP_KERNEL);
+ if (!domain->domain_id) {
+ kfree(domain);
+ return -ENOMEM;
+ }
+
+ err = erofs_fscache_register_volume(sb);
+ if (err)
+ goto out;
+
+ if (!erofs_pseudo_mnt) {
+ erofs_pseudo_mnt = kern_mount(&erofs_fs_type);
+ if (IS_ERR(erofs_pseudo_mnt)) {
+ err = PTR_ERR(erofs_pseudo_mnt);
+ goto out;
+ }
+ }
+
+ domain->volume = sbi->volume;
+ refcount_set(&domain->ref, 1);
+ list_add(&domain->list, &erofs_domain_list);
+ sbi->domain = domain;
+ return 0;
+out:
+ kfree(domain->domain_id);
+ kfree(domain);
+ return err;
}
-static const struct address_space_operations erofs_fscache_meta_aops = {
- .read_folio = erofs_fscache_meta_read_folio,
-};
+static int erofs_fscache_register_domain(struct super_block *sb)
+{
+ int err;
+ struct erofs_domain *domain;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
-const struct address_space_operations erofs_fscache_access_aops = {
- .read_folio = erofs_fscache_read_folio,
- .readahead = erofs_fscache_readahead,
-};
+ mutex_lock(&erofs_domain_list_lock);
+ list_for_each_entry(domain, &erofs_domain_list, list) {
+ if (!strcmp(domain->domain_id, sbi->domain_id)) {
+ sbi->domain = domain;
+ sbi->volume = domain->volume;
+ refcount_inc(&domain->ref);
+ mutex_unlock(&erofs_domain_list_lock);
+ return 0;
+ }
+ }
+ err = erofs_fscache_init_domain(sb);
+ mutex_unlock(&erofs_domain_list_lock);
+ return err;
+}
-int erofs_fscache_register_cookie(struct super_block *sb,
- struct erofs_fscache **fscache,
- char *name, bool need_inode)
+static
+struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb,
+ char *name,
+ unsigned int flags)
{
struct fscache_volume *volume = EROFS_SB(sb)->volume;
struct erofs_fscache *ctx;
@@ -432,7 +443,7 @@ int erofs_fscache_register_cookie(struct super_block *sb,
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
cookie = fscache_acquire_cookie(volume, FSCACHE_ADV_WANT_CACHE_SIZE,
name, strlen(name), NULL, 0, 0);
@@ -445,7 +456,7 @@ int erofs_fscache_register_cookie(struct super_block *sb,
fscache_use_cookie(cookie, false);
ctx->cookie = cookie;
- if (need_inode) {
+ if (flags & EROFS_REG_COOKIE_NEED_INODE) {
struct inode *const inode = new_inode(sb);
if (!inode) {
@@ -462,63 +473,171 @@ int erofs_fscache_register_cookie(struct super_block *sb,
ctx->inode = inode;
}
- *fscache = ctx;
- return 0;
+ return ctx;
err_cookie:
fscache_unuse_cookie(ctx->cookie, NULL, NULL);
fscache_relinquish_cookie(ctx->cookie, false);
- ctx->cookie = NULL;
err:
kfree(ctx);
- return ret;
+ return ERR_PTR(ret);
}
-void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache)
+static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx)
{
- struct erofs_fscache *ctx = *fscache;
-
- if (!ctx)
- return;
-
fscache_unuse_cookie(ctx->cookie, NULL, NULL);
fscache_relinquish_cookie(ctx->cookie, false);
- ctx->cookie = NULL;
-
iput(ctx->inode);
- ctx->inode = NULL;
-
+ kfree(ctx->name);
kfree(ctx);
- *fscache = NULL;
}
-int erofs_fscache_register_fs(struct super_block *sb)
+static
+struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb,
+ char *name,
+ unsigned int flags)
{
- struct erofs_sb_info *sbi = EROFS_SB(sb);
- struct fscache_volume *volume;
- char *name;
- int ret = 0;
+ int err;
+ struct inode *inode;
+ struct erofs_fscache *ctx;
+ struct erofs_domain *domain = EROFS_SB(sb)->domain;
- name = kasprintf(GFP_KERNEL, "erofs,%s", sbi->opt.fsid);
- if (!name)
- return -ENOMEM;
+ ctx = erofs_fscache_acquire_cookie(sb, name, flags);
+ if (IS_ERR(ctx))
+ return ctx;
- volume = fscache_acquire_volume(name, NULL, NULL, 0);
- if (IS_ERR_OR_NULL(volume)) {
- erofs_err(sb, "failed to register volume for %s", name);
- ret = volume ? PTR_ERR(volume) : -EOPNOTSUPP;
- volume = NULL;
+ ctx->name = kstrdup(name, GFP_KERNEL);
+ if (!ctx->name) {
+ err = -ENOMEM;
+ goto out;
}
- sbi->volume = volume;
- kfree(name);
- return ret;
+ inode = new_inode(erofs_pseudo_mnt->mnt_sb);
+ if (!inode) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ ctx->domain = domain;
+ ctx->anon_inode = inode;
+ inode->i_private = ctx;
+ refcount_inc(&domain->ref);
+ return ctx;
+out:
+ erofs_fscache_relinquish_cookie(ctx);
+ return ERR_PTR(err);
+}
+
+static
+struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb,
+ char *name,
+ unsigned int flags)
+{
+ struct inode *inode;
+ struct erofs_fscache *ctx;
+ struct erofs_domain *domain = EROFS_SB(sb)->domain;
+ struct super_block *psb = erofs_pseudo_mnt->mnt_sb;
+
+ mutex_lock(&erofs_domain_cookies_lock);
+ spin_lock(&psb->s_inode_list_lock);
+ list_for_each_entry(inode, &psb->s_inodes, i_sb_list) {
+ ctx = inode->i_private;
+ if (!ctx || ctx->domain != domain || strcmp(ctx->name, name))
+ continue;
+ if (!(flags & EROFS_REG_COOKIE_NEED_NOEXIST)) {
+ igrab(inode);
+ } else {
+ erofs_err(sb, "%s already exists in domain %s", name,
+ domain->domain_id);
+ ctx = ERR_PTR(-EEXIST);
+ }
+ spin_unlock(&psb->s_inode_list_lock);
+ mutex_unlock(&erofs_domain_cookies_lock);
+ return ctx;
+ }
+ spin_unlock(&psb->s_inode_list_lock);
+ ctx = erofs_fscache_domain_init_cookie(sb, name, flags);
+ mutex_unlock(&erofs_domain_cookies_lock);
+ return ctx;
+}
+
+struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
+ char *name,
+ unsigned int flags)
+{
+ if (EROFS_SB(sb)->domain_id)
+ return erofs_domain_register_cookie(sb, name, flags);
+ return erofs_fscache_acquire_cookie(sb, name, flags);
+}
+
+void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx)
+{
+ bool drop;
+ struct erofs_domain *domain;
+
+ if (!ctx)
+ return;
+ domain = ctx->domain;
+ if (domain) {
+ mutex_lock(&erofs_domain_cookies_lock);
+ drop = atomic_read(&ctx->anon_inode->i_count) == 1;
+ iput(ctx->anon_inode);
+ mutex_unlock(&erofs_domain_cookies_lock);
+ if (!drop)
+ return;
+ }
+
+ erofs_fscache_relinquish_cookie(ctx);
+ erofs_fscache_domain_put(domain);
+}
+
+int erofs_fscache_register_fs(struct super_block *sb)
+{
+ int ret;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_fscache *fscache;
+ unsigned int flags;
+
+ if (sbi->domain_id)
+ ret = erofs_fscache_register_domain(sb);
+ else
+ ret = erofs_fscache_register_volume(sb);
+ if (ret)
+ return ret;
+
+ /*
+ * When shared domain is enabled, using NEED_NOEXIST to guarantee
+ * the primary data blob (aka fsid) is unique in the shared domain.
+ *
+ * For non-shared-domain case, fscache_acquire_volume() invoked by
+ * erofs_fscache_register_volume() has already guaranteed
+ * the uniqueness of primary data blob.
+ *
+ * Acquired domain/volume will be relinquished in kill_sb() on error.
+ */
+ flags = EROFS_REG_COOKIE_NEED_INODE;
+ if (sbi->domain_id)
+ flags |= EROFS_REG_COOKIE_NEED_NOEXIST;
+ fscache = erofs_fscache_register_cookie(sb, sbi->fsid, flags);
+ if (IS_ERR(fscache))
+ return PTR_ERR(fscache);
+
+ sbi->s_fscache = fscache;
+ return 0;
}
void erofs_fscache_unregister_fs(struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
- fscache_relinquish_volume(sbi->volume, NULL, false);
+ erofs_fscache_unregister_cookie(sbi->s_fscache);
+
+ if (sbi->domain)
+ erofs_fscache_domain_put(sbi->domain);
+ else
+ fscache_relinquish_volume(sbi->volume, NULL, false);
+
+ sbi->s_fscache = NULL;
sbi->volume = NULL;
+ sbi->domain = NULL;
}
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 95a403720e8c..d3b8736fa124 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -214,7 +214,7 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr,
/* if it cannot be handled with fast symlink scheme */
if (vi->datalayout != EROFS_INODE_FLAT_INLINE ||
- inode->i_size >= EROFS_BLKSIZ) {
+ inode->i_size >= EROFS_BLKSIZ || inode->i_size < 0) {
inode->i_op = &erofs_symlink_iops;
return 0;
}
@@ -241,7 +241,7 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr,
return 0;
}
-static int erofs_fill_inode(struct inode *inode, int isdir)
+static int erofs_fill_inode(struct inode *inode)
{
struct erofs_inode *vi = EROFS_I(inode);
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
@@ -249,7 +249,7 @@ static int erofs_fill_inode(struct inode *inode, int isdir)
unsigned int ofs;
int err = 0;
- trace_erofs_fill_inode(inode, isdir);
+ trace_erofs_fill_inode(inode);
/* read inode base data from disk */
kaddr = erofs_read_inode(&buf, inode, &ofs);
@@ -268,6 +268,7 @@ static int erofs_fill_inode(struct inode *inode, int isdir)
case S_IFDIR:
inode->i_op = &erofs_dir_iops;
inode->i_fop = &erofs_dir_fops;
+ inode_nohighmem(inode);
break;
case S_IFLNK:
err = erofs_fill_symlink(inode, kaddr, ofs);
@@ -295,6 +296,7 @@ static int erofs_fill_inode(struct inode *inode, int isdir)
goto out_unlock;
}
inode->i_mapping->a_ops = &erofs_raw_access_aops;
+ mapping_set_large_folios(inode->i_mapping);
#ifdef CONFIG_EROFS_FS_ONDEMAND
if (erofs_is_fscache_mode(inode->i_sb))
inode->i_mapping->a_ops = &erofs_fscache_access_aops;
@@ -324,21 +326,13 @@ static int erofs_iget_set_actor(struct inode *inode, void *opaque)
return 0;
}
-static inline struct inode *erofs_iget_locked(struct super_block *sb,
- erofs_nid_t nid)
+struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid)
{
const unsigned long hashval = erofs_inode_hash(nid);
+ struct inode *inode;
- return iget5_locked(sb, hashval, erofs_ilookup_test_actor,
+ inode = iget5_locked(sb, hashval, erofs_ilookup_test_actor,
erofs_iget_set_actor, &nid);
-}
-
-struct inode *erofs_iget(struct super_block *sb,
- erofs_nid_t nid,
- bool isdir)
-{
- struct inode *inode = erofs_iget_locked(sb, nid);
-
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -348,10 +342,10 @@ struct inode *erofs_iget(struct super_block *sb,
vi->nid = nid;
- err = erofs_fill_inode(inode, isdir);
- if (!err)
+ err = erofs_fill_inode(inode);
+ if (!err) {
unlock_new_inode(inode);
- else {
+ } else {
iget_failed(inode);
inode = ERR_PTR(err);
}
@@ -379,7 +373,7 @@ int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path,
const struct inode_operations erofs_generic_iops = {
.getattr = erofs_getattr,
.listxattr = erofs_listxattr,
- .get_acl = erofs_get_acl,
+ .get_inode_acl = erofs_get_acl,
.fiemap = erofs_fiemap,
};
@@ -387,12 +381,12 @@ const struct inode_operations erofs_symlink_iops = {
.get_link = page_get_link,
.getattr = erofs_getattr,
.listxattr = erofs_listxattr,
- .get_acl = erofs_get_acl,
+ .get_inode_acl = erofs_get_acl,
};
const struct inode_operations erofs_fast_symlink_iops = {
.get_link = simple_get_link,
.getattr = erofs_getattr,
.listxattr = erofs_listxattr,
- .get_acl = erofs_get_acl,
+ .get_inode_acl = erofs_get_acl,
};
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index a01cc82795a2..bb8501c0ff5b 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -75,7 +75,6 @@ struct erofs_mount_opts {
unsigned int max_sync_decompress_pages;
#endif
unsigned int mount_opt;
- char *fsid;
};
struct erofs_dev_context {
@@ -88,6 +87,8 @@ struct erofs_dev_context {
struct erofs_fs_context {
struct erofs_mount_opts opt;
struct erofs_dev_context *devs;
+ char *fsid;
+ char *domain_id;
};
/* all filesystem-wide lz4 configurations */
@@ -98,9 +99,19 @@ struct erofs_sb_lz4_info {
u16 max_pclusterblks;
};
+struct erofs_domain {
+ refcount_t ref;
+ struct list_head list;
+ struct fscache_volume *volume;
+ char *domain_id;
+};
+
struct erofs_fscache {
struct fscache_cookie *cookie;
struct inode *inode;
+ struct inode *anon_inode;
+ struct erofs_domain *domain;
+ char *name;
};
struct erofs_sb_info {
@@ -120,6 +131,7 @@ struct erofs_sb_info {
struct inode *managed_cache;
struct erofs_sb_lz4_info lz4;
+ struct inode *packed_inode;
#endif /* CONFIG_EROFS_FS_ZIP */
struct erofs_dev_context *devs;
struct dax_device *dax_dev;
@@ -157,6 +169,9 @@ struct erofs_sb_info {
/* fscache support */
struct fscache_volume *volume;
struct erofs_fscache *s_fscache;
+ struct erofs_domain *domain;
+ char *fsid;
+ char *domain_id;
};
#define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info)
@@ -183,7 +198,6 @@ enum {
EROFS_ZIP_CACHE_READAROUND
};
-#ifdef CONFIG_EROFS_FS_ZIP
#define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL)
/* basic unit of the workstation of a super_block */
@@ -223,7 +237,6 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
return atomic_cond_read_relaxed(&grp->refcount,
VAL != EROFS_LOCKED_MAGIC);
}
-#endif /* !CONFIG_EROFS_FS_ZIP */
/* we strictly follow PAGE_SIZE and no buffer head yet */
#define LOG_BLOCK_SIZE PAGE_SHIFT
@@ -242,8 +255,7 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
enum erofs_kmap_type {
EROFS_NO_KMAP, /* don't map the buffer */
- EROFS_KMAP, /* use kmap() to map the buffer */
- EROFS_KMAP_ATOMIC, /* use kmap_atomic() to map the buffer */
+ EROFS_KMAP, /* use kmap_local_page() to map the buffer */
};
struct erofs_buf {
@@ -277,6 +289,8 @@ EROFS_FEATURE_FUNCS(chunked_file, incompat, INCOMPAT_CHUNKED_FILE)
EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE)
EROFS_FEATURE_FUNCS(compr_head2, incompat, INCOMPAT_COMPR_HEAD2)
EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING)
+EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS)
+EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE)
EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
/* atomic flag definitions */
@@ -312,8 +326,13 @@ struct erofs_inode {
unsigned char z_algorithmtype[2];
unsigned char z_logical_clusterbits;
unsigned long z_tailextent_headlcn;
- erofs_off_t z_idataoff;
- unsigned short z_idata_size;
+ union {
+ struct {
+ erofs_off_t z_idataoff;
+ unsigned short z_idata_size;
+ };
+ erofs_off_t z_fragmentoff;
+ };
};
#endif /* CONFIG_EROFS_FS_ZIP */
};
@@ -364,6 +383,7 @@ struct page *erofs_grab_cache_page_nowait(struct address_space *mapping,
}
extern const struct super_operations erofs_sops;
+extern struct file_system_type erofs_fs_type;
extern const struct address_space_operations erofs_raw_access_aops;
extern const struct address_space_operations z_erofs_aops;
@@ -371,6 +391,8 @@ extern const struct address_space_operations z_erofs_aops;
enum {
BH_Encoded = BH_PrivateStart,
BH_FullMapped,
+ BH_Fragment,
+ BH_Partialref,
};
/* Has a disk mapping */
@@ -381,6 +403,10 @@ enum {
#define EROFS_MAP_ENCODED (1 << BH_Encoded)
/* The length of extent is full */
#define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped)
+/* Located in the special packed inode */
+#define EROFS_MAP_FRAGMENT (1 << BH_Fragment)
+/* The extent refers to partial decompressed data */
+#define EROFS_MAP_PARTIAL_REF (1 << BH_Partialref)
struct erofs_map_blocks {
struct erofs_buf buf;
@@ -402,11 +428,12 @@ struct erofs_map_blocks {
#define EROFS_GET_BLOCKS_FIEMAP 0x0002
/* Used to map the whole extent if non-negligible data is requested for LZMA */
#define EROFS_GET_BLOCKS_READMORE 0x0004
-/* Used to map tail extent for tailpacking inline pcluster */
+/* Used to map tail extent for tailpacking inline or fragment pcluster */
#define EROFS_GET_BLOCKS_FINDTAIL 0x0008
enum {
Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
+ Z_EROFS_COMPRESSION_INTERLACED,
Z_EROFS_COMPRESSION_RUNTIME_MAX
};
@@ -466,7 +493,7 @@ extern const struct inode_operations erofs_generic_iops;
extern const struct inode_operations erofs_symlink_iops;
extern const struct inode_operations erofs_fast_symlink_iops;
-struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid, bool dir);
+struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid);
int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct kstat *stat, u32 request_mask,
unsigned int query_flags);
@@ -576,32 +603,37 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb,
}
#endif /* !CONFIG_EROFS_FS_ZIP */
+/* flags for erofs_fscache_register_cookie() */
+#define EROFS_REG_COOKIE_NEED_INODE 1
+#define EROFS_REG_COOKIE_NEED_NOEXIST 2
+
/* fscache.c */
#ifdef CONFIG_EROFS_FS_ONDEMAND
int erofs_fscache_register_fs(struct super_block *sb);
void erofs_fscache_unregister_fs(struct super_block *sb);
-int erofs_fscache_register_cookie(struct super_block *sb,
- struct erofs_fscache **fscache,
- char *name, bool need_inode);
-void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache);
+struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
+ char *name,
+ unsigned int flags);
+void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache);
extern const struct address_space_operations erofs_fscache_access_aops;
#else
static inline int erofs_fscache_register_fs(struct super_block *sb)
{
- return 0;
+ return -EOPNOTSUPP;
}
static inline void erofs_fscache_unregister_fs(struct super_block *sb) {}
-static inline int erofs_fscache_register_cookie(struct super_block *sb,
- struct erofs_fscache **fscache,
- char *name, bool need_inode)
+static inline
+struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
+ char *name,
+ unsigned int flags)
{
- return -EOPNOTSUPP;
+ return ERR_PTR(-EOPNOTSUPP);
}
-static inline void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache)
+static inline void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache)
{
}
#endif
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index fd75506799c4..b64a108fac92 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -185,7 +185,6 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid,
if (IS_ERR(de))
return PTR_ERR(de);
- /* the target page has been mapped */
if (ndirents)
de = find_target_dirent(&qn, (u8 *)de, EROFS_BLKSIZ, ndirents);
@@ -197,9 +196,7 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid,
return PTR_ERR_OR_ZERO(de);
}
-/* NOTE: i_mutex is already held by vfs */
-static struct dentry *erofs_lookup(struct inode *dir,
- struct dentry *dentry,
+static struct dentry *erofs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
int err;
@@ -207,17 +204,11 @@ static struct dentry *erofs_lookup(struct inode *dir,
unsigned int d_type;
struct inode *inode;
- DBG_BUGON(!d_really_is_negative(dentry));
- /* dentry must be unhashed in lookup, no need to worry about */
- DBG_BUGON(!d_unhashed(dentry));
-
trace_erofs_lookup(dir, dentry, flags);
- /* file name exceeds fs limit */
if (dentry->d_name.len > EROFS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
- /* false uninitialized warnings on gcc 4.8.x */
err = erofs_namei(dir, &dentry->d_name, &nid, &d_type);
if (err == -ENOENT) {
@@ -228,7 +219,7 @@ static struct dentry *erofs_lookup(struct inode *dir,
} else {
erofs_dbg("%s, %pd (nid %llu) found, d_type %u", __func__,
dentry, nid, d_type);
- inode = erofs_iget(dir->i_sb, nid, d_type == FT_DIR);
+ inode = erofs_iget(dir->i_sb, nid);
}
return d_splice_alias(inode, dentry);
}
@@ -237,6 +228,6 @@ const struct inode_operations erofs_dir_iops = {
.lookup = erofs_lookup,
.getattr = erofs_getattr,
.listxattr = erofs_listxattr,
- .get_acl = erofs_get_acl,
+ .get_inode_acl = erofs_get_acl,
.fiemap = erofs_fiemap,
};
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 3173debeaa5a..481788c24a68 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -224,10 +224,10 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
struct erofs_device_info *dif, erofs_off_t *pos)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_fscache *fscache;
struct erofs_deviceslot *dis;
struct block_device *bdev;
void *ptr;
- int ret;
ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*pos), EROFS_KMAP);
if (IS_ERR(ptr))
@@ -245,10 +245,10 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
}
if (erofs_is_fscache_mode(sb)) {
- ret = erofs_fscache_register_cookie(sb, &dif->fscache,
- dif->path, false);
- if (ret)
- return ret;
+ fscache = erofs_fscache_register_cookie(sb, dif->path, 0);
+ if (IS_ERR(fscache))
+ return PTR_ERR(fscache);
+ dif->fscache = fscache;
} else {
bdev = blkdev_get_by_path(dif->path, FMODE_READ | FMODE_EXCL,
sb->s_type);
@@ -381,6 +381,17 @@ static int erofs_read_superblock(struct super_block *sb)
#endif
sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact));
sbi->root_nid = le16_to_cpu(dsb->root_nid);
+#ifdef CONFIG_EROFS_FS_ZIP
+ sbi->packed_inode = NULL;
+ if (erofs_sb_has_fragments(sbi) && dsb->packed_nid) {
+ sbi->packed_inode =
+ erofs_iget(sb, le64_to_cpu(dsb->packed_nid));
+ if (IS_ERR(sbi->packed_inode)) {
+ ret = PTR_ERR(sbi->packed_inode);
+ goto out;
+ }
+ }
+#endif
sbi->inos = le64_to_cpu(dsb->inos);
sbi->build_time = le64_to_cpu(dsb->build_time);
@@ -411,6 +422,10 @@ static int erofs_read_superblock(struct super_block *sb)
erofs_info(sb, "EXPERIMENTAL compressed inline data feature in use. Use at your own risk!");
if (erofs_is_fscache_mode(sb))
erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!");
+ if (erofs_sb_has_fragments(sbi))
+ erofs_info(sb, "EXPERIMENTAL compressed fragments feature in use. Use at your own risk!");
+ if (erofs_sb_has_dedupe(sbi))
+ erofs_info(sb, "EXPERIMENTAL global deduplication feature in use. Use at your own risk!");
out:
erofs_put_metabuf(&buf);
return ret;
@@ -440,6 +455,7 @@ enum {
Opt_dax_enum,
Opt_device,
Opt_fsid,
+ Opt_domain_id,
Opt_err
};
@@ -465,6 +481,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums),
fsparam_string("device", Opt_device),
fsparam_string("fsid", Opt_fsid),
+ fsparam_string("domain_id", Opt_domain_id),
{}
};
@@ -562,14 +579,24 @@ static int erofs_fc_parse_param(struct fs_context *fc,
break;
case Opt_fsid:
#ifdef CONFIG_EROFS_FS_ONDEMAND
- kfree(ctx->opt.fsid);
- ctx->opt.fsid = kstrdup(param->string, GFP_KERNEL);
- if (!ctx->opt.fsid)
+ kfree(ctx->fsid);
+ ctx->fsid = kstrdup(param->string, GFP_KERNEL);
+ if (!ctx->fsid)
return -ENOMEM;
#else
errorfc(fc, "fsid option not supported");
#endif
break;
+ case Opt_domain_id:
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+ kfree(ctx->domain_id);
+ ctx->domain_id = kstrdup(param->string, GFP_KERNEL);
+ if (!ctx->domain_id)
+ return -ENOMEM;
+#else
+ errorfc(fc, "domain_id option not supported");
+#endif
+ break;
default:
return -ENOPARAM;
}
@@ -641,7 +668,7 @@ static int erofs_init_managed_cache(struct super_block *sb) { return 0; }
static struct inode *erofs_nfs_get_inode(struct super_block *sb,
u64 ino, u32 generation)
{
- return erofs_iget(sb, ino, false);
+ return erofs_iget(sb, ino);
}
static struct dentry *erofs_fh_to_dentry(struct super_block *sb,
@@ -667,7 +694,7 @@ static struct dentry *erofs_get_parent(struct dentry *child)
err = erofs_namei(d_inode(child), &dotdot_name, &nid, &d_type);
if (err)
return ERR_PTR(err);
- return d_obtain_alias(erofs_iget(child->d_sb, nid, d_type == FT_DIR));
+ return d_obtain_alias(erofs_iget(child->d_sb, nid));
}
static const struct export_operations erofs_export_ops = {
@@ -676,6 +703,13 @@ static const struct export_operations erofs_export_ops = {
.get_parent = erofs_get_parent,
};
+static int erofs_fc_fill_pseudo_super(struct super_block *sb, struct fs_context *fc)
+{
+ static const struct tree_descr empty_descr = {""};
+
+ return simple_fill_super(sb, EROFS_SUPER_MAGIC, &empty_descr);
+}
+
static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct inode *inode;
@@ -694,9 +728,12 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_fs_info = sbi;
sbi->opt = ctx->opt;
- ctx->opt.fsid = NULL;
sbi->devs = ctx->devs;
ctx->devs = NULL;
+ sbi->fsid = ctx->fsid;
+ ctx->fsid = NULL;
+ sbi->domain_id = ctx->domain_id;
+ ctx->domain_id = NULL;
if (erofs_is_fscache_mode(sb)) {
sb->s_blocksize = EROFS_BLKSIZ;
@@ -706,11 +743,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
if (err)
return err;
- err = erofs_fscache_register_cookie(sb, &sbi->s_fscache,
- sbi->opt.fsid, true);
- if (err)
- return err;
-
err = super_setup_bdi(sb);
if (err)
return err;
@@ -752,7 +784,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
#endif
/* get the root inode */
- inode = erofs_iget(sb, ROOT_NID(sbi), true);
+ inode = erofs_iget(sb, ROOT_NID(sbi));
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -781,11 +813,16 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
return 0;
}
+static int erofs_fc_anon_get_tree(struct fs_context *fc)
+{
+ return get_tree_nodev(fc, erofs_fc_fill_pseudo_super);
+}
+
static int erofs_fc_get_tree(struct fs_context *fc)
{
struct erofs_fs_context *ctx = fc->fs_private;
- if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && ctx->opt.fsid)
+ if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && ctx->fsid)
return get_tree_nodev(fc, erofs_fc_fill_super);
return get_tree_bdev(fc, erofs_fc_fill_super);
@@ -799,6 +836,9 @@ static int erofs_fc_reconfigure(struct fs_context *fc)
DBG_BUGON(!sb_rdonly(sb));
+ if (ctx->fsid || ctx->domain_id)
+ erofs_info(sb, "ignoring reconfiguration for fsid|domain_id.");
+
if (test_opt(&ctx->opt, POSIX_ACL))
fc->sb_flags |= SB_POSIXACL;
else
@@ -817,7 +857,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
fs_put_dax(dif->dax_dev, NULL);
if (dif->bdev)
blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL);
- erofs_fscache_unregister_cookie(&dif->fscache);
+ erofs_fscache_unregister_cookie(dif->fscache);
+ dif->fscache = NULL;
kfree(dif->path);
kfree(dif);
return 0;
@@ -837,7 +878,8 @@ static void erofs_fc_free(struct fs_context *fc)
struct erofs_fs_context *ctx = fc->fs_private;
erofs_free_dev_context(ctx->devs);
- kfree(ctx->opt.fsid);
+ kfree(ctx->fsid);
+ kfree(ctx->domain_id);
kfree(ctx);
}
@@ -848,10 +890,21 @@ static const struct fs_context_operations erofs_context_ops = {
.free = erofs_fc_free,
};
+static const struct fs_context_operations erofs_anon_context_ops = {
+ .get_tree = erofs_fc_anon_get_tree,
+};
+
static int erofs_init_fs_context(struct fs_context *fc)
{
- struct erofs_fs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ struct erofs_fs_context *ctx;
+
+ /* pseudo mount for anon inodes */
+ if (fc->sb_flags & SB_KERNMOUNT) {
+ fc->ops = &erofs_anon_context_ops;
+ return 0;
+ }
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return -ENOMEM;
ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL);
@@ -878,8 +931,14 @@ static void erofs_kill_sb(struct super_block *sb)
WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC);
+ /* pseudo mount for anon inodes */
+ if (sb->s_flags & SB_KERNMOUNT) {
+ kill_anon_super(sb);
+ return;
+ }
+
if (erofs_is_fscache_mode(sb))
- generic_shutdown_super(sb);
+ kill_anon_super(sb);
else
kill_block_super(sb);
@@ -889,9 +948,9 @@ static void erofs_kill_sb(struct super_block *sb)
erofs_free_dev_context(sbi->devs);
fs_put_dax(sbi->dax_dev, NULL);
- erofs_fscache_unregister_cookie(&sbi->s_fscache);
erofs_fscache_unregister_fs(sb);
- kfree(sbi->opt.fsid);
+ kfree(sbi->fsid);
+ kfree(sbi->domain_id);
kfree(sbi);
sb->s_fs_info = NULL;
}
@@ -908,11 +967,13 @@ static void erofs_put_super(struct super_block *sb)
#ifdef CONFIG_EROFS_FS_ZIP
iput(sbi->managed_cache);
sbi->managed_cache = NULL;
+ iput(sbi->packed_inode);
+ sbi->packed_inode = NULL;
#endif
- erofs_fscache_unregister_cookie(&sbi->s_fscache);
+ erofs_fscache_unregister_fs(sb);
}
-static struct file_system_type erofs_fs_type = {
+struct file_system_type erofs_fs_type = {
.owner = THIS_MODULE,
.name = "erofs",
.init_fs_context = erofs_init_fs_context,
@@ -1042,8 +1103,10 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
if (test_opt(opt, DAX_NEVER))
seq_puts(seq, ",dax=never");
#ifdef CONFIG_EROFS_FS_ONDEMAND
- if (opt->fsid)
- seq_printf(seq, ",fsid=%s", opt->fsid);
+ if (sbi->fsid)
+ seq_printf(seq, ",fsid=%s", sbi->fsid);
+ if (sbi->domain_id)
+ seq_printf(seq, ",domain_id=%s", sbi->domain_id);
#endif
return 0;
}
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
index c1383e508bbe..fd476961f742 100644
--- a/fs/erofs/sysfs.c
+++ b/fs/erofs/sysfs.c
@@ -76,6 +76,8 @@ EROFS_ATTR_FEATURE(device_table);
EROFS_ATTR_FEATURE(compr_head2);
EROFS_ATTR_FEATURE(sb_chksum);
EROFS_ATTR_FEATURE(ztailpacking);
+EROFS_ATTR_FEATURE(fragments);
+EROFS_ATTR_FEATURE(dedupe);
static struct attribute *erofs_feat_attrs[] = {
ATTR_LIST(zero_padding),
@@ -86,6 +88,8 @@ static struct attribute *erofs_feat_attrs[] = {
ATTR_LIST(compr_head2),
ATTR_LIST(sb_chksum),
ATTR_LIST(ztailpacking),
+ ATTR_LIST(fragments),
+ ATTR_LIST(dedupe),
NULL,
};
ATTRIBUTE_GROUPS(erofs_feat);
@@ -201,12 +205,27 @@ static struct kobject erofs_feat = {
int erofs_register_sysfs(struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
+ char *name;
+ char *str = NULL;
int err;
+ if (erofs_is_fscache_mode(sb)) {
+ if (sbi->domain_id) {
+ str = kasprintf(GFP_KERNEL, "%s,%s", sbi->domain_id,
+ sbi->fsid);
+ if (!str)
+ return -ENOMEM;
+ name = str;
+ } else {
+ name = sbi->fsid;
+ }
+ } else {
+ name = sb->s_id;
+ }
sbi->s_kobj.kset = &erofs_root;
init_completion(&sbi->s_kobj_unregister);
- err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s",
- erofs_is_fscache_mode(sb) ? sbi->opt.fsid : sb->s_id);
+ err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", name);
+ kfree(str);
if (err)
goto put_sb_kobj;
return 0;
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 8106bcb5a38d..a62fb8a3318a 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -148,7 +148,7 @@ static inline int xattr_iter_fixup(struct xattr_iter *it)
it->blkaddr += erofs_blknr(it->ofs);
it->kaddr = erofs_read_metabuf(&it->buf, it->sb, it->blkaddr,
- EROFS_KMAP_ATOMIC);
+ EROFS_KMAP);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
it->ofs = erofs_blkoff(it->ofs);
@@ -174,7 +174,7 @@ static int inline_xattr_iter_begin(struct xattr_iter *it,
it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs);
it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr,
- EROFS_KMAP_ATOMIC);
+ EROFS_KMAP);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
return vi->xattr_isize - xattr_header_sz;
@@ -368,7 +368,7 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it)
it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr,
- EROFS_KMAP_ATOMIC);
+ EROFS_KMAP);
if (IS_ERR(it->it.kaddr))
return PTR_ERR(it->it.kaddr);
it->it.blkaddr = blkaddr;
@@ -580,7 +580,7 @@ static int shared_listxattr(struct listxattr_iter *it)
it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr,
- EROFS_KMAP_ATOMIC);
+ EROFS_KMAP);
if (IS_ERR(it->it.kaddr))
return PTR_ERR(it->it.kaddr);
it->it.blkaddr = blkaddr;
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
index 332462c59f11..0a43c9ee9f8f 100644
--- a/fs/erofs/xattr.h
+++ b/fs/erofs/xattr.h
@@ -39,9 +39,7 @@ static inline unsigned int xattrblock_offset(struct erofs_sb_info *sbi,
#ifdef CONFIG_EROFS_FS_XATTR
extern const struct xattr_handler erofs_xattr_user_handler;
extern const struct xattr_handler erofs_xattr_trusted_handler;
-#ifdef CONFIG_EROFS_FS_SECURITY
extern const struct xattr_handler erofs_xattr_security_handler;
-#endif
static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx)
{
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 5792ca9e0d5e..ccf7c55d477f 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -7,6 +7,7 @@
#include "zdata.h"
#include "compress.h"
#include <linux/prefetch.h>
+#include <linux/psi.h>
#include <trace/events/erofs.h>
@@ -174,16 +175,6 @@ static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
DBG_BUGON(1);
}
-/* how to allocate cached pages for a pcluster */
-enum z_erofs_cache_alloctype {
- DONTALLOC, /* don't allocate any cached pages */
- /*
- * try to use cached I/O if page allocation succeeds or fallback
- * to in-place I/O instead to avoid any direct reclaim.
- */
- TRYALLOC,
-};
-
/*
* tagged pointer with 1-bit tag for all compressed pages
* tag 0 - the page is just found with an extra page reference
@@ -291,12 +282,29 @@ struct z_erofs_decompress_frontend {
.inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
.mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true }
+static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
+{
+ unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy;
+
+ if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
+ return false;
+
+ if (fe->backmost)
+ return true;
+
+ if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
+ fe->map.m_la < fe->headoffset)
+ return true;
+
+ return false;
+}
+
static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
- enum z_erofs_cache_alloctype type,
struct page **pagepool)
{
struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
struct z_erofs_pcluster *pcl = fe->pcl;
+ bool shouldalloc = z_erofs_should_alloc_cache(fe);
bool standalone = true;
/*
* optimistic allocation without direct reclaim since inplace I/O
@@ -325,18 +333,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
} else {
/* I/O is needed, no possible to decompress directly */
standalone = false;
- switch (type) {
- case TRYALLOC:
- newpage = erofs_allocpage(pagepool, gfp);
- if (!newpage)
- continue;
- set_page_private(newpage,
- Z_EROFS_PREALLOCATED_PAGE);
- t = tag_compressed_page_justfound(newpage);
- break;
- default: /* DONTALLOC */
+ if (!shouldalloc)
continue;
- }
+
+ /*
+ * try to use cached I/O if page allocation
+ * succeeds or fallback to in-place I/O instead
+ * to avoid any direct reclaim.
+ */
+ newpage = erofs_allocpage(pagepool, gfp);
+ if (!newpage)
+ continue;
+ set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
+ t = tag_compressed_page_justfound(newpage);
}
if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL,
@@ -487,7 +496,8 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
struct erofs_workgroup *grp;
int err;
- if (!(map->m_flags & EROFS_MAP_ENCODED)) {
+ if (!(map->m_flags & EROFS_MAP_ENCODED) ||
+ (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
@@ -636,30 +646,45 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
return true;
}
-static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
- unsigned int cachestrategy,
- erofs_off_t la)
+static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos,
+ struct page *page, unsigned int pageofs,
+ unsigned int len)
{
- if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
- return false;
+ struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ u8 *src, *dst;
+ unsigned int i, cnt;
- if (fe->backmost)
- return true;
+ if (!packed_inode)
+ return -EFSCORRUPTED;
- return cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
- la < fe->headoffset;
+ pos += EROFS_I(inode)->z_fragmentoff;
+ for (i = 0; i < len; i += cnt) {
+ cnt = min_t(unsigned int, len - i,
+ EROFS_BLKSIZ - erofs_blkoff(pos));
+ src = erofs_bread(&buf, packed_inode,
+ erofs_blknr(pos), EROFS_KMAP);
+ if (IS_ERR(src)) {
+ erofs_put_metabuf(&buf);
+ return PTR_ERR(src);
+ }
+
+ dst = kmap_local_page(page);
+ memcpy(dst + pageofs + i, src + erofs_blkoff(pos), cnt);
+ kunmap_local(dst);
+ pos += cnt;
+ }
+ erofs_put_metabuf(&buf);
+ return 0;
}
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
struct page *page, struct page **pagepool)
{
struct inode *const inode = fe->inode;
- struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct erofs_map_blocks *const map = &fe->map;
const loff_t offset = page_offset(page);
bool tight = true, exclusive;
-
- enum z_erofs_cache_alloctype cache_strategy;
unsigned int cur, end, spiltted;
int err = 0;
@@ -688,7 +713,8 @@ repeat:
/* didn't get a valid pcluster previously (very rare) */
}
- if (!(map->m_flags & EROFS_MAP_MAPPED))
+ if (!(map->m_flags & EROFS_MAP_MAPPED) ||
+ map->m_flags & EROFS_MAP_FRAGMENT)
goto hitted;
err = z_erofs_collector_begin(fe);
@@ -712,13 +738,7 @@ repeat:
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
} else {
/* bind cache first when cached decompression is preferred */
- if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy,
- map->m_la))
- cache_strategy = TRYALLOC;
- else
- cache_strategy = DONTALLOC;
-
- z_erofs_bind_cache(fe, cache_strategy, pagepool);
+ z_erofs_bind_cache(fe, pagepool);
}
hitted:
/*
@@ -735,6 +755,24 @@ hitted:
zero_user_segment(page, cur, end);
goto next_part;
}
+ if (map->m_flags & EROFS_MAP_FRAGMENT) {
+ unsigned int pageofs, skip, len;
+
+ if (offset > map->m_la) {
+ pageofs = 0;
+ skip = offset - map->m_la;
+ } else {
+ pageofs = map->m_la & ~PAGE_MASK;
+ skip = 0;
+ }
+ len = min_t(unsigned int, map->m_llen - skip, end - cur);
+ err = z_erofs_read_fragment(inode, skip, page, pageofs, len);
+ if (err)
+ goto out;
+ ++spiltted;
+ tight = false;
+ goto next_part;
+ }
exclusive = (!cur && (!spiltted || tight));
if (cur)
@@ -764,14 +802,14 @@ retry:
++spiltted;
if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
fe->pcl->multibases = true;
-
- if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
- fe->pcl->length == map->m_llen)
- fe->pcl->partial = false;
if (fe->pcl->length < offset + end - map->m_la) {
fe->pcl->length = offset + end - map->m_la;
fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
}
+ if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
+ !(map->m_flags & EROFS_MAP_PARTIAL_REF) &&
+ fe->pcl->length == map->m_llen)
+ fe->pcl->partial = false;
next_part:
/* shorten the remaining extent to update progress */
map->m_llen = offset + cur - map->m_la;
@@ -838,15 +876,13 @@ static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be,
if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK)) {
unsigned int pgnr;
- struct page *oldpage;
pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT;
DBG_BUGON(pgnr >= be->nr_pages);
- oldpage = be->decompressed_pages[pgnr];
- be->decompressed_pages[pgnr] = bvec->page;
-
- if (!oldpage)
+ if (!be->decompressed_pages[pgnr]) {
+ be->decompressed_pages[pgnr] = bvec->page;
return;
+ }
}
/* (cold path) one pcluster is requested multiple times */
@@ -1365,6 +1401,8 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
struct block_device *last_bdev;
unsigned int nr_bios = 0;
struct bio *bio = NULL;
+ unsigned long pflags;
+ int memstall = 0;
bi_private = jobqueueset_init(sb, q, fgq, force_fg);
qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
@@ -1415,9 +1453,18 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
last_bdev != mdev.m_bdev)) {
submit_bio_retry:
submit_bio(bio);
+ if (memstall) {
+ psi_memstall_leave(&pflags);
+ memstall = 0;
+ }
bio = NULL;
}
+ if (unlikely(PageWorkingset(page)) && !memstall) {
+ psi_memstall_enter(&pflags);
+ memstall = 1;
+ }
+
if (!bio) {
bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
REQ_OP_READ, GFP_NOIO);
@@ -1445,8 +1492,11 @@ submit_bio_retry:
move_to_bypass_jobqueue(pcl, qtail, owned_head);
} while (owned_head != Z_EROFS_PCLUSTER_TAIL);
- if (bio)
+ if (bio) {
submit_bio(bio);
+ if (memstall)
+ psi_memstall_leave(&pflags);
+ }
/*
* although background is preferred, no one is pending for submission.
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
index e7f04c4fbb81..d98c95212985 100644
--- a/fs/erofs/zdata.h
+++ b/fs/erofs/zdata.h
@@ -126,10 +126,10 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
}
/*
- * bit 31: I/O error occurred on this page
- * bit 0 - 30: remaining parts to complete this page
+ * bit 30: I/O error occurred on this page
+ * bit 0 - 29: remaining parts to complete this page
*/
-#define Z_EROFS_PAGE_EIO (1 << 31)
+#define Z_EROFS_PAGE_EIO (1 << 30)
static inline void z_erofs_onlinepage_init(struct page *page)
{
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index d58549ca1df9..0150570c33aa 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -17,7 +17,7 @@ int z_erofs_fill_inode(struct inode *inode)
struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
if (!erofs_sb_has_big_pcluster(sbi) &&
- !erofs_sb_has_ztailpacking(sbi) &&
+ !erofs_sb_has_ztailpacking(sbi) && !erofs_sb_has_fragments(sbi) &&
vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) {
vi->z_advise = 0;
vi->z_algorithmtype[0] = 0;
@@ -55,20 +55,25 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
goto out_unlock;
- DBG_BUGON(!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
- !erofs_sb_has_ztailpacking(EROFS_SB(sb)) &&
- vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY);
-
pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize +
vi->xattr_isize, 8);
- kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos),
- EROFS_KMAP_ATOMIC);
+ kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP);
if (IS_ERR(kaddr)) {
err = PTR_ERR(kaddr);
goto out_unlock;
}
h = kaddr + erofs_blkoff(pos);
+ /*
+ * if the highest bit of the 8-byte map header is set, the whole file
+ * is stored in the packed inode. The rest bits keeps z_fragmentoff.
+ */
+ if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) {
+ vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER;
+ vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63);
+ vi->z_tailextent_headlcn = 0;
+ goto done;
+ }
vi->z_advise = le16_to_cpu(h->h_advise);
vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
@@ -79,7 +84,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel",
headnr + 1, vi->z_algorithmtype[headnr], vi->nid);
err = -EOPNOTSUPP;
- goto unmap_done;
+ goto out_put_metabuf;
}
vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7);
@@ -89,7 +94,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu",
vi->nid);
err = -EFSCORRUPTED;
- goto unmap_done;
+ goto out_put_metabuf;
}
if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION &&
!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^
@@ -97,12 +102,8 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu",
vi->nid);
err = -EFSCORRUPTED;
- goto unmap_done;
+ goto out_put_metabuf;
}
-unmap_done:
- erofs_put_metabuf(&buf);
- if (err)
- goto out_unlock;
if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) {
struct erofs_map_blocks map = {
@@ -121,11 +122,28 @@ unmap_done:
err = -EFSCORRUPTED;
}
if (err < 0)
- goto out_unlock;
+ goto out_put_metabuf;
}
+
+ if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER &&
+ !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) {
+ struct erofs_map_blocks map = {
+ .buf = __EROFS_BUF_INITIALIZER
+ };
+
+ vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff);
+ err = z_erofs_do_map_blocks(inode, &map,
+ EROFS_GET_BLOCKS_FINDTAIL);
+ erofs_put_metabuf(&map.buf);
+ if (err < 0)
+ goto out_put_metabuf;
+ }
+done:
/* paired with smp_mb() at the beginning of the function */
smp_mb();
set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
+out_put_metabuf:
+ erofs_put_metabuf(&buf);
out_unlock:
clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags);
return err;
@@ -143,20 +161,9 @@ struct z_erofs_maprecorder {
u16 delta[2];
erofs_blk_t pblk, compressedblks;
erofs_off_t nextpackoff;
+ bool partialref;
};
-static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m,
- erofs_blk_t eblk)
-{
- struct super_block *const sb = m->inode->i_sb;
-
- m->kaddr = erofs_read_metabuf(&m->map->buf, sb, eblk,
- EROFS_KMAP_ATOMIC);
- if (IS_ERR(m->kaddr))
- return PTR_ERR(m->kaddr);
- return 0;
-}
-
static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
unsigned long lcn)
{
@@ -169,11 +176,11 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
lcn * sizeof(struct z_erofs_vle_decompressed_index);
struct z_erofs_vle_decompressed_index *di;
unsigned int advise, type;
- int err;
- err = z_erofs_reload_indexes(m, erofs_blknr(pos));
- if (err)
- return err;
+ m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
+ erofs_blknr(pos), EROFS_KMAP);
+ if (IS_ERR(m->kaddr))
+ return PTR_ERR(m->kaddr);
m->nextpackoff = pos + sizeof(struct z_erofs_vle_decompressed_index);
m->lcn = lcn;
@@ -201,6 +208,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
+ if (advise & Z_EROFS_VLE_DI_PARTIAL_REF)
+ m->partialref = true;
m->clusterofs = le16_to_cpu(di->di_clusterofs);
m->pblk = le32_to_cpu(di->di_u.blkaddr);
break;
@@ -370,7 +379,6 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
unsigned int compacted_4b_initial, compacted_2b;
unsigned int amortizedshift;
erofs_off_t pos;
- int err;
if (lclusterbits != 12)
return -EOPNOTSUPP;
@@ -407,9 +415,10 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
amortizedshift = 2;
out:
pos += lcn * (1 << amortizedshift);
- err = z_erofs_reload_indexes(m, erofs_blknr(pos));
- if (err)
- return err;
+ m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
+ erofs_blknr(pos), EROFS_KMAP);
+ if (IS_ERR(m->kaddr))
+ return PTR_ERR(m->kaddr);
return unpack_compacted_index(m, amortizedshift, pos, lookahead);
}
@@ -598,6 +607,7 @@ static int z_erofs_do_map_blocks(struct inode *inode,
{
struct erofs_inode *const vi = EROFS_I(inode);
bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER;
+ bool fragment = vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER;
struct z_erofs_maprecorder m = {
.inode = inode,
.map = map,
@@ -663,28 +673,47 @@ static int z_erofs_do_map_blocks(struct inode *inode,
err = -EOPNOTSUPP;
goto unmap_out;
}
-
+ if (m.partialref)
+ map->m_flags |= EROFS_MAP_PARTIAL_REF;
map->m_llen = end - map->m_la;
- if (flags & EROFS_GET_BLOCKS_FINDTAIL)
+ if (flags & EROFS_GET_BLOCKS_FINDTAIL) {
vi->z_tailextent_headlcn = m.lcn;
+ /* for non-compact indexes, fragmentoff is 64 bits */
+ if (fragment &&
+ vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY)
+ vi->z_fragmentoff |= (u64)m.pblk << 32;
+ }
if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) {
map->m_flags |= EROFS_MAP_META;
map->m_pa = vi->z_idataoff;
map->m_plen = vi->z_idata_size;
+ } else if (fragment && m.lcn == vi->z_tailextent_headlcn) {
+ map->m_flags |= EROFS_MAP_FRAGMENT;
} else {
map->m_pa = blknr_to_addr(m.pblk);
err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
if (err)
- goto out;
+ goto unmap_out;
}
- if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN)
- map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
- else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2)
+ if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) {
+ if (map->m_llen > map->m_plen) {
+ DBG_BUGON(1);
+ err = -EFSCORRUPTED;
+ goto unmap_out;
+ }
+ if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER)
+ map->m_algorithmformat =
+ Z_EROFS_COMPRESSION_INTERLACED;
+ else
+ map->m_algorithmformat =
+ Z_EROFS_COMPRESSION_SHIFTED;
+ } else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) {
map->m_algorithmformat = vi->z_algorithmtype[1];
- else
+ } else {
map->m_algorithmformat = vi->z_algorithmtype[0];
+ }
if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
((flags & EROFS_GET_BLOCKS_READMORE) &&
@@ -694,21 +723,19 @@ static int z_erofs_do_map_blocks(struct inode *inode,
if (!err)
map->m_flags |= EROFS_MAP_FULL_MAPPED;
}
+
unmap_out:
erofs_unmap_metabuf(&m.map->buf);
-
-out:
erofs_dbg("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o",
__func__, map->m_la, map->m_pa,
map->m_llen, map->m_plen, map->m_flags);
-
return err;
}
-int z_erofs_map_blocks_iter(struct inode *inode,
- struct erofs_map_blocks *map,
+int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
int flags)
{
+ struct erofs_inode *const vi = EROFS_I(inode);
int err = 0;
trace_z_erofs_map_blocks_iter_enter(inode, map, flags);
@@ -725,6 +752,15 @@ int z_erofs_map_blocks_iter(struct inode *inode,
if (err)
goto out;
+ if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) &&
+ !vi->z_tailextent_headlcn) {
+ map->m_la = 0;
+ map->m_llen = inode->i_size;
+ map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_FULL_MAPPED |
+ EROFS_MAP_FRAGMENT;
+ goto out;
+ }
+
err = z_erofs_do_map_blocks(inode, map, flags);
out:
trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err);
@@ -751,7 +787,8 @@ static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset,
iomap->length = map.m_llen;
if (map.m_flags & EROFS_MAP_MAPPED) {
iomap->type = IOMAP_MAPPED;
- iomap->addr = map.m_pa;
+ iomap->addr = map.m_flags & EROFS_MAP_FRAGMENT ?
+ IOMAP_NULL_ADDR : map.m_pa;
} else {
iomap->type = IOMAP_HOLE;
iomap->addr = IOMAP_NULL_ADDR;
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 3627dd7d25db..249ca6c0b784 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -43,21 +43,7 @@ struct eventfd_ctx {
int id;
};
-/**
- * eventfd_signal - Adds @n to the eventfd counter.
- * @ctx: [in] Pointer to the eventfd context.
- * @n: [in] Value of the counter to be added to the eventfd internal counter.
- * The value cannot be negative.
- *
- * This function is supposed to be called by the kernel in paths that do not
- * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
- * value, and we signal this as overflow condition by returning a EPOLLERR
- * to poll(2).
- *
- * Returns the amount by which the counter was incremented. This will be less
- * than @n if the counter has overflowed.
- */
-__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
+__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask)
{
unsigned long flags;
@@ -69,21 +55,40 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
* it returns false, the eventfd_signal() call should be deferred to a
* safe context.
*/
- if (WARN_ON_ONCE(current->in_eventfd_signal))
+ if (WARN_ON_ONCE(current->in_eventfd))
return 0;
spin_lock_irqsave(&ctx->wqh.lock, flags);
- current->in_eventfd_signal = 1;
+ current->in_eventfd = 1;
if (ULLONG_MAX - ctx->count < n)
n = ULLONG_MAX - ctx->count;
ctx->count += n;
if (waitqueue_active(&ctx->wqh))
- wake_up_locked_poll(&ctx->wqh, EPOLLIN);
- current->in_eventfd_signal = 0;
+ wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask);
+ current->in_eventfd = 0;
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
return n;
}
+
+/**
+ * eventfd_signal - Adds @n to the eventfd counter.
+ * @ctx: [in] Pointer to the eventfd context.
+ * @n: [in] Value of the counter to be added to the eventfd internal counter.
+ * The value cannot be negative.
+ *
+ * This function is supposed to be called by the kernel in paths that do not
+ * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
+ * value, and we signal this as overflow condition by returning a EPOLLERR
+ * to poll(2).
+ *
+ * Returns the amount by which the counter was incremented. This will be less
+ * than @n if the counter has overflowed.
+ */
+__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
+{
+ return eventfd_signal_mask(ctx, n, 0);
+}
EXPORT_SYMBOL_GPL(eventfd_signal);
static void eventfd_free_ctx(struct eventfd_ctx *ctx)
@@ -253,8 +258,10 @@ static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
__set_current_state(TASK_RUNNING);
}
eventfd_ctx_do_read(ctx, &ucnt);
+ current->in_eventfd = 1;
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
+ current->in_eventfd = 0;
spin_unlock_irq(&ctx->wqh.lock);
if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt)))
return -EFAULT;
@@ -301,8 +308,10 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
}
if (likely(res > 0)) {
ctx->count += ucnt;
+ current->in_eventfd = 1;
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+ current->in_eventfd = 0;
}
spin_unlock_irq(&ctx->wqh.lock);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 8b56b94e2f56..64659b110973 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -491,7 +491,8 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
+ unsigned pollflags)
{
struct eventpoll *ep_src;
unsigned long flags;
@@ -522,16 +523,17 @@ static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
}
spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
ep->nests = nests + 1;
- wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
+ wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags);
ep->nests = 0;
spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
}
#else
-static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
+ unsigned pollflags)
{
- wake_up_poll(&ep->poll_wait, EPOLLIN);
+ wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags);
}
#endif
@@ -742,7 +744,7 @@ static void ep_free(struct eventpoll *ep)
/* We need to release all tasks waiting for these file */
if (waitqueue_active(&ep->poll_wait))
- ep_poll_safewake(ep, NULL);
+ ep_poll_safewake(ep, NULL, 0);
/*
* We need to lock this because we could be hit by
@@ -1065,7 +1067,7 @@ static inline bool list_add_tail_lockless(struct list_head *new,
* added to the list from another CPU: the winner observes
* new->next == new.
*/
- if (cmpxchg(&new->next, new, head) != new)
+ if (!try_cmpxchg(&new->next, &new, head))
return false;
/*
@@ -1208,7 +1210,7 @@ out_unlock:
/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(ep, epi);
+ ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE);
if (!(epi->event.events & EPOLLEXCLUSIVE))
ewake = 1;
@@ -1553,7 +1555,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(ep, NULL);
+ ep_poll_safewake(ep, NULL, 0);
return 0;
}
@@ -1629,7 +1631,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(ep, NULL);
+ ep_poll_safewake(ep, NULL, 0);
return 0;
}
diff --git a/fs/exec.c b/fs/exec.c
index d046dbb9cbd0..ab913243a367 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -28,7 +28,6 @@
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
-#include <linux/vmacache.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/swap.h>
@@ -65,6 +64,7 @@
#include <linux/io_uring.h>
#include <linux/syscall_user_dispatch.h>
#include <linux/coredump.h>
+#include <linux/time_namespace.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -172,7 +172,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
exit:
fput(file);
out:
- return error;
+ return error;
}
#endif /* #ifdef CONFIG_USELIB */
@@ -200,7 +200,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
{
struct page *page;
int ret;
- unsigned int gup_flags = FOLL_FORCE;
+ unsigned int gup_flags = 0;
#ifdef CONFIG_STACK_GROWSUP
if (write) {
@@ -683,6 +683,8 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
unsigned long length = old_end - old_start;
unsigned long new_start = old_start - shift;
unsigned long new_end = old_end - shift;
+ VMA_ITERATOR(vmi, mm, new_start);
+ struct vm_area_struct *next;
struct mmu_gather tlb;
BUG_ON(new_start > new_end);
@@ -691,7 +693,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
* ensure there are no vmas between where we want to go
* and where we are
*/
- if (vma != find_vma(mm, new_start))
+ if (vma != vma_next(&vmi))
return -EFAULT;
/*
@@ -710,12 +712,13 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
lru_add_drain();
tlb_gather_mmu(&tlb, mm);
+ next = vma_next(&vmi);
if (new_end > old_start) {
/*
* when the old and new regions overlap clear from new_end.
*/
free_pgd_range(&tlb, new_end, old_end, new_end,
- vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
+ next ? next->vm_start : USER_PGTABLES_CEILING);
} else {
/*
* otherwise, clean from old_start; this is done to not touch
@@ -724,7 +727,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
* for the others its just a little faster.
*/
free_pgd_range(&tlb, old_start, old_end, new_end,
- vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
+ next ? next->vm_start : USER_PGTABLES_CEILING);
}
tlb_finish_mmu(&tlb);
@@ -840,16 +843,13 @@ int setup_arg_pages(struct linux_binprm *bprm,
* will align it up.
*/
rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK;
+
+ stack_expand = min(rlim_stack, stack_size + stack_expand);
+
#ifdef CONFIG_STACK_GROWSUP
- if (stack_size + stack_expand > rlim_stack)
- stack_base = vma->vm_start + rlim_stack;
- else
- stack_base = vma->vm_end + stack_expand;
+ stack_base = vma->vm_start + stack_expand;
#else
- if (stack_size + stack_expand > rlim_stack)
- stack_base = vma->vm_end - rlim_stack;
- else
- stack_base = vma->vm_start - stack_expand;
+ stack_base = vma->vm_end - stack_expand;
#endif
current->mm->start_stack = bprm->p;
ret = expand_stack(vma, stack_base);
@@ -957,8 +957,7 @@ struct file *open_exec(const char *name)
}
EXPORT_SYMBOL(open_exec);
-#if defined(CONFIG_HAVE_AOUT) || defined(CONFIG_BINFMT_FLAT) || \
- defined(CONFIG_BINFMT_ELF_FDPIC)
+#if defined(CONFIG_BINFMT_FLAT) || defined(CONFIG_BINFMT_ELF_FDPIC)
ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
{
ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
@@ -1023,9 +1022,9 @@ static int exec_mmap(struct mm_struct *mm)
activate_mm(active_mm, mm);
if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
local_irq_enable();
- tsk->mm->vmacache_seqnum = 0;
- vmacache_flush(tsk);
+ lru_gen_add_mm(mm);
task_unlock(tsk);
+ lru_gen_use_mm(mm);
if (old_mm) {
mmap_read_unlock(old_mm);
BUG_ON(active_mm != old_mm);
@@ -1196,11 +1195,11 @@ static int unshare_sighand(struct task_struct *me)
return -ENOMEM;
refcount_set(&newsighand->count, 1);
- memcpy(newsighand->action, oldsighand->action,
- sizeof(newsighand->action));
write_lock_irq(&tasklist_lock);
spin_lock(&oldsighand->siglock);
+ memcpy(newsighand->action, oldsighand->action,
+ sizeof(newsighand->action));
rcu_assign_pointer(me->sighand, newsighand);
spin_unlock(&oldsighand->siglock);
write_unlock_irq(&tasklist_lock);
@@ -1296,6 +1295,10 @@ int begin_new_exec(struct linux_binprm * bprm)
bprm->mm = NULL;
+ retval = exec_task_namespaces();
+ if (retval)
+ goto out_unlock;
+
#ifdef CONFIG_POSIX_TIMERS
spin_lock_irq(&me->sighand->siglock);
posix_cpu_timers_exit(me);
@@ -1567,6 +1570,12 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
if (task_no_new_privs(current))
bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
+ /*
+ * If another task is sharing our fs, we cannot safely
+ * suid exec because the differently privileged task
+ * will be able to manipulate the current directory, etc.
+ * It would be nice to force an unshare instead...
+ */
t = p;
n_fs = 1;
spin_lock(&p->fs->lock);
@@ -1588,10 +1597,10 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
{
/* Handle suid and sgid on files */
struct user_namespace *mnt_userns;
- struct inode *inode;
+ struct inode *inode = file_inode(file);
unsigned int mode;
- kuid_t uid;
- kgid_t gid;
+ vfsuid_t vfsuid;
+ vfsgid_t vfsgid;
if (!mnt_may_suid(file->f_path.mnt))
return;
@@ -1599,7 +1608,6 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
if (task_no_new_privs(current))
return;
- inode = file->f_path.dentry->d_inode;
mode = READ_ONCE(inode->i_mode);
if (!(mode & (S_ISUID|S_ISGID)))
return;
@@ -1611,23 +1619,23 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
/* reload atomically mode/uid/gid now that lock held */
mode = inode->i_mode;
- uid = i_uid_into_mnt(mnt_userns, inode);
- gid = i_gid_into_mnt(mnt_userns, inode);
+ vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
inode_unlock(inode);
/* We ignore suid/sgid if there are no mappings for them in the ns */
- if (!kuid_has_mapping(bprm->cred->user_ns, uid) ||
- !kgid_has_mapping(bprm->cred->user_ns, gid))
+ if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) ||
+ !vfsgid_has_mapping(bprm->cred->user_ns, vfsgid))
return;
if (mode & S_ISUID) {
bprm->per_clear |= PER_CLEAR_ON_SETID;
- bprm->cred->euid = uid;
+ bprm->cred->euid = vfsuid_into_kuid(vfsuid);
}
if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
bprm->per_clear |= PER_CLEAR_ON_SETID;
- bprm->cred->egid = gid;
+ bprm->cred->egid = vfsgid_into_kgid(vfsgid);
}
}
@@ -1748,6 +1756,7 @@ static int search_binary_handler(struct linux_binprm *bprm)
return retval;
}
+/* binfmt handlers will call back into begin_new_exec() on success. */
static int exec_binprm(struct linux_binprm *bprm)
{
pid_t old_pid, old_vpid;
@@ -1806,6 +1815,11 @@ static int bprm_execve(struct linux_binprm *bprm,
if (retval)
return retval;
+ /*
+ * Check for unsafe execution states before exec_binprm(), which
+ * will call back into begin_new_exec(), into bprm_creds_from_file(),
+ * where setuid-ness is evaluated.
+ */
check_unsafe_exec(bprm);
current->in_execve = 1;
@@ -1881,7 +1895,7 @@ static int do_execveat_common(int fd, struct filename *filename,
* whether NPROC limit is still exceeded.
*/
if ((current->flags & PF_NPROC_EXCEEDED) &&
- is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
+ is_rlimit_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
retval = -EAGAIN;
goto out_ret;
}
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index a27b55ec060a..1dfa67f307f1 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -33,10 +33,9 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb,
struct exfat_chain *p_dir, int entry, unsigned short *uniname)
{
int i;
- struct exfat_entry_set_cache *es;
+ struct exfat_entry_set_cache es;
- es = exfat_get_dentry_set(sb, p_dir, entry, ES_ALL_ENTRIES);
- if (!es)
+ if (exfat_get_dentry_set(&es, sb, p_dir, entry, ES_ALL_ENTRIES))
return;
/*
@@ -45,8 +44,8 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb,
* Third entry : first file-name entry
* So, the index of first file-name dentry should start from 2.
*/
- for (i = 2; i < es->num_entries; i++) {
- struct exfat_dentry *ep = exfat_get_dentry_cached(es, i);
+ for (i = ES_IDX_FIRST_FILENAME; i < es.num_entries; i++) {
+ struct exfat_dentry *ep = exfat_get_dentry_cached(&es, i);
/* end of name entry */
if (exfat_get_entry_type(ep) != TYPE_EXTEND)
@@ -56,13 +55,13 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb,
uniname += EXFAT_FILE_NAME_LEN;
}
- exfat_free_dentry_set(es, false);
+ exfat_put_dentry_set(&es, false);
}
/* read a directory entry from the opened directory */
static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_entry *dir_entry)
{
- int i, dentries_per_clu, dentries_per_clu_bits = 0, num_ext;
+ int i, dentries_per_clu, num_ext;
unsigned int type, clu_offset, max_dentries;
struct exfat_chain dir, clu;
struct exfat_uni_name uni_name;
@@ -84,11 +83,10 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent
EXFAT_B_TO_CLU(i_size_read(inode), sbi), ei->flags);
dentries_per_clu = sbi->dentries_per_clu;
- dentries_per_clu_bits = ilog2(dentries_per_clu);
max_dentries = (unsigned int)min_t(u64, MAX_EXFAT_DENTRIES,
- (u64)sbi->num_clusters << dentries_per_clu_bits);
+ (u64)EXFAT_CLU_TO_DEN(sbi->num_clusters, sbi));
- clu_offset = dentry >> dentries_per_clu_bits;
+ clu_offset = EXFAT_DEN_TO_CLU(dentry, sbi);
exfat_chain_dup(&clu, &dir);
if (clu.flags == ALLOC_NO_FAT_CHAIN) {
@@ -163,7 +161,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent
dir_entry->entry = dentry;
brelse(bh);
- ei->hint_bmap.off = dentry >> dentries_per_clu_bits;
+ ei->hint_bmap.off = EXFAT_DEN_TO_CLU(dentry, sbi);
ei->hint_bmap.clu = clu.dir;
*cpos = EXFAT_DEN_TO_B(dentry + 1 + num_ext);
@@ -212,9 +210,9 @@ static void exfat_free_namebuf(struct exfat_dentry_namebuf *nb)
/* skip iterating emit_dots when dir is empty */
#define ITER_POS_FILLED_DOTS (2)
-static int exfat_iterate(struct file *filp, struct dir_context *ctx)
+static int exfat_iterate(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct inode *tmp;
struct exfat_dir_entry de;
@@ -228,7 +226,7 @@ static int exfat_iterate(struct file *filp, struct dir_context *ctx)
mutex_lock(&EXFAT_SB(sb)->s_lock);
cpos = ctx->pos;
- if (!dir_emit_dots(filp, ctx))
+ if (!dir_emit_dots(file, ctx))
goto unlock;
if (ctx->pos == ITER_POS_FILLED_DOTS) {
@@ -337,7 +335,7 @@ int exfat_calc_num_entries(struct exfat_uni_name *p_uniname)
return -EINVAL;
/* 1 file entry + 1 stream entry + name entries */
- return ((len - 1) / EXFAT_FILE_NAME_LEN + 3);
+ return ES_ENTRY_NUM(len);
}
unsigned int exfat_get_entry_type(struct exfat_dentry *ep)
@@ -592,18 +590,18 @@ void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es)
unsigned short chksum = 0;
struct exfat_dentry *ep;
- for (i = 0; i < es->num_entries; i++) {
+ for (i = ES_IDX_FILE; i < es->num_entries; i++) {
ep = exfat_get_dentry_cached(es, i);
chksum = exfat_calc_chksum16(ep, DENTRY_SIZE, chksum,
chksum_type);
chksum_type = CS_DEFAULT;
}
- ep = exfat_get_dentry_cached(es, 0);
+ ep = exfat_get_dentry_cached(es, ES_IDX_FILE);
ep->dentry.file.checksum = cpu_to_le16(chksum);
es->modified = true;
}
-int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync)
+int exfat_put_dentry_set(struct exfat_entry_set_cache *es, int sync)
{
int i, err = 0;
@@ -615,7 +613,10 @@ int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync)
bforget(es->bh[i]);
else
brelse(es->bh[i]);
- kfree(es);
+
+ if (IS_DYNAMIC_ES(es))
+ kfree(es->bh);
+
return err;
}
@@ -812,14 +813,14 @@ struct exfat_dentry *exfat_get_dentry_cached(
* pointer of entry set on success,
* NULL on failure.
*/
-struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb,
- struct exfat_chain *p_dir, int entry, unsigned int type)
+int exfat_get_dentry_set(struct exfat_entry_set_cache *es,
+ struct super_block *sb, struct exfat_chain *p_dir, int entry,
+ unsigned int type)
{
int ret, i, num_bh;
- unsigned int off, byte_offset, clu = 0;
+ unsigned int off;
sector_t sec;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
- struct exfat_entry_set_cache *es;
struct exfat_dentry *ep;
int num_entries;
enum exfat_validate_dentry_mode mode = ES_MODE_STARTED;
@@ -827,52 +828,51 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb,
if (p_dir->dir == DIR_DELETED) {
exfat_err(sb, "access to deleted dentry");
- return NULL;
+ return -EIO;
}
- byte_offset = EXFAT_DEN_TO_B(entry);
- ret = exfat_walk_fat_chain(sb, p_dir, byte_offset, &clu);
+ ret = exfat_find_location(sb, p_dir, entry, &sec, &off);
if (ret)
- return NULL;
+ return ret;
- es = kzalloc(sizeof(*es), GFP_KERNEL);
- if (!es)
- return NULL;
+ memset(es, 0, sizeof(*es));
es->sb = sb;
es->modified = false;
-
- /* byte offset in cluster */
- byte_offset = EXFAT_CLU_OFFSET(byte_offset, sbi);
-
- /* byte offset in sector */
- off = EXFAT_BLK_OFFSET(byte_offset, sb);
es->start_off = off;
-
- /* sector offset in cluster */
- sec = EXFAT_B_TO_BLK(byte_offset, sb);
- sec += exfat_cluster_to_sector(sbi, clu);
+ es->bh = es->__bh;
bh = sb_bread(sb, sec);
if (!bh)
- goto free_es;
+ return -EIO;
es->bh[es->num_bh++] = bh;
- ep = exfat_get_dentry_cached(es, 0);
+ ep = exfat_get_dentry_cached(es, ES_IDX_FILE);
if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode))
- goto free_es;
+ goto put_es;
num_entries = type == ES_ALL_ENTRIES ?
ep->dentry.file.num_ext + 1 : type;
es->num_entries = num_entries;
num_bh = EXFAT_B_TO_BLK_ROUND_UP(off + num_entries * DENTRY_SIZE, sb);
+ if (num_bh > ARRAY_SIZE(es->__bh)) {
+ es->bh = kmalloc_array(num_bh, sizeof(*es->bh), GFP_KERNEL);
+ if (!es->bh) {
+ brelse(bh);
+ return -ENOMEM;
+ }
+ es->bh[0] = bh;
+ }
+
for (i = 1; i < num_bh; i++) {
/* get the next sector */
if (exfat_is_last_sector_in_cluster(sbi, sec)) {
+ unsigned int clu = exfat_sector_to_cluster(sbi, sec);
+
if (p_dir->flags == ALLOC_NO_FAT_CHAIN)
clu++;
else if (exfat_get_next_cluster(sb, &clu))
- goto free_es;
+ goto put_es;
sec = exfat_cluster_to_sector(sbi, clu);
} else {
sec++;
@@ -880,21 +880,51 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb,
bh = sb_bread(sb, sec);
if (!bh)
- goto free_es;
+ goto put_es;
es->bh[es->num_bh++] = bh;
}
/* validate cached dentries */
- for (i = 1; i < num_entries; i++) {
+ for (i = ES_IDX_STREAM; i < num_entries; i++) {
ep = exfat_get_dentry_cached(es, i);
if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode))
- goto free_es;
+ goto put_es;
}
- return es;
+ return 0;
+
+put_es:
+ exfat_put_dentry_set(es, false);
+ return -EIO;
+}
-free_es:
- exfat_free_dentry_set(es, false);
- return NULL;
+static inline void exfat_reset_empty_hint(struct exfat_hint_femp *hint_femp)
+{
+ hint_femp->eidx = EXFAT_HINT_NONE;
+ hint_femp->count = 0;
+}
+
+static inline void exfat_set_empty_hint(struct exfat_inode_info *ei,
+ struct exfat_hint_femp *candi_empty, struct exfat_chain *clu,
+ int dentry, int num_entries, int entry_type)
+{
+ if (ei->hint_femp.eidx == EXFAT_HINT_NONE ||
+ ei->hint_femp.eidx > dentry) {
+ int total_entries = EXFAT_B_TO_DEN(i_size_read(&ei->vfs_inode));
+
+ if (candi_empty->count == 0) {
+ candi_empty->cur = *clu;
+ candi_empty->eidx = dentry;
+ }
+
+ if (entry_type == TYPE_UNUSED)
+ candi_empty->count += total_entries - dentry;
+ else
+ candi_empty->count++;
+
+ if (candi_empty->count == num_entries ||
+ candi_empty->count + candi_empty->eidx == total_entries)
+ ei->hint_femp = *candi_empty;
+ }
}
enum {
@@ -917,17 +947,21 @@ enum {
*/
int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei,
struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname,
- int num_entries, unsigned int type, struct exfat_hint *hint_opt)
+ struct exfat_hint *hint_opt)
{
int i, rewind = 0, dentry = 0, end_eidx = 0, num_ext = 0, len;
int order, step, name_len = 0;
- int dentries_per_clu, num_empty = 0;
+ int dentries_per_clu;
unsigned int entry_type;
unsigned short *uniname = NULL;
struct exfat_chain clu;
struct exfat_hint *hint_stat = &ei->hint_stat;
struct exfat_hint_femp candi_empty;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ int num_entries = exfat_calc_num_entries(p_uniname);
+
+ if (num_entries < 0)
+ return num_entries;
dentries_per_clu = sbi->dentries_per_clu;
@@ -939,10 +973,13 @@ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei,
end_eidx = dentry;
}
- candi_empty.eidx = EXFAT_HINT_NONE;
+ exfat_reset_empty_hint(&ei->hint_femp);
+
rewind:
order = 0;
step = DIRENT_STEP_FILE;
+ exfat_reset_empty_hint(&candi_empty);
+
while (clu.dir != EXFAT_EOF_CLUSTER) {
i = dentry & (dentries_per_clu - 1);
for (; i < dentries_per_clu; i++, dentry++) {
@@ -962,26 +999,9 @@ rewind:
entry_type == TYPE_DELETED) {
step = DIRENT_STEP_FILE;
- num_empty++;
- if (candi_empty.eidx == EXFAT_HINT_NONE &&
- num_empty == 1) {
- exfat_chain_set(&candi_empty.cur,
- clu.dir, clu.size, clu.flags);
- }
-
- if (candi_empty.eidx == EXFAT_HINT_NONE &&
- num_empty >= num_entries) {
- candi_empty.eidx =
- dentry - (num_empty - 1);
- WARN_ON(candi_empty.eidx < 0);
- candi_empty.count = num_empty;
-
- if (ei->hint_femp.eidx ==
- EXFAT_HINT_NONE ||
- candi_empty.eidx <=
- ei->hint_femp.eidx)
- ei->hint_femp = candi_empty;
- }
+ exfat_set_empty_hint(ei, &candi_empty, &clu,
+ dentry, num_entries,
+ entry_type);
brelse(bh);
if (entry_type == TYPE_UNUSED)
@@ -989,17 +1009,14 @@ rewind:
continue;
}
- num_empty = 0;
- candi_empty.eidx = EXFAT_HINT_NONE;
+ exfat_reset_empty_hint(&candi_empty);
if (entry_type == TYPE_FILE || entry_type == TYPE_DIR) {
step = DIRENT_STEP_FILE;
hint_opt->clu = clu.dir;
hint_opt->eidx = i;
- if (type == TYPE_ALL || type == entry_type) {
- num_ext = ep->dentry.file.num_ext;
- step = DIRENT_STEP_STRM;
- }
+ num_ext = ep->dentry.file.num_ext;
+ step = DIRENT_STEP_STRM;
brelse(bh);
continue;
}
@@ -1090,12 +1107,19 @@ not_found:
rewind = 1;
dentry = 0;
clu.dir = p_dir->dir;
- /* reset empty hint */
- num_empty = 0;
- candi_empty.eidx = EXFAT_HINT_NONE;
goto rewind;
}
+ /*
+ * set the EXFAT_EOF_CLUSTER flag to avoid search
+ * from the beginning again when allocated a new cluster
+ */
+ if (ei->hint_femp.eidx == EXFAT_HINT_NONE) {
+ ei->hint_femp.cur.dir = EXFAT_EOF_CLUSTER;
+ ei->hint_femp.eidx = p_dir->size * dentries_per_clu;
+ ei->hint_femp.count = 0;
+ }
+
/* initialized hint_stat */
hint_stat->clu = p_dir->dir;
hint_stat->eidx = 0;
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index a8f8eee4937c..bc6d21d7c5ad 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -9,6 +9,7 @@
#include <linux/fs.h>
#include <linux/ratelimit.h>
#include <linux/nls.h>
+#include <linux/blkdev.h>
#define EXFAT_ROOT_INO 1
@@ -41,6 +42,14 @@ enum {
#define ES_2_ENTRIES 2
#define ES_ALL_ENTRIES 0
+#define ES_IDX_FILE 0
+#define ES_IDX_STREAM 1
+#define ES_IDX_FIRST_FILENAME 2
+#define EXFAT_FILENAME_ENTRY_NUM(name_len) \
+ DIV_ROUND_UP(name_len, EXFAT_FILE_NAME_LEN)
+#define ES_IDX_LAST_FILENAME(name_len) \
+ (ES_IDX_FIRST_FILENAME + EXFAT_FILENAME_ENTRY_NUM(name_len) - 1)
+
#define DIR_DELETED 0xFFFF0321
/* type values */
@@ -62,15 +71,11 @@ enum {
#define TYPE_PADDING 0x0402
#define TYPE_ACLTAB 0x0403
#define TYPE_BENIGN_SEC 0x0800
-#define TYPE_ALL 0x0FFF
#define MAX_CHARSET_SIZE 6 /* max size of multi-byte character */
#define MAX_NAME_LENGTH 255 /* max len of file name excluding NULL */
#define MAX_VFSNAME_BUF_SIZE ((MAX_NAME_LENGTH + 1) * MAX_CHARSET_SIZE)
-/* Enough size to hold 256 dentry (even 512 Byte sector) */
-#define DIR_CACHE_SIZE (256*sizeof(struct exfat_dentry)/512+1)
-
#define EXFAT_HINT_NONE -1
#define EXFAT_MIN_SUBDIR 2
@@ -95,12 +100,18 @@ enum {
/*
* helpers for block size to dentry size conversion.
*/
-#define EXFAT_B_TO_DEN_IDX(b, sbi) \
- ((b) << ((sbi)->cluster_size_bits - DENTRY_SIZE_BITS))
#define EXFAT_B_TO_DEN(b) ((b) >> DENTRY_SIZE_BITS)
#define EXFAT_DEN_TO_B(b) ((b) << DENTRY_SIZE_BITS)
/*
+ * helpers for cluster size to dentry size conversion.
+ */
+#define EXFAT_CLU_TO_DEN(clu, sbi) \
+ ((clu) << ((sbi)->cluster_size_bits - DENTRY_SIZE_BITS))
+#define EXFAT_DEN_TO_CLU(dentry, sbi) \
+ ((dentry) >> ((sbi)->cluster_size_bits - DENTRY_SIZE_BITS))
+
+/*
* helpers for fat entry.
*/
#define FAT_ENT_SIZE (4)
@@ -125,6 +136,17 @@ enum {
#define BITS_PER_BYTE_MASK 0x7
#define IGNORED_BITS_REMAINED(clu, clu_base) ((1 << ((clu) - (clu_base))) - 1)
+#define ES_ENTRY_NUM(name_len) (ES_IDX_LAST_FILENAME(name_len) + 1)
+/* 19 entries = 1 file entry + 1 stream entry + 17 filename entries */
+#define ES_MAX_ENTRY_NUM ES_ENTRY_NUM(MAX_NAME_LENGTH)
+
+/*
+ * 19 entries x 32 bytes/entry = 608 bytes.
+ * The 608 bytes are in 3 sectors at most (even 512 Byte sector).
+ */
+#define DIR_CACHE_SIZE \
+ (DIV_ROUND_UP(EXFAT_DEN_TO_B(ES_MAX_ENTRY_NUM), SECTOR_SIZE) + 1)
+
struct exfat_dentry_namebuf {
char *lfn;
int lfnbuf_len; /* usually MAX_UNINAME_BUF_SIZE */
@@ -166,13 +188,16 @@ struct exfat_hint {
struct exfat_entry_set_cache {
struct super_block *sb;
- bool modified;
unsigned int start_off;
int num_bh;
- struct buffer_head *bh[DIR_CACHE_SIZE];
+ struct buffer_head *__bh[DIR_CACHE_SIZE];
+ struct buffer_head **bh;
unsigned int num_entries;
+ bool modified;
};
+#define IS_DYNAMIC_ES(es) ((es)->__bh != (es)->bh)
+
struct exfat_dir_entry {
struct exfat_chain dir;
int entry;
@@ -375,7 +400,7 @@ static inline sector_t exfat_cluster_to_sector(struct exfat_sb_info *sbi,
sbi->data_start_sector;
}
-static inline int exfat_sector_to_cluster(struct exfat_sb_info *sbi,
+static inline unsigned int exfat_sector_to_cluster(struct exfat_sb_info *sbi,
sector_t sec)
{
return ((sec - sbi->data_start_sector) >> sbi->sect_per_clus_bits) +
@@ -423,8 +448,8 @@ int exfat_trim_fs(struct inode *inode, struct fstrim_range *range);
/* file.c */
extern const struct file_operations exfat_file_operations;
-int __exfat_truncate(struct inode *inode, loff_t new_size);
-void exfat_truncate(struct inode *inode, loff_t size);
+int __exfat_truncate(struct inode *inode);
+void exfat_truncate(struct inode *inode);
int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
struct iattr *attr);
int exfat_getattr(struct user_namespace *mnt_userns, const struct path *path,
@@ -464,15 +489,16 @@ void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es);
int exfat_calc_num_entries(struct exfat_uni_name *p_uniname);
int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei,
struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname,
- int num_entries, unsigned int type, struct exfat_hint *hint_opt);
+ struct exfat_hint *hint_opt);
int exfat_alloc_new_dir(struct inode *inode, struct exfat_chain *clu);
struct exfat_dentry *exfat_get_dentry(struct super_block *sb,
struct exfat_chain *p_dir, int entry, struct buffer_head **bh);
struct exfat_dentry *exfat_get_dentry_cached(struct exfat_entry_set_cache *es,
int num);
-struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb,
- struct exfat_chain *p_dir, int entry, unsigned int type);
-int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync);
+int exfat_get_dentry_set(struct exfat_entry_set_cache *es,
+ struct super_block *sb, struct exfat_chain *p_dir, int entry,
+ unsigned int type);
+int exfat_put_dentry_set(struct exfat_entry_set_cache *es, int sync);
int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir);
/* inode.c */
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 4e0793f35e8f..f5b29072775d 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -93,7 +93,7 @@ static int exfat_sanitize_mode(const struct exfat_sb_info *sbi,
}
/* resize the file length */
-int __exfat_truncate(struct inode *inode, loff_t new_size)
+int __exfat_truncate(struct inode *inode)
{
unsigned int num_clusters_new, num_clusters_phys;
unsigned int last_clu = EXFAT_FREE_CLUSTER;
@@ -113,7 +113,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
exfat_chain_set(&clu, ei->start_clu, num_clusters_phys, ei->flags);
- if (new_size > 0) {
+ if (i_size_read(inode) > 0) {
/*
* Truncate FAT chain num_clusters after the first cluster
* num_clusters = min(new, phys);
@@ -143,8 +143,6 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
ei->start_clu = EXFAT_EOF_CLUSTER;
}
- i_size_write(inode, new_size);
-
if (ei->type == TYPE_FILE)
ei->attr |= ATTR_ARCHIVE;
@@ -189,7 +187,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
return 0;
}
-void exfat_truncate(struct inode *inode, loff_t size)
+void exfat_truncate(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
@@ -207,7 +205,7 @@ void exfat_truncate(struct inode *inode, loff_t size)
goto write_size;
}
- err = __exfat_truncate(inode, i_size_read(inode));
+ err = __exfat_truncate(inode);
if (err)
goto write_size;
@@ -310,7 +308,7 @@ int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
* __exfat_write_inode() is called from exfat_truncate(), inode
* is already written by it, so mark_inode_dirty() is unneeded.
*/
- exfat_truncate(inode, attr->ia_size);
+ exfat_truncate(inode);
up_write(&EXFAT_I(inode)->truncate_lock);
} else
mark_inode_dirty(inode);
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index a795437b86d0..5b644cb057fa 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -21,7 +21,7 @@ int __exfat_write_inode(struct inode *inode, int sync)
{
unsigned long long on_disk_size;
struct exfat_dentry *ep, *ep2;
- struct exfat_entry_set_cache *es = NULL;
+ struct exfat_entry_set_cache es;
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
@@ -42,11 +42,10 @@ int __exfat_write_inode(struct inode *inode, int sync)
exfat_set_volume_dirty(sb);
/* get the directory entry of given file or directory */
- es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES);
- if (!es)
+ if (exfat_get_dentry_set(&es, sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES))
return -EIO;
- ep = exfat_get_dentry_cached(es, 0);
- ep2 = exfat_get_dentry_cached(es, 1);
+ ep = exfat_get_dentry_cached(&es, ES_IDX_FILE);
+ ep2 = exfat_get_dentry_cached(&es, ES_IDX_STREAM);
ep->dentry.file.attr = cpu_to_le16(exfat_make_attr(inode));
@@ -83,8 +82,8 @@ int __exfat_write_inode(struct inode *inode, int sync)
ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER;
}
- exfat_update_dir_chksum_with_entry_set(es);
- return exfat_free_dentry_set(es, sync);
+ exfat_update_dir_chksum_with_entry_set(&es);
+ return exfat_put_dentry_set(&es, sync);
}
int exfat_write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -345,11 +344,6 @@ static void exfat_readahead(struct readahead_control *rac)
mpage_readahead(rac, exfat_get_block);
}
-static int exfat_writepage(struct page *page, struct writeback_control *wbc)
-{
- return block_write_full_page(page, exfat_get_block, wbc);
-}
-
static int exfat_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -363,7 +357,7 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to)
if (to > i_size_read(inode)) {
truncate_pagecache(inode, i_size_read(inode));
inode->i_mtime = inode->i_ctime = current_time(inode);
- exfat_truncate(inode, EXFAT_I(inode)->i_size_aligned);
+ exfat_truncate(inode);
}
}
@@ -473,12 +467,12 @@ static const struct address_space_operations exfat_aops = {
.invalidate_folio = block_invalidate_folio,
.read_folio = exfat_read_folio,
.readahead = exfat_readahead,
- .writepage = exfat_writepage,
.writepages = exfat_writepages,
.write_begin = exfat_write_begin,
.write_end = exfat_write_end,
.direct_IO = exfat_direct_IO,
- .bmap = exfat_aop_bmap
+ .bmap = exfat_aop_bmap,
+ .migrate_folio = buffer_migrate_folio,
};
static inline unsigned long exfat_hash(loff_t i_pos)
@@ -552,7 +546,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
inode->i_uid = sbi->options.fs_uid;
inode->i_gid = sbi->options.fs_gid;
inode_inc_iversion(inode);
- inode->i_generation = prandom_u32();
+ inode->i_generation = get_random_u32();
if (info->attr & ATTR_SUBDIR) { /* directory */
inode->i_generation &= ~1;
@@ -627,7 +621,7 @@ void exfat_evict_inode(struct inode *inode)
if (!inode->i_nlink) {
i_size_write(inode, 0);
mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock);
- __exfat_truncate(inode, 0);
+ __exfat_truncate(inode);
mutex_unlock(&EXFAT_SB(inode->i_sb)->s_lock);
}
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index b617bebc3d0f..5f995eba5dbb 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -224,11 +224,18 @@ static int exfat_search_empty_slot(struct super_block *sb,
if (hint_femp->eidx != EXFAT_HINT_NONE) {
dentry = hint_femp->eidx;
- if (num_entries <= hint_femp->count) {
- hint_femp->eidx = EXFAT_HINT_NONE;
- return dentry;
- }
+ /*
+ * If hint_femp->count is enough, it is needed to check if
+ * there are actual empty entries.
+ * Otherwise, and if "dentry + hint_famp->count" is also equal
+ * to "p_dir->size * dentries_per_clu", it means ENOSPC.
+ */
+ if (dentry + hint_femp->count == p_dir->size * dentries_per_clu &&
+ num_entries > hint_femp->count)
+ return -ENOSPC;
+
+ hint_femp->eidx = EXFAT_HINT_NONE;
exfat_chain_dup(&clu, &hint_femp->cur);
} else {
exfat_chain_dup(&clu, p_dir);
@@ -293,6 +300,12 @@ static int exfat_search_empty_slot(struct super_block *sb,
}
}
+ hint_femp->eidx = p_dir->size * dentries_per_clu - num_empty;
+ hint_femp->count = num_empty;
+ if (num_empty == 0)
+ exfat_chain_set(&hint_femp->cur, EXFAT_EOF_CLUSTER, 0,
+ clu.flags);
+
return -ENOSPC;
}
@@ -369,15 +382,11 @@ static int exfat_find_empty_entry(struct inode *inode,
if (exfat_ent_set(sb, last_clu, clu.dir))
return -EIO;
- if (hint_femp.eidx == EXFAT_HINT_NONE) {
- /* the special case that new dentry
- * should be allocated from the start of new cluster
- */
- hint_femp.eidx = EXFAT_B_TO_DEN_IDX(p_dir->size, sbi);
- hint_femp.count = sbi->dentries_per_clu;
-
+ if (hint_femp.cur.dir == EXFAT_EOF_CLUSTER)
exfat_chain_set(&hint_femp.cur, clu.dir, 0, clu.flags);
- }
+
+ hint_femp.count += sbi->dentries_per_clu;
+
hint_femp.cur.size++;
p_dir->size++;
size = EXFAT_CLU_TO_B(p_dir->size, sbi);
@@ -588,14 +597,14 @@ unlock:
static int exfat_find(struct inode *dir, struct qstr *qname,
struct exfat_dir_entry *info)
{
- int ret, dentry, num_entries, count;
+ int ret, dentry, count;
struct exfat_chain cdir;
struct exfat_uni_name uni_name;
struct super_block *sb = dir->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(dir);
struct exfat_dentry *ep, *ep2;
- struct exfat_entry_set_cache *es;
+ struct exfat_entry_set_cache es;
/* for optimized dir & entry to prevent long traverse of cluster chain */
struct exfat_hint hint_opt;
@@ -607,10 +616,6 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
if (ret)
return ret;
- num_entries = exfat_calc_num_entries(&uni_name);
- if (num_entries < 0)
- return num_entries;
-
/* check the validation of hint_stat and initialize it if required */
if (ei->version != (inode_peek_iversion_raw(dir) & 0xffffffff)) {
ei->hint_stat.clu = cdir.dir;
@@ -620,9 +625,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
}
/* search the file name for directories */
- dentry = exfat_find_dir_entry(sb, ei, &cdir, &uni_name,
- num_entries, TYPE_ALL, &hint_opt);
-
+ dentry = exfat_find_dir_entry(sb, ei, &cdir, &uni_name, &hint_opt);
if (dentry < 0)
return dentry; /* -error value */
@@ -635,11 +638,10 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
if (cdir.flags & ALLOC_NO_FAT_CHAIN)
cdir.size -= dentry / sbi->dentries_per_clu;
dentry = hint_opt.eidx;
- es = exfat_get_dentry_set(sb, &cdir, dentry, ES_2_ENTRIES);
- if (!es)
+ if (exfat_get_dentry_set(&es, sb, &cdir, dentry, ES_2_ENTRIES))
return -EIO;
- ep = exfat_get_dentry_cached(es, 0);
- ep2 = exfat_get_dentry_cached(es, 1);
+ ep = exfat_get_dentry_cached(&es, ES_IDX_FILE);
+ ep2 = exfat_get_dentry_cached(&es, ES_IDX_STREAM);
info->type = exfat_get_entry_type(ep);
info->attr = le16_to_cpu(ep->dentry.file.attr);
@@ -668,7 +670,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
ep->dentry.file.access_time,
ep->dentry.file.access_date,
0);
- exfat_free_dentry_set(es, false);
+ exfat_put_dentry_set(&es, false);
if (ei->start_clu == EXFAT_FREE_CLUSTER) {
exfat_fs_error(sb,
@@ -1167,7 +1169,7 @@ static int __exfat_rename(struct inode *old_parent_inode,
struct exfat_inode_info *new_ei = NULL;
unsigned int new_entry_type = TYPE_UNUSED;
int new_entry = 0;
- struct buffer_head *old_bh, *new_bh = NULL;
+ struct buffer_head *new_bh = NULL;
/* check the validity of pointer parameters */
if (new_path == NULL || strlen(new_path) == 0)
@@ -1183,13 +1185,6 @@ static int __exfat_rename(struct inode *old_parent_inode,
EXFAT_I(old_parent_inode)->flags);
dentry = ei->entry;
- ep = exfat_get_dentry(sb, &olddir, dentry, &old_bh);
- if (!ep) {
- ret = -EIO;
- goto out;
- }
- brelse(old_bh);
-
/* check whether new dir is existing directory and empty */
if (new_inode) {
ret = -EIO;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 3ef80d000e13..3204bd33e4e8 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -18,7 +18,7 @@
#include <linux/sched.h>
#include <linux/cred.h>
-#define dprintk(fmt, args...) do{}while(0)
+#define dprintk(fmt, args...) pr_debug(fmt, ##args)
static int get_name(const struct path *path, char *name, struct dentry *child);
@@ -132,8 +132,8 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
inode_unlock(dentry->d_inode);
if (IS_ERR(parent)) {
- dprintk("%s: get_parent of %ld failed, err %d\n",
- __func__, dentry->d_inode->i_ino, PTR_ERR(parent));
+ dprintk("get_parent of %lu failed, err %ld\n",
+ dentry->d_inode->i_ino, PTR_ERR(parent));
return parent;
}
@@ -147,7 +147,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
dprintk("%s: found name: %s\n", __func__, nbuf);
tmp = lookup_one_unlocked(mnt_user_ns(mnt), nbuf, parent, strlen(nbuf));
if (IS_ERR(tmp)) {
- dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp));
+ dprintk("lookup failed: %ld\n", PTR_ERR(tmp));
err = PTR_ERR(tmp);
goto out_err;
}
@@ -248,21 +248,20 @@ struct getdents_callback {
* A rather strange filldir function to capture
* the name matching the specified inode number.
*/
-static int filldir_one(struct dir_context *ctx, const char *name, int len,
+static bool filldir_one(struct dir_context *ctx, const char *name, int len,
loff_t pos, u64 ino, unsigned int d_type)
{
struct getdents_callback *buf =
container_of(ctx, struct getdents_callback, ctx);
- int result = 0;
buf->sequence++;
if (buf->ino == ino && len <= NAME_MAX) {
memcpy(buf->name, name, len);
buf->name[len] = '\0';
buf->found = 1;
- result = -1;
+ return false; // no more
}
- return result;
+ return true;
}
/**
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index bf298967c5b8..440d5f1e9d47 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -219,11 +219,12 @@ __ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
* inode->i_mutex: down
*/
int
-ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ext2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int error;
int update_mode = 0;
+ struct inode *inode = d_inode(dentry);
umode_t mode = inode->i_mode;
if (type == ACL_TYPE_ACCESS && acl) {
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index 925ab6287d35..3841becb94ff 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -56,7 +56,7 @@ static inline int ext2_acl_count(size_t size)
/* acl.c */
extern struct posix_acl *ext2_get_acl(struct inode *inode, int type, bool rcu);
-extern int ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+extern int ext2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
extern int ext2_init_acl (struct inode *, struct inode *);
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index c17ccc19b938..eca60b747c6b 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -126,6 +126,7 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group)
struct ext2_group_desc * desc;
struct buffer_head * bh = NULL;
ext2_fsblk_t bitmap_blk;
+ int ret;
desc = ext2_get_group_desc(sb, block_group, NULL);
if (!desc)
@@ -139,10 +140,10 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group)
block_group, le32_to_cpu(desc->bg_block_bitmap));
return NULL;
}
- if (likely(bh_uptodate_or_lock(bh)))
+ ret = bh_read(bh, 0);
+ if (ret > 0)
return bh;
-
- if (bh_submit_read(bh) < 0) {
+ if (ret < 0) {
brelse(bh);
ext2_error(sb, __func__,
"Cannot read block bitmap - "
@@ -666,7 +667,7 @@ ext2_try_to_allocate(struct super_block *sb, int group,
{
ext2_fsblk_t group_first_block = ext2_group_first_block_no(sb, group);
ext2_fsblk_t group_last_block = ext2_group_last_block_no(sb, group);
- ext2_grpblk_t start, end;
+ ext2_grpblk_t start, end;
unsigned long num = 0;
start = 0;
@@ -1480,11 +1481,11 @@ unsigned long ext2_count_free_blocks (struct super_block * sb)
desc_count, bitmap_count);
return bitmap_count;
#else
- for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
- desc = ext2_get_group_desc (sb, i, NULL);
- if (!desc)
- continue;
- desc_count += le16_to_cpu(desc->bg_free_blocks_count);
+ for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
+ desc = ext2_get_group_desc(sb, i, NULL);
+ if (!desc)
+ continue;
+ desc_count += le16_to_cpu(desc->bg_free_blocks_count);
}
return desc_count;
#endif
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 8f597753ac12..e5cbc27ba459 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -81,11 +81,10 @@ ext2_last_byte(struct inode *inode, unsigned long page_nr)
return last_byte;
}
-static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
+static void ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
{
struct address_space *mapping = page->mapping;
struct inode *dir = mapping->host;
- int err = 0;
inode_inc_iversion(dir);
block_write_end(NULL, mapping, pos, len, len, page, NULL);
@@ -94,16 +93,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
i_size_write(dir, pos+len);
mark_inode_dirty(dir);
}
-
- if (IS_DIRSYNC(dir)) {
- err = write_one_page(page);
- if (!err)
- err = sync_inode_metadata(dir, 1);
- } else {
- unlock_page(page);
- }
-
- return err;
+ unlock_page(page);
}
static bool ext2_check_page(struct page *page, int quiet, char *kaddr)
@@ -413,7 +403,7 @@ found:
return de;
}
-/**
+/*
* Return the '..' directory entry and the page in which the entry was found
* (as a parameter - p).
*
@@ -460,6 +450,17 @@ static int ext2_prepare_chunk(struct page *page, loff_t pos, unsigned len)
return __block_write_begin(page, pos, len, ext2_get_block);
}
+
+static int ext2_handle_dirsync(struct inode *dir)
+{
+ int err;
+
+ err = filemap_write_and_wait(dir->i_mapping);
+ if (!err)
+ err = sync_inode_metadata(dir, 1);
+ return err;
+}
+
void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
struct page *page, void *page_addr, struct inode *inode,
int update_times)
@@ -474,11 +475,12 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
BUG_ON(err);
de->inode = cpu_to_le32(inode->i_ino);
ext2_set_de_type(de, inode);
- err = ext2_commit_chunk(page, pos, len);
+ ext2_commit_chunk(page, pos, len);
if (update_times)
dir->i_mtime = dir->i_ctime = current_time(dir);
EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(dir);
+ ext2_handle_dirsync(dir);
}
/*
@@ -566,10 +568,11 @@ got_it:
memcpy(de->name, name, namelen);
de->inode = cpu_to_le32(inode->i_ino);
ext2_set_de_type (de, inode);
- err = ext2_commit_chunk(page, pos, rec_len);
+ ext2_commit_chunk(page, pos, rec_len);
dir->i_mtime = dir->i_ctime = current_time(dir);
EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(dir);
+ err = ext2_handle_dirsync(dir);
/* OFFSET_CACHE */
out_put:
ext2_put_page(page, page_addr);
@@ -615,10 +618,11 @@ int ext2_delete_entry (struct ext2_dir_entry_2 *dir, struct page *page,
if (pde)
pde->rec_len = ext2_rec_len_to_disk(to - from);
dir->inode = 0;
- err = ext2_commit_chunk(page, pos, to - from);
+ ext2_commit_chunk(page, pos, to - from);
inode->i_ctime = inode->i_mtime = current_time(inode);
EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(inode);
+ err = ext2_handle_dirsync(inode);
out:
return err;
}
@@ -658,7 +662,8 @@ int ext2_make_empty(struct inode *inode, struct inode *parent)
memcpy (de->name, "..\0", 4);
ext2_set_de_type (de, inode);
kunmap_atomic(kaddr);
- err = ext2_commit_chunk(page, 0, chunk_size);
+ ext2_commit_chunk(page, 0, chunk_size);
+ err = ext2_handle_dirsync(inode);
fail:
put_page(page);
return err;
@@ -679,7 +684,7 @@ int ext2_empty_dir (struct inode * inode)
page = ext2_get_page(inode, i, 0, &page_addr);
if (IS_ERR(page))
- goto not_empty;
+ return 0;
kaddr = page_addr;
de = (ext2_dirent *)kaddr;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index eb97aa3d700e..6b4bebe982ca 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -200,7 +200,7 @@ const struct inode_operations ext2_file_inode_operations = {
.listxattr = ext2_listxattr,
.getattr = ext2_getattr,
.setattr = ext2_setattr,
- .get_acl = ext2_get_acl,
+ .get_inode_acl = ext2_get_acl,
.set_acl = ext2_set_acl,
.fiemap = ext2_fiemap,
.fileattr_get = ext2_fileattr_get,
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 998dd2ac8008..78b8686d9a4a 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -277,8 +277,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
int best_ndir = inodes_per_group;
int best_group = -1;
- group = prandom_u32();
- parent_group = (unsigned)group % ngroups;
+ parent_group = get_random_u32_below(ngroups);
for (i = 0; i < ngroups; i++) {
group = (parent_group + i) % ngroups;
desc = ext2_get_group_desc (sb, group, NULL);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 918ab2f9e4c0..69aed9e2359e 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -869,11 +869,6 @@ int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return ret;
}
-static int ext2_writepage(struct page *page, struct writeback_control *wbc)
-{
- return block_write_full_page(page, ext2_get_block, wbc);
-}
-
static int ext2_read_folio(struct file *file, struct folio *folio)
{
return mpage_read_folio(folio, ext2_get_block);
@@ -948,7 +943,6 @@ const struct address_space_operations ext2_aops = {
.invalidate_folio = block_invalidate_folio,
.read_folio = ext2_read_folio,
.readahead = ext2_readahead,
- .writepage = ext2_writepage,
.write_begin = ext2_write_begin,
.write_end = ext2_write_end,
.bmap = ext2_bmap,
@@ -1652,7 +1646,7 @@ int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
}
setattr_copy(&init_user_ns, inode, iattr);
if (iattr->ia_valid & ATTR_MODE)
- error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
+ error = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode);
mark_inode_dirty(inode);
return error;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 5fd9a22d2b70..c056957221a2 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -120,7 +120,7 @@ static int ext2_create (struct user_namespace * mnt_userns,
}
static int ext2_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+ struct file *file, umode_t mode)
{
struct inode *inode = ext2_new_inode(dir, mode, NULL);
if (IS_ERR(inode))
@@ -128,9 +128,9 @@ static int ext2_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
ext2_set_file_ops(inode);
mark_inode_dirty(inode);
- d_tmpfile(dentry, inode);
+ d_tmpfile(file, inode);
unlock_new_inode(inode);
- return 0;
+ return finish_open_simple(file, 0);
}
static int ext2_mknod (struct user_namespace * mnt_userns, struct inode * dir,
@@ -427,7 +427,7 @@ const struct inode_operations ext2_dir_inode_operations = {
.listxattr = ext2_listxattr,
.getattr = ext2_getattr,
.setattr = ext2_setattr,
- .get_acl = ext2_get_acl,
+ .get_inode_acl = ext2_get_acl,
.set_acl = ext2_set_acl,
.tmpfile = ext2_tmpfile,
.fileattr_get = ext2_fileattr_get,
@@ -438,6 +438,6 @@ const struct inode_operations ext2_special_inode_operations = {
.listxattr = ext2_listxattr,
.getattr = ext2_getattr,
.setattr = ext2_setattr,
- .get_acl = ext2_get_acl,
+ .get_inode_acl = ext2_get_acl,
.set_acl = ext2_set_acl,
};
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 252c742379cf..69c88facfe90 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -163,7 +163,7 @@ static void ext2_put_super (struct super_block * sb)
db_count = sbi->s_gdb_count;
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
- kfree(sbi->s_group_desc);
+ kvfree(sbi->s_group_desc);
kfree(sbi->s_debts);
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -1052,6 +1052,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_blocks_per_group);
goto failed_mount;
}
+ /* At least inode table, bitmaps, and sb have to fit in one group */
+ if (sbi->s_blocks_per_group <= sbi->s_itb_per_group + 3) {
+ ext2_msg(sb, KERN_ERR,
+ "error: #blocks per group smaller than metadata size: %lu <= %lu",
+ sbi->s_blocks_per_group, sbi->s_inodes_per_group + 3);
+ goto failed_mount;
+ }
if (sbi->s_frags_per_group > sb->s_blocksize * 8) {
ext2_msg(sb, KERN_ERR,
"error: #fragments per group too big: %lu",
@@ -1065,9 +1072,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_inodes_per_group);
goto failed_mount;
}
+ if (sb_bdev_nr_blocks(sb) < le32_to_cpu(es->s_blocks_count)) {
+ ext2_msg(sb, KERN_ERR,
+ "bad geometry: block count %u exceeds size of device (%u blocks)",
+ le32_to_cpu(es->s_blocks_count),
+ (unsigned)sb_bdev_nr_blocks(sb));
+ goto failed_mount;
+ }
- if (EXT2_BLOCKS_PER_GROUP(sb) == 0)
- goto cantfind_ext2;
sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
le32_to_cpu(es->s_first_data_block) - 1)
/ EXT2_BLOCKS_PER_GROUP(sb)) + 1;
@@ -1080,7 +1092,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
}
db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) /
EXT2_DESC_PER_BLOCK(sb);
- sbi->s_group_desc = kmalloc_array(db_count,
+ sbi->s_group_desc = kvmalloc_array(db_count,
sizeof(struct buffer_head *),
GFP_KERNEL);
if (sbi->s_group_desc == NULL) {
@@ -1206,7 +1218,7 @@ failed_mount2:
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
failed_mount_group_desc:
- kfree(sbi->s_group_desc);
+ kvfree(sbi->s_group_desc);
kfree(sbi->s_debts);
failed_mount:
brelse(bh);
@@ -1636,7 +1648,7 @@ static int __init init_ext2_fs(void)
err = init_inodecache();
if (err)
return err;
- err = register_filesystem(&ext2_fs_type);
+ err = register_filesystem(&ext2_fs_type);
if (err)
goto out;
return 0;
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 57e82e25f8e2..a9f89539aeee 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -225,12 +225,13 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type,
}
int
-ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ext4_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
handle_t *handle;
int error, credits, retries = 0;
size_t acl_size = acl ? ext4_acl_size(acl->a_count) : 0;
+ struct inode *inode = d_inode(dentry);
umode_t mode = inode->i_mode;
int update_mode = 0;
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 3219669732bf..09c4a8a3b716 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -56,7 +56,7 @@ static inline int ext4_acl_count(size_t size)
/* acl.c */
struct posix_acl *ext4_get_acl(struct inode *inode, int type, bool rcu);
-int ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int ext4_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3bf9a6926798..140e1eb300d1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -558,7 +558,7 @@ enum {
*
* It's not paranoia if the Murphy's Law really *is* out to get you. :-)
*/
-#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
+#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1U << EXT4_INODE_##FLAG))
#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG))
static inline void ext4_check_flag_values(void)
@@ -2964,7 +2964,8 @@ int do_journal_get_write_access(handle_t *handle, struct inode *inode,
typedef enum {
EXT4_IGET_NORMAL = 0,
EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */
- EXT4_IGET_HANDLE = 0x0002 /* Inode # is from a handle */
+ EXT4_IGET_HANDLE = 0x0002, /* Inode # is from a handle */
+ EXT4_IGET_BAD = 0x0004 /* Allow to iget a bad inode */
} ext4_iget_flags;
extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
@@ -2977,6 +2978,7 @@ extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
extern int ext4_write_inode(struct inode *, struct writeback_control *);
extern int ext4_setattr(struct user_namespace *, struct dentry *,
struct iattr *);
+extern u32 ext4_dio_alignment(struct inode *inode);
extern int ext4_getattr(struct user_namespace *, const struct path *,
struct kstat *, u32, unsigned int);
extern void ext4_evict_inode(struct inode *);
@@ -2998,6 +3000,7 @@ extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
+extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
@@ -3591,9 +3594,6 @@ extern bool empty_inline_dir(struct inode *dir, int *has_inline_data);
extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
struct ext4_dir_entry_2 **parent_de,
int *retval);
-extern int ext4_inline_data_fiemap(struct inode *inode,
- struct fiemap_extent_info *fieinfo,
- int *has_inline, __u64 start, __u64 len);
extern void *ext4_read_inline_link(struct inode *inode);
struct iomap;
@@ -3621,8 +3621,8 @@ extern void ext4_initialize_dirent_tail(struct buffer_head *bh,
unsigned int blocksize);
extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
-extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name,
- struct inode *inode);
+extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
+ struct inode *inode, struct dentry *dentry);
extern int __ext4_link(struct inode *dir, struct inode *inode,
struct dentry *dentry);
@@ -3712,7 +3712,7 @@ extern int ext4_ext_insert_extent(handle_t *, struct inode *,
extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t,
struct ext4_ext_path **,
int flags);
-extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern void ext4_free_ext_path(struct ext4_ext_path *);
extern int ext4_ext_check_inode(struct inode *inode);
extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -3758,8 +3758,7 @@ extern void ext4_end_io_rsv_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
- int len,
- bool keep_towrite);
+ int len);
extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 8e1fb18f465e..77f318ec8abb 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -86,15 +86,21 @@ static int ext4_journal_check_start(struct super_block *sb)
return 0;
}
-handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+handle_t *__ext4_journal_start_sb(struct inode *inode,
+ struct super_block *sb, unsigned int line,
int type, int blocks, int rsv_blocks,
int revoke_creds)
{
journal_t *journal;
int err;
-
- trace_ext4_journal_start(sb, blocks, rsv_blocks, revoke_creds,
- _RET_IP_);
+ if (inode)
+ trace_ext4_journal_start_inode(inode, blocks, rsv_blocks,
+ revoke_creds, type,
+ _RET_IP_);
+ else
+ trace_ext4_journal_start_sb(sb, blocks, rsv_blocks,
+ revoke_creds, type,
+ _RET_IP_);
err = ext4_journal_check_start(sb);
if (err < 0)
return ERR_PTR(err);
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index db2ae4a2b38d..0c77697d5e90 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -261,9 +261,9 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
__ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
(bh))
-handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
- int type, int blocks, int rsv_blocks,
- int revoke_creds);
+handle_t *__ext4_journal_start_sb(struct inode *inode, struct super_block *sb,
+ unsigned int line, int type, int blocks,
+ int rsv_blocks, int revoke_creds);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -303,7 +303,7 @@ static inline int ext4_trans_default_revoke_credits(struct super_block *sb)
}
#define ext4_journal_start_sb(sb, type, nblocks) \
- __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0, \
+ __ext4_journal_start_sb(NULL, (sb), __LINE__, (type), (nblocks), 0,\
ext4_trans_default_revoke_credits(sb))
#define ext4_journal_start(inode, type, nblocks) \
@@ -323,7 +323,7 @@ static inline handle_t *__ext4_journal_start(struct inode *inode,
int blocks, int rsv_blocks,
int revoke_creds)
{
- return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
+ return __ext4_journal_start_sb(inode, inode->i_sb, line, type, blocks,
rsv_blocks, revoke_creds);
}
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 5235974126bd..9de1c9d1a13d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -106,6 +106,25 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
return 0;
}
+static void ext4_ext_drop_refs(struct ext4_ext_path *path)
+{
+ int depth, i;
+
+ if (!path)
+ return;
+ depth = path->p_depth;
+ for (i = 0; i <= depth; i++, path++) {
+ brelse(path->p_bh);
+ path->p_bh = NULL;
+ }
+}
+
+void ext4_free_ext_path(struct ext4_ext_path *path)
+{
+ ext4_ext_drop_refs(path);
+ kfree(path);
+}
+
/*
* Make sure 'handle' has at least 'check_cred' credits. If not, restart
* transaction with 'restart_cred' credits. The function drops i_data_sem
@@ -636,8 +655,7 @@ int ext4_ext_precache(struct inode *inode)
ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
out:
up_read(&ei->i_data_sem);
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return ret;
}
@@ -724,19 +742,6 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
#define ext4_ext_show_move(inode, path, newblock, level)
#endif
-void ext4_ext_drop_refs(struct ext4_ext_path *path)
-{
- int depth, i;
-
- if (!path)
- return;
- depth = path->p_depth;
- for (i = 0; i <= depth; i++, path++) {
- brelse(path->p_bh);
- path->p_bh = NULL;
- }
-}
-
/*
* ext4_ext_binsearch_idx:
* binary search for the closest index of the given block
@@ -955,8 +960,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
return path;
err:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
if (orig_path)
*orig_path = NULL;
return ERR_PTR(ret);
@@ -2174,8 +2178,7 @@ merge:
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
cleanup:
- ext4_ext_drop_refs(npath);
- kfree(npath);
+ ext4_free_ext_path(npath);
return err;
}
@@ -2632,9 +2635,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
unwritten, ex_ee_len);
path[depth].p_ext = ex;
- a = ex_ee_block > start ? ex_ee_block : start;
- b = ex_ee_block+ex_ee_len - 1 < end ?
- ex_ee_block+ex_ee_len - 1 : end;
+ a = max(ex_ee_block, start);
+ b = min(ex_ee_block + ex_ee_len - 1, end);
ext_debug(inode, " border %u:%u\n", a, b);
@@ -3061,8 +3063,7 @@ again:
}
}
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
path = NULL;
if (err == -EAGAIN)
goto again;
@@ -4375,8 +4376,7 @@ got_allocated_blocks:
allocated = map->m_len;
ext4_ext_show_leaf(inode, path);
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
trace_ext4_ext_map_blocks_exit(inode, flags, map,
err ? err : allocated);
@@ -5183,6 +5183,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
* and it is decreased till we reach start.
*/
again:
+ ret = 0;
if (SHIFT == SHIFT_LEFT)
iterator = &start;
else
@@ -5226,14 +5227,21 @@ again:
ext4_ext_get_actual_len(extent);
} else {
extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
- if (le32_to_cpu(extent->ee_block) > 0)
+ if (le32_to_cpu(extent->ee_block) > start)
*iterator = le32_to_cpu(extent->ee_block) - 1;
- else
- /* Beginning is reached, end of the loop */
+ else if (le32_to_cpu(extent->ee_block) == start)
iterator = NULL;
- /* Update path extent in case we need to stop */
- while (le32_to_cpu(extent->ee_block) < start)
+ else {
+ extent = EXT_LAST_EXTENT(path[depth].p_hdr);
+ while (le32_to_cpu(extent->ee_block) >= start)
+ extent--;
+
+ if (extent == EXT_LAST_EXTENT(path[depth].p_hdr))
+ break;
+
extent++;
+ iterator = NULL;
+ }
path[depth].p_ext = extent;
}
ret = ext4_ext_shift_path_extents(path, shift, inode,
@@ -5245,8 +5253,7 @@ again:
break;
}
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return ret;
}
@@ -5538,15 +5545,13 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
EXT4_GET_BLOCKS_METADATA_NOFAIL);
}
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
if (ret < 0) {
up_write(&EXT4_I(inode)->i_data_sem);
goto out_stop;
}
} else {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
}
ret = ext4_es_remove_extent(inode, offset_lblk,
@@ -5561,8 +5566,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
* ee_start_lblk to shift extents
*/
ret = ext4_ext_shift_extents(inode, handle,
- ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk,
- len_lblk, SHIFT_RIGHT);
+ max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT);
up_write(&EXT4_I(inode)->i_data_sem);
if (IS_SYNC(inode))
@@ -5766,10 +5770,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
count -= len;
repeat:
- ext4_ext_drop_refs(path1);
- kfree(path1);
- ext4_ext_drop_refs(path2);
- kfree(path2);
+ ext4_free_ext_path(path1);
+ ext4_free_ext_path(path2);
path1 = path2 = NULL;
}
return replaced_count;
@@ -5795,6 +5797,14 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
struct ext4_extent *extent;
ext4_lblk_t first_lblk, first_lclu, last_lclu;
+ /*
+ * if data can be stored inline, the logical cluster isn't
+ * mapped - no physical clusters have been allocated, and the
+ * file has no extents
+ */
+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
+ return 0;
+
/* search for the extent closest to the first block in the cluster */
path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
if (IS_ERR(path)) {
@@ -5848,8 +5858,7 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
}
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return err ? err : mapped;
}
@@ -5916,8 +5925,7 @@ int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
up_write(&EXT4_I(inode)->i_data_sem);
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
ext4_mark_inode_dirty(NULL, inode);
return ret;
}
@@ -5935,8 +5943,7 @@ void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end)
return;
ex = path[path->p_depth].p_ext;
if (!ex) {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
ext4_mark_inode_dirty(NULL, inode);
return;
}
@@ -5949,8 +5956,7 @@ void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end)
ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
up_write(&EXT4_I(inode)->i_data_sem);
ext4_mark_inode_dirty(NULL, inode);
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
}
}
@@ -5989,13 +5995,11 @@ int ext4_ext_replay_set_iblocks(struct inode *inode)
return PTR_ERR(path);
ex = path[path->p_depth].p_ext;
if (!ex) {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
goto out;
}
end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
/* Count the number of data blocks */
cur = 0;
@@ -6025,30 +6029,26 @@ int ext4_ext_replay_set_iblocks(struct inode *inode)
if (IS_ERR(path))
goto out;
numblks += path->p_depth;
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
while (cur < end) {
path = ext4_find_extent(inode, cur, NULL, 0);
if (IS_ERR(path))
break;
ex = path[path->p_depth].p_ext;
if (!ex) {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return 0;
}
cur = max(cur + 1, le32_to_cpu(ex->ee_block) +
ext4_ext_get_actual_len(ex));
ret = skip_hole(inode, &cur);
if (ret < 0) {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
break;
}
path2 = ext4_find_extent(inode, cur, NULL, 0);
if (IS_ERR(path2)) {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
break;
}
for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) {
@@ -6062,10 +6062,8 @@ int ext4_ext_replay_set_iblocks(struct inode *inode)
if (cmp1 != cmp2 && cmp2 != 0)
numblks++;
}
- ext4_ext_drop_refs(path);
- ext4_ext_drop_refs(path2);
- kfree(path);
- kfree(path2);
+ ext4_free_ext_path(path);
+ ext4_free_ext_path(path2);
}
out:
@@ -6092,13 +6090,11 @@ int ext4_ext_clear_bb(struct inode *inode)
return PTR_ERR(path);
ex = path[path->p_depth].p_ext;
if (!ex) {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return 0;
}
end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
cur = 0;
while (cur < end) {
@@ -6117,8 +6113,7 @@ int ext4_ext_clear_bb(struct inode *inode)
ext4_fc_record_regions(inode->i_sb, inode->i_ino,
0, path[j].p_block, 1, 1);
}
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
}
ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
ext4_fc_record_regions(inode->i_sb, inode->i_ino,
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 23167efda95e..7bc221038c6c 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -155,9 +155,7 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
int __init ext4_init_es(void)
{
- ext4_es_cachep = kmem_cache_create("ext4_extent_status",
- sizeof(struct extent_status),
- 0, (SLAB_RECLAIM_ACCOUNT), NULL);
+ ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
if (ext4_es_cachep == NULL)
return -ENOMEM;
return 0;
@@ -667,8 +665,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
}
}
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
}
static void ext4_es_insert_extent_ind_check(struct inode *inode,
@@ -1372,7 +1369,7 @@ retry:
if (count_reserved)
count_rsvd(inode, lblk, orig_es.es_len - len1 - len2,
&orig_es, &rc);
- goto out;
+ goto out_get_reserved;
}
if (len1 > 0) {
@@ -1414,6 +1411,7 @@ retry:
}
}
+out_get_reserved:
if (count_reserved)
*reserved = get_rsvd(inode, end, es, &rc);
out:
@@ -1808,9 +1806,7 @@ static void ext4_print_pending_tree(struct inode *inode)
int __init ext4_init_pending(void)
{
- ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation",
- sizeof(struct pending_reservation),
- 0, (SLAB_RECLAIM_ACCOUNT), NULL);
+ ext4_pending_cachep = KMEM_CACHE(pending_reservation, SLAB_RECLAIM_ACCOUNT);
if (ext4_pending_cachep == NULL)
return -ENOMEM;
return 0;
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 2af962cbb835..4594b62f147b 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -229,6 +229,12 @@ __releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
finish_wait(wq, &wait.wq_entry);
}
+static bool ext4_fc_disabled(struct super_block *sb)
+{
+ return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
+ (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
+}
+
/*
* Inform Ext4's fast about start of an inode update
*
@@ -240,8 +246,7 @@ void ext4_fc_start_update(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
+ if (ext4_fc_disabled(inode->i_sb))
return;
restart:
@@ -265,8 +270,7 @@ void ext4_fc_stop_update(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
+ if (ext4_fc_disabled(inode->i_sb))
return;
if (atomic_dec_and_test(&ei->i_fc_updates))
@@ -283,8 +287,7 @@ void ext4_fc_del(struct inode *inode)
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_fc_dentry_update *fc_dentry;
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
+ if (ext4_fc_disabled(inode->i_sb))
return;
restart:
@@ -337,8 +340,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl
struct ext4_sb_info *sbi = EXT4_SB(sb);
tid_t tid;
- if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
- (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
+ if (ext4_fc_disabled(sb))
return;
ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
@@ -418,25 +420,34 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
struct __track_dentry_update_args *dentry_update =
(struct __track_dentry_update_args *)arg;
struct dentry *dentry = dentry_update->dentry;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct inode *dir = dentry->d_parent->d_inode;
+ struct super_block *sb = inode->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
mutex_unlock(&ei->i_fc_lock);
+
+ if (IS_ENCRYPTED(dir)) {
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME,
+ NULL);
+ mutex_lock(&ei->i_fc_lock);
+ return -EOPNOTSUPP;
+ }
+
node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
if (!node) {
- ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL);
mutex_lock(&ei->i_fc_lock);
return -ENOMEM;
}
node->fcd_op = dentry_update->op;
- node->fcd_parent = dentry->d_parent->d_inode->i_ino;
+ node->fcd_parent = dir->i_ino;
node->fcd_ino = inode->i_ino;
if (dentry->d_name.len > DNAME_INLINE_LEN) {
node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
if (!node->fcd_name.name) {
kmem_cache_free(ext4_fc_dentry_cachep, node);
- ext4_fc_mark_ineligible(inode->i_sb,
- EXT4_FC_REASON_NOMEM, NULL);
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL);
mutex_lock(&ei->i_fc_lock);
return -ENOMEM;
}
@@ -493,10 +504,8 @@ void __ext4_fc_track_unlink(handle_t *handle,
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (sbi->s_mount_state & EXT4_FC_REPLAY))
+ if (ext4_fc_disabled(inode->i_sb))
return;
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
@@ -522,10 +531,8 @@ void __ext4_fc_track_link(handle_t *handle,
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (sbi->s_mount_state & EXT4_FC_REPLAY))
+ if (ext4_fc_disabled(inode->i_sb))
return;
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
@@ -551,10 +558,8 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (sbi->s_mount_state & EXT4_FC_REPLAY))
+ if (ext4_fc_disabled(inode->i_sb))
return;
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
@@ -576,22 +581,20 @@ static int __track_inode(struct inode *inode, void *arg, bool update)
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int ret;
if (S_ISDIR(inode->i_mode))
return;
+ if (ext4_fc_disabled(inode->i_sb))
+ return;
+
if (ext4_should_journal_data(inode)) {
ext4_fc_mark_ineligible(inode->i_sb,
EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
return;
}
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (sbi->s_mount_state & EXT4_FC_REPLAY))
- return;
-
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
return;
@@ -634,15 +637,13 @@ static int __track_range(struct inode *inode, void *arg, bool update)
void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
ext4_lblk_t end)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct __track_range_args args;
int ret;
if (S_ISDIR(inode->i_mode))
return;
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (sbi->s_mount_state & EXT4_FC_REPLAY))
+ if (ext4_fc_disabled(inode->i_sb))
return;
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
@@ -674,18 +675,6 @@ static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
/* Ext4 commit path routines */
-/* memzero and update CRC */
-static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
- u32 *crc)
-{
- void *ret;
-
- ret = memset(dst, 0, len);
- if (crc)
- *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
- return ret;
-}
-
/*
* Allocate len bytes on a fast commit buffer.
*
@@ -699,62 +688,60 @@ static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
*/
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
- struct ext4_fc_tl *tl;
+ struct ext4_fc_tl tl;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct buffer_head *bh;
int bsize = sbi->s_journal->j_blocksize;
int ret, off = sbi->s_fc_bytes % bsize;
- int pad_len;
+ int remaining;
+ u8 *dst;
/*
- * After allocating len, we should have space at least for a 0 byte
- * padding.
+ * If 'len' is too long to fit in any block alongside a PAD tlv, then we
+ * cannot fulfill the request.
*/
- if (len + sizeof(struct ext4_fc_tl) > bsize)
+ if (len > bsize - EXT4_FC_TAG_BASE_LEN)
return NULL;
- if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
- /*
- * Only allocate from current buffer if we have enough space for
- * this request AND we have space to add a zero byte padding.
- */
- if (!sbi->s_fc_bh) {
- ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
- if (ret)
- return NULL;
- sbi->s_fc_bh = bh;
- }
+ if (!sbi->s_fc_bh) {
+ ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
+ if (ret)
+ return NULL;
+ sbi->s_fc_bh = bh;
+ }
+ dst = sbi->s_fc_bh->b_data + off;
+
+ /*
+ * Allocate the bytes in the current block if we can do so while still
+ * leaving enough space for a PAD tlv.
+ */
+ remaining = bsize - EXT4_FC_TAG_BASE_LEN - off;
+ if (len <= remaining) {
sbi->s_fc_bytes += len;
- return sbi->s_fc_bh->b_data + off;
+ return dst;
}
- /* Need to add PAD tag */
- tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
- tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
- pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
- tl->fc_len = cpu_to_le16(pad_len);
- if (crc)
- *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
- if (pad_len > 0)
- ext4_fc_memzero(sb, tl + 1, pad_len, crc);
+
+ /*
+ * Else, terminate the current block with a PAD tlv, then allocate a new
+ * block and allocate the bytes at the start of that new block.
+ */
+
+ tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
+ tl.fc_len = cpu_to_le16(remaining);
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
+ memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
+ *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize);
+
ext4_fc_submit_bh(sb, false);
ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
if (ret)
return NULL;
sbi->s_fc_bh = bh;
- sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
+ sbi->s_fc_bytes += bsize - off + len;
return sbi->s_fc_bh->b_data;
}
-/* memcpy to fc reserved space and update CRC */
-static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
- int len, u32 *crc)
-{
- if (crc)
- *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
- return memcpy(dst, src, len);
-}
-
/*
* Complete a fast commit by writing tail tag.
*
@@ -775,23 +762,27 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
* ext4_fc_reserve_space takes care of allocating an extra block if
* there's no enough space on this block for accommodating this tail.
*/
- dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
+ dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc);
if (!dst)
return -ENOSPC;
off = sbi->s_fc_bytes % bsize;
tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
- tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
+ tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail));
sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
- ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
- dst += sizeof(tl);
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
+ dst += EXT4_FC_TAG_BASE_LEN;
tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
- ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
+ memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid));
dst += sizeof(tail.fc_tid);
+ crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data,
+ dst - (u8 *)sbi->s_fc_bh->b_data);
tail.fc_crc = cpu_to_le32(crc);
- ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
+ memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc));
+ dst += sizeof(tail.fc_crc);
+ memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */
ext4_fc_submit_bh(sb, true);
@@ -808,15 +799,15 @@ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
struct ext4_fc_tl tl;
u8 *dst;
- dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
+ dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc);
if (!dst)
return false;
tl.fc_tag = cpu_to_le16(tag);
tl.fc_len = cpu_to_le16(len);
- ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
- ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
+ memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len);
return true;
}
@@ -828,8 +819,8 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
struct ext4_fc_dentry_info fcd;
struct ext4_fc_tl tl;
int dlen = fc_dentry->fcd_name.len;
- u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
- crc);
+ u8 *dst = ext4_fc_reserve_space(sb,
+ EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc);
if (!dst)
return false;
@@ -838,11 +829,11 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
- ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
- dst += sizeof(tl);
- ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
+ dst += EXT4_FC_TAG_BASE_LEN;
+ memcpy(dst, &fcd, sizeof(fcd));
dst += sizeof(fcd);
- ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
+ memcpy(dst, fc_dentry->fcd_name.name, dlen);
return true;
}
@@ -874,22 +865,21 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
+ ret = -ECANCELED;
dst = ext4_fc_reserve_space(inode->i_sb,
- sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
+ EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc);
if (!dst)
- return -ECANCELED;
+ goto err;
- if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
- return -ECANCELED;
- dst += sizeof(tl);
- if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
- return -ECANCELED;
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
+ dst += EXT4_FC_TAG_BASE_LEN;
+ memcpy(dst, &fc_inode, sizeof(fc_inode));
dst += sizeof(fc_inode);
- if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
- inode_len, crc))
- return -ECANCELED;
-
- return 0;
+ memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len);
+ ret = 0;
+err:
+ brelse(iloc.bh);
+ return ret;
}
/*
@@ -991,7 +981,7 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal)
finish_wait(&ei->i_fc_wait, &wait);
}
spin_unlock(&sbi->s_fc_lock);
- ret = jbd2_submit_inode_data(ei->jinode);
+ ret = jbd2_submit_inode_data(journal, ei->jinode);
if (ret)
return ret;
spin_lock(&sbi->s_fc_lock);
@@ -1343,7 +1333,7 @@ struct dentry_info_args {
};
static inline void tl_to_darg(struct dentry_info_args *darg,
- struct ext4_fc_tl *tl, u8 *val)
+ struct ext4_fc_tl *tl, u8 *val)
{
struct ext4_fc_dentry_info fcd;
@@ -1352,8 +1342,14 @@ static inline void tl_to_darg(struct dentry_info_args *darg,
darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
darg->ino = le32_to_cpu(fcd.fc_ino);
darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
- darg->dname_len = le16_to_cpu(tl->fc_len) -
- sizeof(struct ext4_fc_dentry_info);
+ darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info);
+}
+
+static inline void ext4_fc_get_tl(struct ext4_fc_tl *tl, u8 *val)
+{
+ memcpy(tl, val, EXT4_FC_TAG_BASE_LEN);
+ tl->fc_len = le16_to_cpu(tl->fc_len);
+ tl->fc_tag = le16_to_cpu(tl->fc_tag);
}
/* Unlink replay function */
@@ -1387,7 +1383,7 @@ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
return 0;
}
- ret = __ext4_unlink(NULL, old_parent, &entry, inode);
+ ret = __ext4_unlink(old_parent, &entry, inode, NULL);
/* -ENOENT ok coz it might not exist anymore. */
if (ret == -ENOENT)
ret = 0;
@@ -1491,13 +1487,15 @@ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
if (state->fc_modified_inodes[i] == ino)
return 0;
if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
- state->fc_modified_inodes = krealloc(
- state->fc_modified_inodes,
+ int *fc_modified_inodes;
+
+ fc_modified_inodes = krealloc(state->fc_modified_inodes,
sizeof(int) * (state->fc_modified_inodes_size +
EXT4_FC_REPLAY_REALLOC_INCREMENT),
GFP_KERNEL);
- if (!state->fc_modified_inodes)
+ if (!fc_modified_inodes)
return -ENOMEM;
+ state->fc_modified_inodes = fc_modified_inodes;
state->fc_modified_inodes_size +=
EXT4_FC_REPLAY_REALLOC_INCREMENT;
}
@@ -1516,8 +1514,9 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
struct ext4_inode *raw_fc_inode;
struct inode *inode = NULL;
struct ext4_iloc iloc;
- int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
+ int inode_len, ino, ret, tag = tl->fc_tag;
struct ext4_extent_header *eh;
+ size_t off_gen = offsetof(struct ext4_inode, i_generation);
memcpy(&fc_inode, val, sizeof(fc_inode));
@@ -1541,12 +1540,12 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
if (ret)
goto out;
- inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
+ inode_len = tl->fc_len - sizeof(struct ext4_fc_inode);
raw_inode = ext4_raw_inode(&iloc);
memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
- memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
- inode_len - offsetof(struct ext4_inode, i_generation));
+ memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen,
+ inode_len - off_gen);
if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
if (eh->eh_magic != EXT4_EXT_MAGIC) {
@@ -1682,15 +1681,18 @@ int ext4_fc_record_regions(struct super_block *sb, int ino,
if (replay && state->fc_regions_used != state->fc_regions_valid)
state->fc_regions_used = state->fc_regions_valid;
if (state->fc_regions_used == state->fc_regions_size) {
+ struct ext4_fc_alloc_region *fc_regions;
+
+ fc_regions = krealloc(state->fc_regions,
+ sizeof(struct ext4_fc_alloc_region) *
+ (state->fc_regions_size +
+ EXT4_FC_REPLAY_REALLOC_INCREMENT),
+ GFP_KERNEL);
+ if (!fc_regions)
+ return -ENOMEM;
state->fc_regions_size +=
EXT4_FC_REPLAY_REALLOC_INCREMENT;
- state->fc_regions = krealloc(
- state->fc_regions,
- state->fc_regions_size *
- sizeof(struct ext4_fc_alloc_region),
- GFP_KERNEL);
- if (!state->fc_regions)
- return -ENOMEM;
+ state->fc_regions = fc_regions;
}
region = &state->fc_regions[state->fc_regions_used++];
region->ino = ino;
@@ -1770,8 +1772,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
ret = ext4_ext_insert_extent(
NULL, inode, &path, &newex, 0);
up_write((&EXT4_I(inode)->i_data_sem));
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
if (ret)
goto out;
goto next;
@@ -1926,8 +1927,7 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
for (j = 0; j < path->p_depth; j++)
ext4_mb_mark_bb(inode->i_sb,
path[j].p_block, 1, 1);
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
}
cur += ret;
ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
@@ -1972,6 +1972,33 @@ void ext4_fc_replay_cleanup(struct super_block *sb)
kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}
+static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi,
+ int tag, int len)
+{
+ switch (tag) {
+ case EXT4_FC_TAG_ADD_RANGE:
+ return len == sizeof(struct ext4_fc_add_range);
+ case EXT4_FC_TAG_DEL_RANGE:
+ return len == sizeof(struct ext4_fc_del_range);
+ case EXT4_FC_TAG_CREAT:
+ case EXT4_FC_TAG_LINK:
+ case EXT4_FC_TAG_UNLINK:
+ len -= sizeof(struct ext4_fc_dentry_info);
+ return len >= 1 && len <= EXT4_NAME_LEN;
+ case EXT4_FC_TAG_INODE:
+ len -= sizeof(struct ext4_fc_inode);
+ return len >= EXT4_GOOD_OLD_INODE_SIZE &&
+ len <= sbi->s_inode_size;
+ case EXT4_FC_TAG_PAD:
+ return true; /* padding can have any length */
+ case EXT4_FC_TAG_TAIL:
+ return len >= sizeof(struct ext4_fc_tail);
+ case EXT4_FC_TAG_HEAD:
+ return len == sizeof(struct ext4_fc_head);
+ }
+ return false;
+}
+
/*
* Recovery Scan phase handler
*
@@ -2007,7 +2034,7 @@ static int ext4_fc_replay_scan(journal_t *journal,
state = &sbi->s_fc_replay_state;
start = (u8 *)bh->b_data;
- end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
+ end = start + journal->j_blocksize;
if (state->fc_replay_expected_off == 0) {
state->fc_cur_tag = 0;
@@ -2028,12 +2055,19 @@ static int ext4_fc_replay_scan(journal_t *journal,
}
state->fc_replay_expected_off++;
- for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
- memcpy(&tl, cur, sizeof(tl));
- val = cur + sizeof(tl);
+ for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
+ cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
+ ext4_fc_get_tl(&tl, cur);
+ val = cur + EXT4_FC_TAG_BASE_LEN;
+ if (tl.fc_len > end - val ||
+ !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) {
+ ret = state->fc_replay_num_tags ?
+ JBD2_FC_REPLAY_STOP : -ECANCELED;
+ goto out_err;
+ }
ext4_debug("Scan phase, tag:%s, blk %lld\n",
- tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
- switch (le16_to_cpu(tl.fc_tag)) {
+ tag2str(tl.fc_tag), bh->b_blocknr);
+ switch (tl.fc_tag) {
case EXT4_FC_TAG_ADD_RANGE:
memcpy(&ext, val, sizeof(ext));
ex = (struct ext4_extent *)&ext.fc_ex;
@@ -2053,13 +2087,13 @@ static int ext4_fc_replay_scan(journal_t *journal,
case EXT4_FC_TAG_PAD:
state->fc_cur_tag++;
state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
- sizeof(tl) + le16_to_cpu(tl.fc_len));
+ EXT4_FC_TAG_BASE_LEN + tl.fc_len);
break;
case EXT4_FC_TAG_TAIL:
state->fc_cur_tag++;
memcpy(&tail, val, sizeof(tail));
state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
- sizeof(tl) +
+ EXT4_FC_TAG_BASE_LEN +
offsetof(struct ext4_fc_tail,
fc_crc));
if (le32_to_cpu(tail.fc_tid) == expected_tid &&
@@ -2086,7 +2120,7 @@ static int ext4_fc_replay_scan(journal_t *journal,
}
state->fc_cur_tag++;
state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
- sizeof(tl) + le16_to_cpu(tl.fc_len));
+ EXT4_FC_TAG_BASE_LEN + tl.fc_len);
break;
default:
ret = state->fc_replay_num_tags ?
@@ -2139,21 +2173,22 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
#endif
start = (u8 *)bh->b_data;
- end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
+ end = start + journal->j_blocksize;
- for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
- memcpy(&tl, cur, sizeof(tl));
- val = cur + sizeof(tl);
+ for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
+ cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
+ ext4_fc_get_tl(&tl, cur);
+ val = cur + EXT4_FC_TAG_BASE_LEN;
if (state->fc_replay_num_tags == 0) {
ret = JBD2_FC_REPLAY_STOP;
ext4_fc_set_bitmaps_and_counters(sb);
break;
}
- ext4_debug("Replay phase, tag:%s\n",
- tag2str(le16_to_cpu(tl.fc_tag)));
+
+ ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag));
state->fc_replay_num_tags--;
- switch (le16_to_cpu(tl.fc_tag)) {
+ switch (tl.fc_tag) {
case EXT4_FC_TAG_LINK:
ret = ext4_fc_replay_link(sb, &tl, val);
break;
@@ -2174,19 +2209,18 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
break;
case EXT4_FC_TAG_PAD:
trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
- le16_to_cpu(tl.fc_len), 0);
+ tl.fc_len, 0);
break;
case EXT4_FC_TAG_TAIL:
- trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
- le16_to_cpu(tl.fc_len), 0);
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL,
+ 0, tl.fc_len, 0);
memcpy(&tail, val, sizeof(tail));
WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
break;
case EXT4_FC_TAG_HEAD:
break;
default:
- trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
- le16_to_cpu(tl.fc_len), 0);
+ trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0);
ret = -ECANCELED;
break;
}
@@ -2210,17 +2244,17 @@ void ext4_fc_init(struct super_block *sb, journal_t *journal)
journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}
-static const char *fc_ineligible_reasons[] = {
- "Extended attributes changed",
- "Cross rename",
- "Journal flag changed",
- "Insufficient memory",
- "Swap boot",
- "Resize",
- "Dir renamed",
- "Falloc range op",
- "Data journalling",
- "FC Commit Failed"
+static const char * const fc_ineligible_reasons[] = {
+ [EXT4_FC_REASON_XATTR] = "Extended attributes changed",
+ [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename",
+ [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed",
+ [EXT4_FC_REASON_NOMEM] = "Insufficient memory",
+ [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot",
+ [EXT4_FC_REASON_RESIZE] = "Resize",
+ [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed",
+ [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
+ [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
+ [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
};
int ext4_fc_info_show(struct seq_file *seq, void *v)
diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h
index 1db12847a83b..2fadb2c4780c 100644
--- a/fs/ext4/fast_commit.h
+++ b/fs/ext4/fast_commit.h
@@ -58,7 +58,7 @@ struct ext4_fc_dentry_info {
__u8 fc_dname[];
};
-/* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */
+/* Value structure for EXT4_FC_TAG_INODE. */
struct ext4_fc_inode {
__le32 fc_ino;
__u8 fc_raw_inode[];
@@ -70,6 +70,9 @@ struct ext4_fc_tail {
__le32 fc_crc;
};
+/* Tag base length */
+#define EXT4_FC_TAG_BASE_LEN (sizeof(struct ext4_fc_tl))
+
/*
* Fast commit status codes
*/
@@ -93,6 +96,7 @@ enum {
EXT4_FC_REASON_RENAME_DIR,
EXT4_FC_REASON_FALLOC_RANGE,
EXT4_FC_REASON_INODE_JOURNAL_DATA,
+ EXT4_FC_REASON_ENCRYPTED_FILENAME,
EXT4_FC_REASON_MAX
};
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 109d07629f81..7ac0a81bd371 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -36,19 +36,34 @@
#include "acl.h"
#include "truncate.h"
-static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
+/*
+ * Returns %true if the given DIO request should be attempted with DIO, or
+ * %false if it should fall back to buffered I/O.
+ *
+ * DIO isn't well specified; when it's unsupported (either due to the request
+ * being misaligned, or due to the file not supporting DIO at all), filesystems
+ * either fall back to buffered I/O or return EINVAL. For files that don't use
+ * any special features like encryption or verity, ext4 has traditionally
+ * returned EINVAL for misaligned DIO. iomap_dio_rw() uses this convention too.
+ * In this case, we should attempt the DIO, *not* fall back to buffered I/O.
+ *
+ * In contrast, in cases where DIO is unsupported due to ext4 features, ext4
+ * traditionally falls back to buffered I/O.
+ *
+ * This function implements the traditional ext4 behavior in all these cases.
+ */
+static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter)
{
struct inode *inode = file_inode(iocb->ki_filp);
+ u32 dio_align = ext4_dio_alignment(inode);
- if (!fscrypt_dio_supported(iocb, iter))
- return false;
- if (fsverity_active(inode))
- return false;
- if (ext4_should_journal_data(inode))
+ if (dio_align == 0)
return false;
- if (ext4_has_inline_data(inode))
- return false;
- return true;
+
+ if (dio_align == 1)
+ return true;
+
+ return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align);
}
static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -63,7 +78,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
inode_lock_shared(inode);
}
- if (!ext4_dio_supported(iocb, to)) {
+ if (!ext4_should_use_dio(iocb, to)) {
inode_unlock_shared(inode);
/*
* Fallback to buffered I/O if the operation being performed on
@@ -511,7 +526,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
/* Fallback to buffered I/O if the inode does not support direct I/O. */
- if (!ext4_dio_supported(iocb, from)) {
+ if (!ext4_should_use_dio(iocb, from)) {
if (ilock_shared)
inode_unlock_shared(inode);
else
@@ -528,6 +543,12 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = -EAGAIN;
goto out;
}
+ /*
+ * Make sure inline data cannot be created anymore since we are going
+ * to allocate blocks for DIO. We know the inode does not have any
+ * inline data now because ext4_dio_supported() checked for that.
+ */
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
offset = iocb->ki_pos;
count = ret;
@@ -934,7 +955,7 @@ const struct inode_operations ext4_file_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_file_getattr,
.listxattr = ext4_listxattr,
- .get_acl = ext4_get_acl,
+ .get_inode_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
.fileattr_get = ext4_fileattr_get,
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 208b87ce8858..63f9bb6e8851 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -463,10 +463,9 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
hinfo.hash_version = DX_HASH_HALF_MD4;
hinfo.seed = sbi->s_hash_seed;
ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo);
- grp = hinfo.hash;
+ parent_group = hinfo.hash % ngroups;
} else
- grp = prandom_u32();
- parent_group = (unsigned)grp % ngroups;
+ parent_group = get_random_u32_below(ngroups);
for (i = 0; i < ngroups; i++) {
g = (parent_group + i) % ngroups;
get_orlov_stats(sb, g, flex_size, &stats);
@@ -871,7 +870,7 @@ static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode,
struct super_block *sb = dir->i_sb;
int nblocks = 0;
#ifdef CONFIG_EXT4_FS_POSIX_ACL
- struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT);
+ struct posix_acl *p = get_inode_acl(dir, ACL_TYPE_DEFAULT);
if (IS_ERR(p))
return PTR_ERR(p);
@@ -1077,8 +1076,8 @@ repeat_in_this_group:
if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) {
BUG_ON(nblocks <= 0);
- handle = __ext4_journal_start_sb(dir->i_sb, line_no,
- handle_type, nblocks, 0,
+ handle = __ext4_journal_start_sb(NULL, dir->i_sb,
+ line_no, handle_type, nblocks, 0,
ext4_trans_default_revoke_credits(sb));
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
@@ -1280,7 +1279,7 @@ got:
EXT4_GROUP_INFO_IBITMAP_CORRUPT);
goto out;
}
- inode->i_generation = prandom_u32();
+ inode->i_generation = get_random_u32();
/* Precompute checksum seed for inode metadata */
if (ext4_has_metadata_csum(sb)) {
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 860fc5119009..c68bebe7ff4b 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -148,6 +148,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
struct super_block *sb = inode->i_sb;
Indirect *p = chain;
struct buffer_head *bh;
+ unsigned int key;
int ret = -EIO;
*err = 0;
@@ -156,7 +157,13 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
if (!p->key)
goto no_block;
while (--depth) {
- bh = sb_getblk(sb, le32_to_cpu(p->key));
+ key = le32_to_cpu(p->key);
+ if (key > ext4_blocks_count(EXT4_SB(sb)->s_es)) {
+ /* the block was out of range */
+ ret = -EFSCORRUPTED;
+ goto failure;
+ }
+ bh = sb_getblk(sb, key);
if (unlikely(!bh)) {
ret = -ENOMEM;
goto failure;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index a4fbe825694b..2b42ececa46d 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -180,8 +180,7 @@ static int ext4_read_inline_data(struct inode *inode, void *buffer,
BUG_ON(len > EXT4_I(inode)->i_inline_size);
- cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ?
- len : EXT4_MIN_INLINE_DATA_SIZE;
+ cp_len = min_t(unsigned int, len, EXT4_MIN_INLINE_DATA_SIZE);
raw_inode = ext4_raw_inode(iloc);
memcpy(buffer, (void *)(raw_inode->i_block), cp_len);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 601214453c3a..9d9f414f99fe 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -222,13 +222,13 @@ void ext4_evict_inode(struct inode *inode)
/*
* For inodes with journalled data, transaction commit could have
- * dirtied the inode. Flush worker is ignoring it because of I_FREEING
- * flag but we still need to remove the inode from the writeback lists.
+ * dirtied the inode. And for inodes with dioread_nolock, unwritten
+ * extents converting worker could merge extents and also have dirtied
+ * the inode. Flush worker is ignoring it because of I_FREEING flag but
+ * we still need to remove the inode from the writeback lists.
*/
- if (!list_empty_careful(&inode->i_io_list)) {
- WARN_ON_ONCE(!ext4_should_journal_data(inode));
+ if (!list_empty_careful(&inode->i_io_list))
inode_io_list_del(inode);
- }
/*
* Protect us against freezing - iput() caller didn't have to have any
@@ -335,6 +335,12 @@ stop_handle:
ext4_xattr_inode_array_free(ea_inode_array);
return;
no_delete:
+ /*
+ * Check out some where else accidentally dirty the evicting inode,
+ * which may probably cause inode use-after-free issues later.
+ */
+ WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list));
+
if (!list_empty(&EXT4_I(inode)->i_fc_list))
ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
@@ -1188,6 +1194,13 @@ retry_grab:
page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
+ /*
+ * The same as page allocation, we prealloc buffer heads before
+ * starting the handle.
+ */
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, inode->i_sb->s_blocksize, 0);
+
unlock_page(page);
retry_journal:
@@ -1302,7 +1315,8 @@ static int ext4_write_end(struct file *file,
trace_ext4_write_end(inode, pos, len, copied);
- if (ext4_has_inline_data(inode))
+ if (ext4_has_inline_data(inode) &&
+ ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
return ext4_write_inline_data_end(inode, pos, len, copied, page);
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
@@ -1536,9 +1550,12 @@ void ext4_da_release_space(struct inode *inode, int to_free)
*/
struct mpage_da_data {
+ /* These are input fields for ext4_do_writepages() */
struct inode *inode;
struct writeback_control *wbc;
+ unsigned int can_map:1; /* Can writepages call map blocks? */
+ /* These are internal state of ext4_do_writepages() */
pgoff_t first_page; /* The first page to write */
pgoff_t next_page; /* Current page to examine */
pgoff_t last_page; /* Last page to examine */
@@ -2002,7 +2019,6 @@ static int ext4_writepage(struct page *page,
struct buffer_head *page_bufs = NULL;
struct inode *inode = page->mapping->host;
struct ext4_io_submit io_submit;
- bool keep_towrite = false;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
folio_invalidate(folio, 0, folio_size(folio));
@@ -2060,7 +2076,6 @@ static int ext4_writepage(struct page *page,
unlock_page(page);
return 0;
}
- keep_towrite = true;
}
if (PageChecked(page) && ext4_should_journal_data(inode))
@@ -2077,7 +2092,7 @@ static int ext4_writepage(struct page *page,
unlock_page(page);
return -ENOMEM;
}
- ret = ext4_bio_write_page(&io_submit, page, len, keep_towrite);
+ ret = ext4_bio_write_page(&io_submit, page, len);
ext4_io_submit(&io_submit);
/* Drop io_end reference we got from init */
ext4_put_io_end_defer(io_submit.io_end);
@@ -2111,7 +2126,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
len = size & ~PAGE_MASK;
else
len = PAGE_SIZE;
- err = ext4_bio_write_page(&mpd->io_submit, page, len, false);
+ err = ext4_bio_write_page(&mpd->io_submit, page, len);
if (!err)
mpd->wbc->nr_to_write--;
mpd->first_page++;
@@ -2544,18 +2559,33 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
}
+/* Return true if the page needs to be written as part of transaction commit */
+static bool ext4_page_nomap_can_writeout(struct page *page)
+{
+ struct buffer_head *bh, *head;
+
+ bh = head = page_buffers(page);
+ do {
+ if (buffer_dirty(bh) && buffer_mapped(bh) && !buffer_delay(bh))
+ return true;
+ } while ((bh = bh->b_this_page) != head);
+ return false;
+}
+
/*
* mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
- * and underlying extent to map
+ * needing mapping, submit mapped pages
*
* @mpd - where to look for pages
*
* Walk dirty pages in the mapping. If they are fully mapped, submit them for
- * IO immediately. When we find a page which isn't mapped we start accumulating
- * extent of buffers underlying these pages that needs mapping (formed by
- * either delayed or unwritten buffers). We also lock the pages containing
- * these buffers. The extent found is returned in @mpd structure (starting at
- * mpd->lblk with length mpd->len blocks).
+ * IO immediately. If we cannot map blocks, we submit just already mapped
+ * buffers in the page for IO and keep page dirty. When we can map blocks and
+ * we find a page which isn't mapped we start accumulating extent of buffers
+ * underlying these pages that needs mapping (formed by either delayed or
+ * unwritten buffers). We also lock the pages containing these buffers. The
+ * extent found is returned in @mpd structure (starting at mpd->lblk with
+ * length mpd->len blocks).
*
* Note that this function can attach bios to one io_end structure which are
* neither logically nor physically contiguous. Although it may seem as an
@@ -2646,14 +2676,30 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
if (mpd->map.m_len == 0)
mpd->first_page = page->index;
mpd->next_page = page->index + 1;
- /* Add all dirty buffers to mpd */
- lblk = ((ext4_lblk_t)page->index) <<
- (PAGE_SHIFT - blkbits);
- head = page_buffers(page);
- err = mpage_process_page_bufs(mpd, head, head, lblk);
- if (err <= 0)
- goto out;
- err = 0;
+ /*
+ * Writeout for transaction commit where we cannot
+ * modify metadata is simple. Just submit the page.
+ */
+ if (!mpd->can_map) {
+ if (ext4_page_nomap_can_writeout(page)) {
+ err = mpage_submit_page(mpd, page);
+ if (err < 0)
+ goto out;
+ } else {
+ unlock_page(page);
+ mpd->first_page++;
+ }
+ } else {
+ /* Add all dirty buffers to mpd */
+ lblk = ((ext4_lblk_t)page->index) <<
+ (PAGE_SHIFT - blkbits);
+ head = page_buffers(page);
+ err = mpage_process_page_bufs(mpd, head, head,
+ lblk);
+ if (err <= 0)
+ goto out;
+ err = 0;
+ }
left--;
}
pagevec_release(&pvec);
@@ -2666,25 +2712,27 @@ out:
return err;
}
-static int ext4_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+static int ext4_writepage_cb(struct page *page, struct writeback_control *wbc,
+ void *data)
{
+ return ext4_writepage(page, wbc);
+}
+
+static int ext4_do_writepages(struct mpage_da_data *mpd)
+{
+ struct writeback_control *wbc = mpd->wbc;
pgoff_t writeback_index = 0;
long nr_to_write = wbc->nr_to_write;
int range_whole = 0;
int cycled = 1;
handle_t *handle = NULL;
- struct mpage_da_data mpd;
- struct inode *inode = mapping->host;
+ struct inode *inode = mpd->inode;
+ struct address_space *mapping = inode->i_mapping;
int needed_blocks, rsv_blocks = 0, ret = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
struct blk_plug plug;
bool give_up_on_write = false;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
- return -EIO;
-
- percpu_down_read(&sbi->s_writepages_rwsem);
trace_ext4_writepages(inode, wbc);
/*
@@ -2696,7 +2744,9 @@ static int ext4_writepages(struct address_space *mapping,
goto out_writepages;
if (ext4_should_journal_data(inode)) {
- ret = generic_writepages(mapping, wbc);
+ blk_start_plug(&plug);
+ ret = write_cache_pages(mapping, wbc, ext4_writepage_cb, NULL);
+ blk_finish_plug(&plug);
goto out_writepages;
}
@@ -2750,19 +2800,18 @@ static int ext4_writepages(struct address_space *mapping,
writeback_index = mapping->writeback_index;
if (writeback_index)
cycled = 0;
- mpd.first_page = writeback_index;
- mpd.last_page = -1;
+ mpd->first_page = writeback_index;
+ mpd->last_page = -1;
} else {
- mpd.first_page = wbc->range_start >> PAGE_SHIFT;
- mpd.last_page = wbc->range_end >> PAGE_SHIFT;
+ mpd->first_page = wbc->range_start >> PAGE_SHIFT;
+ mpd->last_page = wbc->range_end >> PAGE_SHIFT;
}
- mpd.inode = inode;
- mpd.wbc = wbc;
- ext4_io_submit_init(&mpd.io_submit, wbc);
+ ext4_io_submit_init(&mpd->io_submit, wbc);
retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
+ tag_pages_for_writeback(mapping, mpd->first_page,
+ mpd->last_page);
blk_start_plug(&plug);
/*
@@ -2771,31 +2820,32 @@ retry:
* in the block layer on device congestion while having transaction
* started.
*/
- mpd.do_map = 0;
- mpd.scanned_until_end = 0;
- mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
- if (!mpd.io_submit.io_end) {
+ mpd->do_map = 0;
+ mpd->scanned_until_end = 0;
+ mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
+ if (!mpd->io_submit.io_end) {
ret = -ENOMEM;
goto unplug;
}
- ret = mpage_prepare_extent_to_map(&mpd);
+ ret = mpage_prepare_extent_to_map(mpd);
/* Unlock pages we didn't use */
- mpage_release_unused_pages(&mpd, false);
+ mpage_release_unused_pages(mpd, false);
/* Submit prepared bio */
- ext4_io_submit(&mpd.io_submit);
- ext4_put_io_end_defer(mpd.io_submit.io_end);
- mpd.io_submit.io_end = NULL;
+ ext4_io_submit(&mpd->io_submit);
+ ext4_put_io_end_defer(mpd->io_submit.io_end);
+ mpd->io_submit.io_end = NULL;
if (ret < 0)
goto unplug;
- while (!mpd.scanned_until_end && wbc->nr_to_write > 0) {
+ while (!mpd->scanned_until_end && wbc->nr_to_write > 0) {
/* For each extent of pages we use new io_end */
- mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
- if (!mpd.io_submit.io_end) {
+ mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
+ if (!mpd->io_submit.io_end) {
ret = -ENOMEM;
break;
}
+ WARN_ON_ONCE(!mpd->can_map);
/*
* We have two constraints: We find one extent to map and we
* must always write out whole page (makes a difference when
@@ -2815,16 +2865,16 @@ retry:
"%ld pages, ino %lu; err %d", __func__,
wbc->nr_to_write, inode->i_ino, ret);
/* Release allocated io_end */
- ext4_put_io_end(mpd.io_submit.io_end);
- mpd.io_submit.io_end = NULL;
+ ext4_put_io_end(mpd->io_submit.io_end);
+ mpd->io_submit.io_end = NULL;
break;
}
- mpd.do_map = 1;
+ mpd->do_map = 1;
- trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
- ret = mpage_prepare_extent_to_map(&mpd);
- if (!ret && mpd.map.m_len)
- ret = mpage_map_and_submit_extent(handle, &mpd,
+ trace_ext4_da_write_pages(inode, mpd->first_page, wbc);
+ ret = mpage_prepare_extent_to_map(mpd);
+ if (!ret && mpd->map.m_len)
+ ret = mpage_map_and_submit_extent(handle, mpd,
&give_up_on_write);
/*
* Caution: If the handle is synchronous,
@@ -2839,12 +2889,12 @@ retry:
if (!ext4_handle_valid(handle) || handle->h_sync == 0) {
ext4_journal_stop(handle);
handle = NULL;
- mpd.do_map = 0;
+ mpd->do_map = 0;
}
/* Unlock pages we didn't use */
- mpage_release_unused_pages(&mpd, give_up_on_write);
+ mpage_release_unused_pages(mpd, give_up_on_write);
/* Submit prepared bio */
- ext4_io_submit(&mpd.io_submit);
+ ext4_io_submit(&mpd->io_submit);
/*
* Drop our io_end reference we got from init. We have
@@ -2854,11 +2904,11 @@ retry:
* up doing unwritten extent conversion.
*/
if (handle) {
- ext4_put_io_end_defer(mpd.io_submit.io_end);
+ ext4_put_io_end_defer(mpd->io_submit.io_end);
ext4_journal_stop(handle);
} else
- ext4_put_io_end(mpd.io_submit.io_end);
- mpd.io_submit.io_end = NULL;
+ ext4_put_io_end(mpd->io_submit.io_end);
+ mpd->io_submit.io_end = NULL;
if (ret == -ENOSPC && sbi->s_journal) {
/*
@@ -2878,8 +2928,8 @@ unplug:
blk_finish_plug(&plug);
if (!ret && !cycled && wbc->nr_to_write > 0) {
cycled = 1;
- mpd.last_page = writeback_index - 1;
- mpd.first_page = 0;
+ mpd->last_page = writeback_index - 1;
+ mpd->first_page = 0;
goto retry;
}
@@ -2889,15 +2939,51 @@ unplug:
* Set the writeback_index so that range_cyclic
* mode will write it back later
*/
- mapping->writeback_index = mpd.first_page;
+ mapping->writeback_index = mpd->first_page;
out_writepages:
trace_ext4_writepages_result(inode, wbc, ret,
nr_to_write - wbc->nr_to_write);
- percpu_up_read(&sbi->s_writepages_rwsem);
return ret;
}
+static int ext4_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct super_block *sb = mapping->host->i_sb;
+ struct mpage_da_data mpd = {
+ .inode = mapping->host,
+ .wbc = wbc,
+ .can_map = 1,
+ };
+ int ret;
+
+ if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+ return -EIO;
+
+ percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem);
+ ret = ext4_do_writepages(&mpd);
+ percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem);
+
+ return ret;
+}
+
+int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode)
+{
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = LONG_MAX,
+ .range_start = jinode->i_dirty_start,
+ .range_end = jinode->i_dirty_end,
+ };
+ struct mpage_da_data mpd = {
+ .inode = jinode->i_vfs_inode,
+ .wbc = &wbc,
+ .can_map = 0,
+ };
+ return ext4_do_writepages(&mpd);
+}
+
static int ext4_dax_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -3639,7 +3725,6 @@ static int ext4_iomap_swap_activate(struct swap_info_struct *sis,
static const struct address_space_operations ext4_aops = {
.read_folio = ext4_read_folio,
.readahead = ext4_readahead,
- .writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_write_end,
@@ -3657,7 +3742,6 @@ static const struct address_space_operations ext4_aops = {
static const struct address_space_operations ext4_journalled_aops = {
.read_folio = ext4_read_folio,
.readahead = ext4_readahead,
- .writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_journalled_write_end,
@@ -3666,6 +3750,7 @@ static const struct address_space_operations ext4_journalled_aops = {
.invalidate_folio = ext4_journalled_invalidate_folio,
.release_folio = ext4_release_folio,
.direct_IO = noop_direct_IO,
+ .migrate_folio = buffer_migrate_folio_norefs,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
.swap_activate = ext4_iomap_swap_activate,
@@ -3674,7 +3759,6 @@ static const struct address_space_operations ext4_journalled_aops = {
static const struct address_space_operations ext4_da_aops = {
.read_folio = ext4_read_folio,
.readahead = ext4_readahead,
- .writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_da_write_begin,
.write_end = ext4_da_write_end,
@@ -4218,7 +4302,8 @@ int ext4_truncate(struct inode *inode)
/* If we zero-out tail of the page, we have to create jinode for jbd2 */
if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
- if (ext4_inode_attach_jinode(inode) < 0)
+ err = ext4_inode_attach_jinode(inode);
+ if (err)
goto out_trace;
}
@@ -4466,9 +4551,17 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
inode_offset = ((ino - 1) %
EXT4_INODES_PER_GROUP(sb));
- block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
+ block = ext4_inode_table(sb, gdp);
+ if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) ||
+ (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) {
+ ext4_error(sb, "Invalid inode table block %llu in "
+ "block_group %u", block, iloc->block_group);
+ return -EFSCORRUPTED;
+ }
+ block += (inode_offset / inodes_per_block);
+
bh = sb_getblk(sb, block);
if (unlikely(!bh))
return -ENOMEM;
@@ -5037,8 +5130,14 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb))
ext4_error_inode(inode, function, line, 0,
"casefold flag without casefold feature");
- brelse(iloc.bh);
+ if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) {
+ ext4_error_inode(inode, function, line, 0,
+ "bad inode without EXT4_IGET_BAD flag");
+ ret = -EUCLEAN;
+ goto bad_inode;
+ }
+ brelse(iloc.bh);
unlock_new_inode(inode);
return inode;
@@ -5342,6 +5441,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
int error, rc = 0;
int orphan = 0;
const unsigned int ia_valid = attr->ia_valid;
+ bool inc_ivers = true;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
@@ -5425,8 +5525,8 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return -EINVAL;
}
- if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
- inode_inc_iversion(inode);
+ if (attr->ia_size == inode->i_size)
+ inc_ivers = false;
if (shrink) {
if (ext4_should_order_data(inode)) {
@@ -5528,6 +5628,8 @@ out_mmap_sem:
}
if (!error) {
+ if (inc_ivers)
+ inode_inc_iversion(inode);
setattr_copy(mnt_userns, inode, attr);
mark_inode_dirty(inode);
}
@@ -5540,7 +5642,7 @@ out_mmap_sem:
ext4_orphan_del(NULL, inode);
if (!error && (ia_valid & ATTR_MODE))
- rc = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
+ rc = posix_acl_chmod(mnt_userns, dentry, inode->i_mode);
err_out:
if (error)
@@ -5550,6 +5652,22 @@ err_out:
return error;
}
+u32 ext4_dio_alignment(struct inode *inode)
+{
+ if (fsverity_active(inode))
+ return 0;
+ if (ext4_should_journal_data(inode))
+ return 0;
+ if (ext4_has_inline_data(inode))
+ return 0;
+ if (IS_ENCRYPTED(inode)) {
+ if (!fscrypt_dio_supported(inode))
+ return 0;
+ return i_blocksize(inode);
+ }
+ return 1; /* use the iomap defaults */
+}
+
int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
@@ -5565,6 +5683,27 @@ int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path,
stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
}
+ /*
+ * Return the DIO alignment restrictions if requested. We only return
+ * this information when requested, since on encrypted files it might
+ * take a fair bit of work to get if the file wasn't opened recently.
+ */
+ if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
+ u32 dio_align = ext4_dio_alignment(inode);
+
+ stat->result_mask |= STATX_DIOALIGN;
+ if (dio_align == 1) {
+ struct block_device *bdev = inode->i_sb->s_bdev;
+
+ /* iomap defaults */
+ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+ stat->dio_offset_align = bdev_logical_block_size(bdev);
+ } else {
+ stat->dio_mem_align = dio_align;
+ stat->dio_offset_align = dio_align;
+ }
+ }
+
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
if (flags & EXT4_APPEND_FL)
stat->attributes |= STATX_ATTR_APPEND;
@@ -5731,9 +5870,6 @@ int ext4_mark_iloc_dirty(handle_t *handle,
}
ext4_fc_track_inode(handle, inode);
- if (IS_I_VERSION(inode))
- inode_inc_iversion(inode);
-
/* the do_update_inode consumes one bh->b_count */
get_bh(iloc->bh);
@@ -5809,6 +5945,14 @@ static int __ext4_expand_extra_isize(struct inode *inode,
return 0;
}
+ /*
+ * We may need to allocate external xattr block so we need quotas
+ * initialized. Here we can be called with various locks held so we
+ * cannot affort to initialize quotas ourselves. So just bail.
+ */
+ if (dquot_initialize_needed(inode))
+ return -EAGAIN;
+
/* try to expand with EAs present */
error = ext4_expand_extra_isize_ea(inode, new_extra_isize,
raw_inode, handle);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 3cf3ec4b1c21..8067ccda34e4 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -145,9 +145,8 @@ static int ext4_update_backup_sb(struct super_block *sb,
if (ext4_has_metadata_csum(sb) &&
es->s_checksum != ext4_superblock_csum(sb, es)) {
ext4_msg(sb, KERN_ERR, "Invalid checksum for backup "
- "superblock %llu\n", sb_block);
+ "superblock %llu", sb_block);
unlock_buffer(bh);
- err = -EFSBADCRC;
goto out_bh;
}
func(es, arg);
@@ -375,7 +374,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
blkcnt_t blocks;
unsigned short bytes;
- inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL);
+ inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO,
+ EXT4_IGET_SPECIAL | EXT4_IGET_BAD);
if (IS_ERR(inode_bl))
return PTR_ERR(inode_bl);
ei_bl = EXT4_I(inode_bl);
@@ -425,7 +425,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
/* Protect extent tree against block allocations via delalloc */
ext4_double_down_write_data_sem(inode, inode_bl);
- if (inode_bl->i_nlink == 0) {
+ if (is_bad_inode(inode_bl) || !S_ISREG(inode_bl->i_mode)) {
/* this inode has never been used as a BOOT_LOADER */
set_nlink(inode_bl, 1);
i_uid_write(inode_bl, 0);
@@ -452,9 +452,10 @@ static long swap_inode_boot_loader(struct super_block *sb,
swap_inode_data(inode, inode_bl);
inode->i_ctime = inode_bl->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
- inode->i_generation = prandom_u32();
- inode_bl->i_generation = prandom_u32();
+ inode->i_generation = get_random_u32();
+ inode_bl->i_generation = get_random_u32();
ext4_reset_inode_seed(inode);
ext4_reset_inode_seed(inode_bl);
@@ -665,6 +666,7 @@ static int ext4_ioctl_setflags(struct inode *inode,
ext4_set_inode_flags(inode, false);
inode->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
flags_err:
@@ -730,6 +732,10 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid)
if (ext4_is_quota_file(inode))
return err;
+ err = dquot_initialize(inode);
+ if (err)
+ return err;
+
err = ext4_get_inode_loc(inode, &iloc);
if (err)
return err;
@@ -745,10 +751,6 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid)
brelse(iloc.bh);
}
- err = dquot_initialize(inode);
- if (err)
- return err;
-
handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
EXT4_QUOTA_INIT_BLOCKS(sb) +
EXT4_QUOTA_DEL_BLOCKS(sb) + 3);
@@ -775,6 +777,7 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid)
EXT4_I(inode)->i_projid = kprojid;
inode->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
out_dirty:
rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
if (!err)
@@ -1060,9 +1063,6 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg)
if (!EXT4_SB(sb)->s_journal)
return -ENODEV;
- if (flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID)
- return -EINVAL;
-
if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
!bdev_max_discard_sectors(EXT4_SB(sb)->s_journal->j_dev))
return -EOPNOTSUPP;
@@ -1154,19 +1154,22 @@ static int ext4_ioctl_getuuid(struct ext4_sb_info *sbi,
if (fsuuid.fsu_len == 0) {
fsuuid.fsu_len = UUID_SIZE;
- if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid.fsu_len)))
+ if (copy_to_user(&ufsuuid->fsu_len, &fsuuid.fsu_len,
+ sizeof(fsuuid.fsu_len)))
return -EFAULT;
- return -EINVAL;
+ return 0;
}
- if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0)
+ if (fsuuid.fsu_len < UUID_SIZE || fsuuid.fsu_flags != 0)
return -EINVAL;
lock_buffer(sbi->s_sbh);
memcpy(uuid, sbi->s_es->s_uuid, UUID_SIZE);
unlock_buffer(sbi->s_sbh);
- if (copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE))
+ fsuuid.fsu_len = UUID_SIZE;
+ if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid)) ||
+ copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE))
return -EFAULT;
return 0;
}
@@ -1257,6 +1260,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
inode->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
inode->i_generation = generation;
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 71f5b67d7f28..5b2ae37a8b80 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -910,7 +910,7 @@ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac,
int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- struct ext4_group_info *grp, *iter;
+ struct ext4_group_info *grp = NULL, *iter;
int i;
if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) {
@@ -927,7 +927,6 @@ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac,
read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]);
continue;
}
- grp = NULL;
list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i],
bb_avg_fragment_size_node) {
if (sbi->s_mb_stats)
@@ -5205,7 +5204,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
mutex_lock(&ac->ac_lg->lg_mutex);
}
-static noinline_for_stack int
+static noinline_for_stack void
ext4_mb_initialize_context(struct ext4_allocation_context *ac,
struct ext4_allocation_request *ar)
{
@@ -5254,8 +5253,6 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
(unsigned) ar->lleft, (unsigned) ar->pleft,
(unsigned) ar->lright, (unsigned) ar->pright,
inode_is_open_for_write(ar->inode) ? "" : "non-");
- return 0;
-
}
static noinline_for_stack void
@@ -5592,11 +5589,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
goto out;
}
- *errp = ext4_mb_initialize_context(ac, ar);
- if (*errp) {
- ar->len = 0;
- goto out;
- }
+ ext4_mb_initialize_context(ac, ar);
ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
seq = this_cpu_read(discard_pa_seq);
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 54e7d3c95fd7..a19a9661646e 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -56,8 +56,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0);
err_out:
up_write((&EXT4_I(inode)->i_data_sem));
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
lb->first_pblock = 0;
return retval;
}
@@ -425,7 +424,8 @@ int ext4_ext_migrate(struct inode *inode)
* already is extent-based, error out.
*/
if (!ext4_has_feature_extents(inode->i_sb) ||
- (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
+ ext4_has_inline_data(inode))
return -EINVAL;
if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 9af68a7ecdcf..4681fff6665f 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -262,13 +262,7 @@ void ext4_stop_mmpd(struct ext4_sb_info *sbi)
*/
static unsigned int mmp_new_seq(void)
{
- u32 new_seq;
-
- do {
- new_seq = prandom_u32();
- } while (new_seq > EXT4_MMP_SEQ_MAX);
-
- return new_seq;
+ return get_random_u32_below(EXT4_MMP_SEQ_MAX + 1);
}
/*
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 701f1d6a217f..8dbb87edf24c 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -32,8 +32,7 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock,
if (IS_ERR(path))
return PTR_ERR(path);
if (path[ext_depth(inode)].p_ext == NULL) {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
*ppath = NULL;
return -ENODATA;
}
@@ -103,12 +102,10 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
if (unwritten != ext4_ext_is_unwritten(ext))
goto out;
from += ext4_ext_get_actual_len(ext);
- ext4_ext_drop_refs(path);
}
ret = 1;
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return ret;
}
@@ -256,6 +253,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
{
struct inode *orig_inode = file_inode(o_filp);
struct page *pagep[2] = {NULL, NULL};
+ struct folio *folio[2] = {NULL, NULL};
handle_t *handle;
ext4_lblk_t orig_blk_offset, donor_blk_offset;
unsigned long blocksize = orig_inode->i_sb->s_blocksize;
@@ -316,6 +314,13 @@ again:
* hold page's lock, if it is still the case data copy is not
* necessary, just swap data blocks between orig and donor.
*/
+ folio[0] = page_folio(pagep[0]);
+ folio[1] = page_folio(pagep[1]);
+
+ VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]);
+ VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]);
+ VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]);
+
if (unwritten) {
ext4_double_down_write_data_sem(orig_inode, donor_inode);
/* If any of extents in range became initialized we have to
@@ -334,10 +339,10 @@ again:
ext4_double_up_write_data_sem(orig_inode, donor_inode);
goto data_copy;
}
- if ((page_has_private(pagep[0]) &&
- !try_to_release_page(pagep[0], 0)) ||
- (page_has_private(pagep[1]) &&
- !try_to_release_page(pagep[1], 0))) {
+ if ((folio_has_private(folio[0]) &&
+ !filemap_release_folio(folio[0], 0)) ||
+ (folio_has_private(folio[1]) &&
+ !filemap_release_folio(folio[1], 0))) {
*err = -EBUSY;
goto drop_data_sem;
}
@@ -347,19 +352,21 @@ again:
block_len_in_page, 1, err);
drop_data_sem:
ext4_double_up_write_data_sem(orig_inode, donor_inode);
- goto unlock_pages;
+ goto unlock_folios;
}
data_copy:
- *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size);
+ *err = mext_page_mkuptodate(&folio[0]->page, from, from + replaced_size);
if (*err)
- goto unlock_pages;
+ goto unlock_folios;
/* At this point all buffers in range are uptodate, old mapping layout
* is no longer required, try to drop it now. */
- if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) ||
- (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) {
+ if ((folio_has_private(folio[0]) &&
+ !filemap_release_folio(folio[0], 0)) ||
+ (folio_has_private(folio[1]) &&
+ !filemap_release_folio(folio[1], 0))) {
*err = -EBUSY;
- goto unlock_pages;
+ goto unlock_folios;
}
ext4_double_down_write_data_sem(orig_inode, donor_inode);
replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode,
@@ -372,13 +379,13 @@ data_copy:
replaced_size =
block_len_in_page << orig_inode->i_blkbits;
} else
- goto unlock_pages;
+ goto unlock_folios;
}
/* Perform all necessary steps similar write_begin()/write_end()
* but keeping in mind that i_size will not change */
- if (!page_has_buffers(pagep[0]))
- create_empty_buffers(pagep[0], 1 << orig_inode->i_blkbits, 0);
- bh = page_buffers(pagep[0]);
+ if (!folio_buffers(folio[0]))
+ create_empty_buffers(&folio[0]->page, 1 << orig_inode->i_blkbits, 0);
+ bh = folio_buffers(folio[0]);
for (i = 0; i < data_offset_in_page; i++)
bh = bh->b_this_page;
for (i = 0; i < block_len_in_page; i++) {
@@ -388,7 +395,7 @@ data_copy:
bh = bh->b_this_page;
}
if (!*err)
- *err = block_commit_write(pagep[0], from, from + replaced_size);
+ *err = block_commit_write(&folio[0]->page, from, from + replaced_size);
if (unlikely(*err < 0))
goto repair_branches;
@@ -398,11 +405,11 @@ data_copy:
*err = ext4_jbd2_inode_add_write(handle, orig_inode,
(loff_t)orig_page_offset << PAGE_SHIFT, replaced_size);
-unlock_pages:
- unlock_page(pagep[0]);
- put_page(pagep[0]);
- unlock_page(pagep[1]);
- put_page(pagep[1]);
+unlock_folios:
+ folio_unlock(folio[0]);
+ folio_put(folio[0]);
+ folio_unlock(folio[1]);
+ folio_put(folio[1]);
stop_journal:
ext4_journal_stop(handle);
if (*err == -ENOSPC &&
@@ -433,7 +440,7 @@ repair_branches:
*err = -EIO;
}
replaced_count = 0;
- goto unlock_pages;
+ goto unlock_folios;
}
/**
@@ -472,19 +479,17 @@ mext_check_arguments(struct inode *orig_inode,
if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
return -EPERM;
- /* Ext4 move extent does not support swapfile */
+ /* Ext4 move extent does not support swap files */
if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
- ext4_debug("ext4 move extent: The argument files should "
- "not be swapfile [ino:orig %lu, donor %lu]\n",
+ ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
- return -EBUSY;
+ return -ETXTBSY;
}
if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) {
- ext4_debug("ext4 move extent: The argument files should "
- "not be quota files [ino:orig %lu, donor %lu]\n",
+ ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
- return -EBUSY;
+ return -EOPNOTSUPP;
}
/* Ext4 move extent supports only extent based file */
@@ -631,11 +636,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
if (ret)
goto out;
ex = path[path->p_depth].p_ext;
- next_blk = ext4_ext_next_allocated_block(path);
cur_blk = le32_to_cpu(ex->ee_block);
cur_len = ext4_ext_get_actual_len(ex);
/* Check hole before the start pos */
if (cur_blk + cur_len - 1 < o_start) {
+ next_blk = ext4_ext_next_allocated_block(path);
if (next_blk == EXT_MAX_BLOCKS) {
ret = -ENODATA;
goto out;
@@ -663,7 +668,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
donor_page_index = d_start >> (PAGE_SHIFT -
donor_inode->i_blkbits);
offset_in_page = o_start % blocks_per_page;
- if (cur_len > blocks_per_page- offset_in_page)
+ if (cur_len > blocks_per_page - offset_in_page)
cur_len = blocks_per_page - offset_in_page;
/*
* Up semaphore to avoid following problems:
@@ -694,8 +699,7 @@ out:
ext4_discard_preallocations(donor_inode, 0);
}
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
unlock_two_nondirectories(orig_inode, donor_inode);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 3a31b662f661..dd28453d6ea3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -85,15 +85,20 @@ static struct buffer_head *ext4_append(handle_t *handle,
return bh;
inode->i_size += inode->i_sb->s_blocksize;
EXT4_I(inode)->i_disksize = inode->i_size;
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (err)
+ goto out;
BUFFER_TRACE(bh, "get_write_access");
err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
EXT4_JTR_NONE);
- if (err) {
- brelse(bh);
- ext4_std_error(inode->i_sb, err);
- return ERR_PTR(err);
- }
+ if (err)
+ goto out;
return bh;
+
+out:
+ brelse(bh);
+ ext4_std_error(inode->i_sb, err);
+ return ERR_PTR(err);
}
static int ext4_dx_csum_verify(struct inode *inode,
@@ -126,7 +131,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
struct ext4_dir_entry *dirent;
int is_dx_block = 0;
- if (block >= inode->i_size) {
+ if (block >= inode->i_size >> inode->i_blkbits) {
ext4_error_inode(inode, func, line, block,
"Attempting to read directory block (%u) that is past i_size (%llu)",
block, inode->i_size);
@@ -2254,8 +2259,16 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
memset(de, 0, len); /* wipe old data */
de = (struct ext4_dir_entry_2 *) data2;
top = data2 + len;
- while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
+ while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) {
+ if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len,
+ (data2 + (blocksize - csum_size) -
+ (char *) de))) {
+ brelse(bh2);
+ brelse(bh);
+ return -EFSCORRUPTED;
+ }
de = de2;
+ }
de->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
(char *) de, blocksize);
@@ -2849,7 +2862,7 @@ retry:
}
static int ext4_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+ struct file *file, umode_t mode)
{
handle_t *handle;
struct inode *inode;
@@ -2871,7 +2884,7 @@ retry:
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
- d_tmpfile(dentry, inode);
+ d_tmpfile(file, inode);
err = ext4_orphan_add(handle, inode);
if (err)
goto err_unlock_inode;
@@ -2882,7 +2895,7 @@ retry:
ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
- return err;
+ return finish_open_simple(file, err);
err_unlock_inode:
ext4_journal_stop(handle);
unlock_new_inode(inode);
@@ -3191,14 +3204,20 @@ end_rmdir:
return retval;
}
-int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name,
- struct inode *inode)
+int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
+ struct inode *inode,
+ struct dentry *dentry /* NULL during fast_commit recovery */)
{
int retval = -ENOENT;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
+ handle_t *handle;
int skip_remove_dentry = 0;
+ /*
+ * Keep this outside the transaction; it may have to set up the
+ * directory's encryption key, which isn't GFP_NOFS-safe.
+ */
bh = ext4_find_entry(dir, d_name, &de, NULL);
if (IS_ERR(bh))
return PTR_ERR(bh);
@@ -3215,7 +3234,14 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
skip_remove_dentry = 1;
else
- goto out;
+ goto out_bh;
+ }
+
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
+ EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle)) {
+ retval = PTR_ERR(handle);
+ goto out_bh;
}
if (IS_DIRSYNC(dir))
@@ -3224,12 +3250,12 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name
if (!skip_remove_dentry) {
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
- goto out;
+ goto out_handle;
dir->i_ctime = dir->i_mtime = current_time(dir);
ext4_update_dx_flag(dir);
retval = ext4_mark_inode_dirty(handle, dir);
if (retval)
- goto out;
+ goto out_handle;
} else {
retval = 0;
}
@@ -3242,15 +3268,17 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name
ext4_orphan_add(handle, inode);
inode->i_ctime = current_time(inode);
retval = ext4_mark_inode_dirty(handle, inode);
-
-out:
+ if (dentry && !retval)
+ ext4_fc_track_unlink(handle, dentry);
+out_handle:
+ ext4_journal_stop(handle);
+out_bh:
brelse(bh);
return retval;
}
static int ext4_unlink(struct inode *dir, struct dentry *dentry)
{
- handle_t *handle;
int retval;
if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
@@ -3268,16 +3296,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (retval)
goto out_trace;
- handle = ext4_journal_start(dir, EXT4_HT_DIR,
- EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
- if (IS_ERR(handle)) {
- retval = PTR_ERR(handle);
- goto out_trace;
- }
-
- retval = __ext4_unlink(handle, dir, &dentry->d_name, d_inode(dentry));
- if (!retval)
- ext4_fc_track_unlink(handle, dentry);
+ retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry), dentry);
#if IS_ENABLED(CONFIG_UNICODE)
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
@@ -3288,8 +3307,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (IS_CASEFOLDED(dir))
d_invalidate(dentry);
#endif
- if (handle)
- ext4_journal_stop(handle);
out_trace:
trace_ext4_unlink_exit(dentry, retval);
@@ -3781,6 +3798,9 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
retval = dquot_initialize(old.dir);
if (retval)
return retval;
+ retval = dquot_initialize(old.inode);
+ if (retval)
+ return retval;
retval = dquot_initialize(new.dir);
if (retval)
return retval;
@@ -4181,7 +4201,7 @@ const struct inode_operations ext4_dir_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_getattr,
.listxattr = ext4_listxattr,
- .get_acl = ext4_get_acl,
+ .get_inode_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
.fileattr_get = ext4_fileattr_get,
@@ -4192,6 +4212,6 @@ const struct inode_operations ext4_special_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_getattr,
.listxattr = ext4_listxattr,
- .get_acl = ext4_get_acl,
+ .get_inode_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
};
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
index 69a9cf9137a6..e5b47dda3317 100644
--- a/fs/ext4/orphan.c
+++ b/fs/ext4/orphan.c
@@ -412,7 +412,7 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es)
/* don't clear list on RO mount w/ errors */
if (es->s_last_orphan && !(s_flags & SB_RDONLY)) {
ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
- "clearing orphan list.\n");
+ "clearing orphan list.");
es->s_last_orphan = 0;
}
ext4_debug("Skipping orphan recovery on fs with errors.\n");
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 97fa7b4c645f..beaec6d81074 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -430,25 +430,20 @@ submit_and_retry:
int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
- int len,
- bool keep_towrite)
+ int len)
{
struct page *bounce_page = NULL;
struct inode *inode = page->mapping->host;
unsigned block_start;
struct buffer_head *bh, *head;
int ret = 0;
- int nr_submitted = 0;
int nr_to_submit = 0;
struct writeback_control *wbc = io->io_wbc;
+ bool keep_towrite = false;
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
- if (keep_towrite)
- set_page_writeback_keepwrite(page);
- else
- set_page_writeback(page);
ClearPageError(page);
/*
@@ -482,16 +477,31 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
/* A hole? We can safely clear the dirty bit */
if (!buffer_mapped(bh))
clear_buffer_dirty(bh);
- if (io->io_bio)
- ext4_io_submit(io);
+ /*
+ * Keeping dirty some buffer we cannot write? Make sure
+ * to redirty the page and keep TOWRITE tag so that
+ * racing WB_SYNC_ALL writeback does not skip the page.
+ * This happens e.g. when doing writeout for
+ * transaction commit.
+ */
+ if (buffer_dirty(bh)) {
+ if (!PageDirty(page))
+ redirty_page_for_writepage(wbc, page);
+ keep_towrite = true;
+ }
continue;
}
if (buffer_new(bh))
clear_buffer_new(bh);
set_buffer_async_write(bh);
+ clear_buffer_dirty(bh);
nr_to_submit++;
} while ((bh = bh->b_this_page) != head);
+ /* Nothing to submit? Just unlock the page... */
+ if (!nr_to_submit)
+ goto unlock;
+
bh = head = page_buffers(page);
/*
@@ -532,27 +542,29 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
redirty_page_for_writepage(wbc, page);
do {
- clear_buffer_async_write(bh);
+ if (buffer_async_write(bh)) {
+ clear_buffer_async_write(bh);
+ set_buffer_dirty(bh);
+ }
bh = bh->b_this_page;
} while (bh != head);
goto unlock;
}
}
+ if (keep_towrite)
+ set_page_writeback_keepwrite(page);
+ else
+ set_page_writeback(page);
+
/* Now submit buffers to write */
do {
if (!buffer_async_write(bh))
continue;
io_submit_add_bh(io, inode,
bounce_page ? bounce_page : page, bh);
- nr_submitted++;
- clear_buffer_dirty(bh);
} while ((bh = bh->b_this_page) != head);
-
unlock:
unlock_page(page);
- /* Nothing submitted - we have to end page writeback */
- if (!nr_submitted)
- end_page_writeback(page);
return ret;
}
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index e02a5f14e021..d5266932ce6c 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -75,14 +75,10 @@ static void __read_end_io(struct bio *bio)
bio_for_each_segment_all(bv, bio, iter_all) {
page = bv->bv_page;
- /* PG_error was set if any post_read step failed */
- if (bio->bi_status || PageError(page)) {
+ if (bio->bi_status)
ClearPageUptodate(page);
- /* will re-read again later */
- ClearPageError(page);
- } else {
+ else
SetPageUptodate(page);
- }
unlock_page(page);
}
if (bio->bi_private)
@@ -96,10 +92,12 @@ static void decrypt_work(struct work_struct *work)
{
struct bio_post_read_ctx *ctx =
container_of(work, struct bio_post_read_ctx, work);
+ struct bio *bio = ctx->bio;
- fscrypt_decrypt_bio(ctx->bio);
-
- bio_post_read_processing(ctx);
+ if (fscrypt_decrypt_bio(bio))
+ bio_post_read_processing(ctx);
+ else
+ __read_end_io(bio);
}
static void verity_work(struct work_struct *work)
@@ -408,9 +406,8 @@ int ext4_mpage_readpages(struct inode *inode,
int __init ext4_init_post_read_processing(void)
{
- bio_post_read_ctx_cache =
- kmem_cache_create("ext4_bio_post_read_ctx",
- sizeof(struct bio_post_read_ctx), 0, 0, NULL);
+ bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, SLAB_RECLAIM_ACCOUNT);
+
if (!bio_post_read_ctx_cache)
goto fail;
bio_post_read_ctx_pool =
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index fea2a68d067b..6b91443d6bf3 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1110,6 +1110,16 @@ exit_free:
return err;
}
+static inline void ext4_set_block_group_nr(struct super_block *sb, char *data,
+ ext4_group_t group)
+{
+ struct ext4_super_block *es = (struct ext4_super_block *) data;
+
+ es->s_block_group_nr = cpu_to_le16(group);
+ if (ext4_has_metadata_csum(sb))
+ es->s_checksum = ext4_superblock_csum(sb, es);
+}
+
/*
* Update the backup copies of the ext4 metadata. These don't need to be part
* of the main resize transaction, because e2fsck will re-write them if there
@@ -1158,6 +1168,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
while (group < sbi->s_groups_count) {
struct buffer_head *bh;
ext4_fsblk_t backup_block;
+ int has_super = ext4_bg_has_super(sb, group);
+ ext4_fsblk_t first_block = ext4_group_first_block_no(sb, group);
/* Out of journal space, and can't get more - abort - so sad */
err = ext4_resize_ensure_credits_batch(handle, 1);
@@ -1167,8 +1179,7 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
if (meta_bg == 0)
backup_block = ((ext4_fsblk_t)group) * bpg + blk_off;
else
- backup_block = (ext4_group_first_block_no(sb, group) +
- ext4_bg_has_super(sb, group));
+ backup_block = first_block + has_super;
bh = sb_getblk(sb, backup_block);
if (unlikely(!bh)) {
@@ -1186,6 +1197,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
memcpy(bh->b_data, data, size);
if (rest)
memset(bh->b_data + size, 0, rest);
+ if (has_super && (backup_block == first_block))
+ ext4_set_block_group_nr(sb, bh->b_data, group);
set_buffer_uptodate(bh);
unlock_buffer(bh);
err = ext4_handle_dirty_metadata(handle, NULL, bh);
@@ -1471,8 +1484,6 @@ static void ext4_update_super(struct super_block *sb,
* active. */
ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
reserved_blocks);
- ext4_superblock_csum_set(sb);
- unlock_buffer(sbi->s_sbh);
/* Update the free space counts */
percpu_counter_add(&sbi->s_freeclusters_counter,
@@ -1508,6 +1519,8 @@ static void ext4_update_super(struct super_block *sb,
ext4_calculate_overhead(sb);
es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(sbi->s_sbh);
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: added group %u:"
"%llu blocks(%llu free %llu reserved)\n", flex_gd->count,
@@ -1591,8 +1604,8 @@ exit_journal:
int meta_bg = ext4_has_feature_meta_bg(sb);
sector_t old_gdb = 0;
- update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
- sizeof(struct ext4_super_block), 0);
+ update_backups(sb, ext4_group_first_block_no(sb, 0),
+ (char *)es, sizeof(struct ext4_super_block), 0);
for (; gdb_num <= gdb_num_end; gdb_num++) {
struct buffer_head *gdb_bh;
@@ -1803,7 +1816,7 @@ errout:
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
"blocks\n", ext4_blocks_count(es));
- update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr,
+ update_backups(sb, ext4_group_first_block_no(sb, 0),
(char *)es, sizeof(struct ext4_super_block), 0);
}
return err;
@@ -1826,7 +1839,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
ext4_grpblk_t last;
ext4_grpblk_t add;
struct buffer_head *bh;
- int err;
ext4_group_t group;
o_blocks_count = ext4_blocks_count(es);
@@ -1881,8 +1893,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
}
brelse(bh);
- err = ext4_group_extend_no_check(sb, o_blocks_count, add);
- return err;
+ return ext4_group_extend_no_check(sb, o_blocks_count, add);
} /* ext4_group_extend */
@@ -2122,7 +2133,7 @@ retry:
goto out;
}
- if (ext4_blocks_count(es) == n_blocks_count)
+ if (ext4_blocks_count(es) == n_blocks_count && n_blocks_count_retry == 0)
goto out;
err = ext4_alloc_flex_bg_array(sb, n_group + 1);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9a66abcca1a8..16a343e8047d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -205,19 +205,12 @@ int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io
int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
{
- if (trylock_buffer(bh)) {
- if (wait)
- return ext4_read_bh(bh, op_flags, NULL);
+ lock_buffer(bh);
+ if (!wait) {
ext4_read_bh_nowait(bh, op_flags, NULL);
return 0;
}
- if (wait) {
- wait_on_buffer(bh);
- if (buffer_uptodate(bh))
- return 0;
- return -EIO;
- }
- return 0;
+ return ext4_read_bh(bh, op_flags, NULL);
}
/*
@@ -264,7 +257,8 @@ void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
struct buffer_head *bh = sb_getblk_gfp(sb, block, 0);
if (likely(bh)) {
- ext4_read_bh_lock(bh, REQ_RAHEAD, false);
+ if (trylock_buffer(bh))
+ ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL);
brelse(bh);
}
}
@@ -546,8 +540,7 @@ static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
if (ext4_should_journal_data(jinode->i_vfs_inode))
ret = ext4_journalled_submit_inode_data_buffers(jinode);
else
- ret = jbd2_journal_submit_inode_data_buffers(jinode);
-
+ ret = ext4_normal_submit_inode_data_buffers(jinode);
return ret;
}
@@ -1212,7 +1205,8 @@ static void ext4_put_super(struct super_block *sb)
ext4_unregister_sysfs(sb);
if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount"))
- ext4_msg(sb, KERN_INFO, "unmounting filesystem.");
+ ext4_msg(sb, KERN_INFO, "unmounting filesystem %pU.",
+ &sb->s_uuid);
ext4_unregister_li_request(sb);
ext4_quota_off_umount(sb);
@@ -1329,6 +1323,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
return NULL;
inode_set_iversion(&ei->vfs_inode, 1);
+ ei->i_flags = 0;
spin_lock_init(&ei->i_raw_lock);
INIT_LIST_HEAD(&ei->i_prealloc_list);
atomic_set(&ei->i_prealloc_active, 0);
@@ -1576,7 +1571,7 @@ enum {
Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
Opt_resgid, Opt_resuid, Opt_sb,
Opt_nouid32, Opt_debug, Opt_removed,
- Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
+ Opt_user_xattr, Opt_acl,
Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
@@ -1585,7 +1580,7 @@ enum {
Opt_inlinecrypt,
Opt_usrjquota, Opt_grpjquota, Opt_quota,
Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
- Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version,
+ Opt_usrquota, Opt_grpquota, Opt_prjquota,
Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never,
Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error,
Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_debug_want_extra_isize,
@@ -1662,9 +1657,7 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
fsparam_flag ("oldalloc", Opt_removed),
fsparam_flag ("orlov", Opt_removed),
fsparam_flag ("user_xattr", Opt_user_xattr),
- fsparam_flag ("nouser_xattr", Opt_nouser_xattr),
fsparam_flag ("acl", Opt_acl),
- fsparam_flag ("noacl", Opt_noacl),
fsparam_flag ("norecovery", Opt_noload),
fsparam_flag ("noload", Opt_noload),
fsparam_flag ("bh", Opt_removed),
@@ -1694,7 +1687,7 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
fsparam_flag ("barrier", Opt_barrier),
fsparam_u32 ("barrier", Opt_barrier),
fsparam_flag ("nobarrier", Opt_nobarrier),
- fsparam_flag ("i_version", Opt_i_version),
+ fsparam_flag ("i_version", Opt_removed),
fsparam_flag ("dax", Opt_dax),
fsparam_enum ("dax", Opt_dax_type, ext4_param_dax),
fsparam_u32 ("stripe", Opt_stripe),
@@ -1749,10 +1742,6 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
-static const char deprecated_msg[] =
- "Mount option \"%s\" will be removed by %s\n"
- "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
-
#define MOPT_SET 0x0001
#define MOPT_CLEAR 0x0002
#define MOPT_NOSUPPORT 0x0004
@@ -1814,13 +1803,10 @@ static const struct mount_opts {
{Opt_journal_ioprio, 0, MOPT_NO_EXT2},
{Opt_data, 0, MOPT_NO_EXT2},
{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
- {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
#ifdef CONFIG_EXT4_FS_POSIX_ACL
{Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
- {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
#else
{Opt_acl, 0, MOPT_NOSUPPORT},
- {Opt_noacl, 0, MOPT_NOSUPPORT},
#endif
{Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
{Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
@@ -2120,10 +2106,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
else
return note_qf_name(fc, GRPQUOTA, param);
#endif
- case Opt_noacl:
- case Opt_nouser_xattr:
- ext4_msg(NULL, KERN_WARNING, deprecated_msg, param->key, "3.5");
- break;
case Opt_sb:
if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
ext4_msg(NULL, KERN_WARNING,
@@ -2140,11 +2122,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_abort:
ctx_set_mount_flag(ctx, EXT4_MF_FS_ABORTED);
return 0;
- case Opt_i_version:
- ext4_msg(NULL, KERN_WARNING, deprecated_msg, param->key, "5.20");
- ext4_msg(NULL, KERN_WARNING, "Use iversion instead\n");
- ctx_set_flags(ctx, SB_I_VERSION);
- return 0;
case Opt_inlinecrypt:
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
ctx_set_flags(ctx, SB_INLINECRYPT);
@@ -2271,7 +2248,7 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
return -EINVAL;
}
- error = fs_lookup_param(fc, param, 1, &path);
+ error = fs_lookup_param(fc, param, 1, LOOKUP_FOLLOW, &path);
if (error) {
ext4_msg(NULL, KERN_ERR, "error: could not find "
"journal device path");
@@ -2814,14 +2791,6 @@ static void ext4_apply_options(struct fs_context *fc, struct super_block *sb)
sb->s_flags &= ~ctx->mask_s_flags;
sb->s_flags |= ctx->vals_s_flags;
- /*
- * i_version differs from common mount option iversion so we have
- * to let vfs know that it was set, otherwise it would get cleared
- * on remount
- */
- if (ctx->mask_s_flags & SB_I_VERSION)
- fc->sb_flags |= SB_I_VERSION;
-
#define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; })
APPLY(s_commit_interval);
APPLY(s_stripe);
@@ -2970,8 +2939,6 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
- if (sb->s_flags & SB_I_VERSION)
- SEQ_OPTS_PUTS("i_version");
if (nodefs || sbi->s_stripe)
SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
if (nodefs || EXT4_MOUNT_DATA_FLAGS &
@@ -3767,6 +3734,7 @@ static int ext4_lazyinit_thread(void *arg)
unsigned long next_wakeup, cur;
BUG_ON(NULL == eli);
+ set_freezable();
cont_thread:
while (true) {
@@ -3811,8 +3779,7 @@ cont_thread:
}
if (!progress) {
elr->lr_next_sched = jiffies +
- (prandom_u32()
- % (EXT4_DEF_LI_MAX_START_DELAY * HZ));
+ get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ);
}
if (time_before(elr->lr_next_sched, next_wakeup))
next_wakeup = elr->lr_next_sched;
@@ -3959,8 +3926,7 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
* spread the inode table initialization requests
* better.
*/
- elr->lr_next_sched = jiffies + (prandom_u32() %
- (EXT4_DEF_LI_MAX_START_DELAY * HZ));
+ elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ);
return elr;
}
@@ -3982,9 +3948,9 @@ int ext4_register_li_request(struct super_block *sb,
goto out;
}
- if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) &&
- (first_not_zeroed == ngroups || sb_rdonly(sb) ||
- !test_opt(sb, INIT_INODE_TABLE)))
+ if (sb_rdonly(sb) ||
+ (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) &&
+ (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE))))
goto out;
elr = ext4_li_request_new(sb, first_not_zeroed);
@@ -4311,112 +4277,10 @@ err_out:
return NULL;
}
-static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
+static void ext4_set_def_opts(struct super_block *sb,
+ struct ext4_super_block *es)
{
- struct buffer_head *bh, **group_desc;
- struct ext4_super_block *es = NULL;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct flex_groups **flex_groups;
- ext4_fsblk_t block;
- ext4_fsblk_t logical_sb_block;
- unsigned long offset = 0;
unsigned long def_mount_opts;
- struct inode *root;
- int ret = -ENOMEM;
- int blocksize, clustersize;
- unsigned int db_count;
- unsigned int i;
- int needs_recovery, has_huge_files;
- __u64 blocks_count;
- int err = 0;
- ext4_group_t first_not_zeroed;
- struct ext4_fs_context *ctx = fc->fs_private;
- int silent = fc->sb_flags & SB_SILENT;
-
- /* Set defaults for the variables that will be set during parsing */
- if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO))
- ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
-
- sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
- sbi->s_sectors_written_start =
- part_stat_read(sb->s_bdev, sectors[STAT_WRITE]);
-
- /* -EINVAL is default */
- ret = -EINVAL;
- blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
- if (!blocksize) {
- ext4_msg(sb, KERN_ERR, "unable to set blocksize");
- goto out_fail;
- }
-
- /*
- * The ext4 superblock will not be buffer aligned for other than 1kB
- * block sizes. We need to calculate the offset from buffer start.
- */
- if (blocksize != EXT4_MIN_BLOCK_SIZE) {
- logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
- offset = do_div(logical_sb_block, blocksize);
- } else {
- logical_sb_block = sbi->s_sb_block;
- }
-
- bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
- if (IS_ERR(bh)) {
- ext4_msg(sb, KERN_ERR, "unable to read superblock");
- ret = PTR_ERR(bh);
- goto out_fail;
- }
- /*
- * Note: s_es must be initialized as soon as possible because
- * some ext4 macro-instructions depend on its value
- */
- es = (struct ext4_super_block *) (bh->b_data + offset);
- sbi->s_es = es;
- sb->s_magic = le16_to_cpu(es->s_magic);
- if (sb->s_magic != EXT4_SUPER_MAGIC)
- goto cantfind_ext4;
- sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
-
- /* Warn if metadata_csum and gdt_csum are both set. */
- if (ext4_has_feature_metadata_csum(sb) &&
- ext4_has_feature_gdt_csum(sb))
- ext4_warning(sb, "metadata_csum and uninit_bg are "
- "redundant flags; please run fsck.");
-
- /* Check for a known checksum algorithm */
- if (!ext4_verify_csum_type(sb, es)) {
- ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
- "unknown checksum algorithm.");
- silent = 1;
- goto cantfind_ext4;
- }
- ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE,
- ext4_orphan_file_block_trigger);
-
- /* Load the checksum driver */
- sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
- if (IS_ERR(sbi->s_chksum_driver)) {
- ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
- ret = PTR_ERR(sbi->s_chksum_driver);
- sbi->s_chksum_driver = NULL;
- goto failed_mount;
- }
-
- /* Check superblock checksum */
- if (!ext4_superblock_csum_verify(sb, es)) {
- ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
- "invalid superblock checksum. Run e2fsck?");
- silent = 1;
- ret = -EFSBADCRC;
- goto cantfind_ext4;
- }
-
- /* Precompute checksum seed for all metadata */
- if (ext4_has_feature_csum_seed(sb))
- sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
- else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
- sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
- sizeof(es->s_uuid));
/* Set defaults before we parse the mount options */
def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -4445,9 +4309,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
set_opt(sb, WRITEBACK_DATA);
- if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
+ if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_PANIC)
set_opt(sb, ERRORS_PANIC);
- else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
+ else if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_CONTINUE)
set_opt(sb, ERRORS_CONT);
else
set_opt(sb, ERRORS_RO);
@@ -4456,12 +4320,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (def_mount_opts & EXT4_DEFM_DISCARD)
set_opt(sb, DISCARD);
- sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
- sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
- sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
- sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
- sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
-
if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
set_opt(sb, BARRIER);
@@ -4473,31 +4331,96 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
set_opt(sb, DELALLOC);
- /*
- * set default s_li_wait_mult for lazyinit, for the case there is
- * no mount option specified.
- */
- sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+ if (sb->s_blocksize == PAGE_SIZE)
+ set_opt(sb, DIOREAD_NOLOCK);
+}
- if (le32_to_cpu(es->s_log_block_size) >
- (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
- ext4_msg(sb, KERN_ERR,
- "Invalid log block size: %u",
- le32_to_cpu(es->s_log_block_size));
- goto failed_mount;
- }
- if (le32_to_cpu(es->s_log_cluster_size) >
- (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
- ext4_msg(sb, KERN_ERR,
- "Invalid log cluster size: %u",
- le32_to_cpu(es->s_log_cluster_size));
- goto failed_mount;
+static int ext4_handle_clustersize(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int clustersize;
+
+ /* Handle clustersize */
+ clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
+ if (ext4_has_feature_bigalloc(sb)) {
+ if (clustersize < sb->s_blocksize) {
+ ext4_msg(sb, KERN_ERR,
+ "cluster size (%d) smaller than "
+ "block size (%lu)", clustersize, sb->s_blocksize);
+ return -EINVAL;
+ }
+ sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
+ le32_to_cpu(es->s_log_block_size);
+ sbi->s_clusters_per_group =
+ le32_to_cpu(es->s_clusters_per_group);
+ if (sbi->s_clusters_per_group > sb->s_blocksize * 8) {
+ ext4_msg(sb, KERN_ERR,
+ "#clusters per group too big: %lu",
+ sbi->s_clusters_per_group);
+ return -EINVAL;
+ }
+ if (sbi->s_blocks_per_group !=
+ (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) {
+ ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
+ "clusters per group (%lu) inconsistent",
+ sbi->s_blocks_per_group,
+ sbi->s_clusters_per_group);
+ return -EINVAL;
+ }
+ } else {
+ if (clustersize != sb->s_blocksize) {
+ ext4_msg(sb, KERN_ERR,
+ "fragment/cluster size (%d) != "
+ "block size (%lu)", clustersize, sb->s_blocksize);
+ return -EINVAL;
+ }
+ if (sbi->s_blocks_per_group > sb->s_blocksize * 8) {
+ ext4_msg(sb, KERN_ERR,
+ "#blocks per group too big: %lu",
+ sbi->s_blocks_per_group);
+ return -EINVAL;
+ }
+ sbi->s_clusters_per_group = sbi->s_blocks_per_group;
+ sbi->s_cluster_bits = 0;
}
+ sbi->s_cluster_ratio = clustersize / sb->s_blocksize;
- blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+ /* Do we have standard group size of clustersize * 8 blocks ? */
+ if (sbi->s_blocks_per_group == clustersize << 3)
+ set_opt2(sb, STD_GROUP_SIZE);
- if (blocksize == PAGE_SIZE)
- set_opt(sb, DIOREAD_NOLOCK);
+ return 0;
+}
+
+static void ext4_fast_commit_init(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ /* Initialize fast commit stuff */
+ atomic_set(&sbi->s_fc_subtid, 0);
+ INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]);
+ INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]);
+ INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
+ INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
+ sbi->s_fc_bytes = 0;
+ ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+ sbi->s_fc_ineligible_tid = 0;
+ spin_lock_init(&sbi->s_fc_lock);
+ memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
+ sbi->s_fc_replay_state.fc_regions = NULL;
+ sbi->s_fc_replay_state.fc_regions_size = 0;
+ sbi->s_fc_replay_state.fc_regions_used = 0;
+ sbi->s_fc_replay_state.fc_regions_valid = 0;
+ sbi->s_fc_replay_state.fc_modified_inodes = NULL;
+ sbi->s_fc_replay_state.fc_modified_inodes_size = 0;
+ sbi->s_fc_replay_state.fc_modified_inodes_used = 0;
+}
+
+static int ext4_inode_info_init(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
@@ -4508,16 +4431,16 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) {
ext4_msg(sb, KERN_ERR, "invalid first ino: %u",
sbi->s_first_ino);
- goto failed_mount;
+ return -EINVAL;
}
if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
(!is_power_of_2(sbi->s_inode_size)) ||
- (sbi->s_inode_size > blocksize)) {
+ (sbi->s_inode_size > sb->s_blocksize)) {
ext4_msg(sb, KERN_ERR,
"unsupported inode size: %d",
sbi->s_inode_size);
- ext4_msg(sb, KERN_ERR, "blocksize: %d", blocksize);
- goto failed_mount;
+ ext4_msg(sb, KERN_ERR, "blocksize: %lu", sb->s_blocksize);
+ return -EINVAL;
}
/*
* i_atime_extra is the last extra field available for
@@ -4535,6 +4458,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
}
sb->s_time_min = EXT4_TIMESTAMP_MIN;
}
+
if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
@@ -4546,7 +4470,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (v > max) {
ext4_msg(sb, KERN_ERR,
"bad s_want_extra_isize: %d", v);
- goto failed_mount;
+ return -EINVAL;
}
if (sbi->s_want_extra_isize < v)
sbi->s_want_extra_isize = v;
@@ -4555,91 +4479,112 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (v > max) {
ext4_msg(sb, KERN_ERR,
"bad s_min_extra_isize: %d", v);
- goto failed_mount;
+ return -EINVAL;
}
if (sbi->s_want_extra_isize < v)
sbi->s_want_extra_isize = v;
}
}
- err = parse_apply_sb_mount_options(sb, ctx);
- if (err < 0)
- goto failed_mount;
+ return 0;
+}
- sbi->s_def_mount_opt = sbi->s_mount_opt;
+#if IS_ENABLED(CONFIG_UNICODE)
+static int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es)
+{
+ const struct ext4_sb_encodings *encoding_info;
+ struct unicode_map *encoding;
+ __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags);
- err = ext4_check_opt_consistency(fc, sb);
- if (err < 0)
- goto failed_mount;
+ if (!ext4_has_feature_casefold(sb) || sb->s_encoding)
+ return 0;
- ext4_apply_options(fc, sb);
+ encoding_info = ext4_sb_read_encoding(es);
+ if (!encoding_info) {
+ ext4_msg(sb, KERN_ERR,
+ "Encoding requested by superblock is unknown");
+ return -EINVAL;
+ }
-#if IS_ENABLED(CONFIG_UNICODE)
- if (ext4_has_feature_casefold(sb) && !sb->s_encoding) {
- const struct ext4_sb_encodings *encoding_info;
- struct unicode_map *encoding;
- __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags);
+ encoding = utf8_load(encoding_info->version);
+ if (IS_ERR(encoding)) {
+ ext4_msg(sb, KERN_ERR,
+ "can't mount with superblock charset: %s-%u.%u.%u "
+ "not supported by the kernel. flags: 0x%x.",
+ encoding_info->name,
+ unicode_major(encoding_info->version),
+ unicode_minor(encoding_info->version),
+ unicode_rev(encoding_info->version),
+ encoding_flags);
+ return -EINVAL;
+ }
+ ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: "
+ "%s-%u.%u.%u with flags 0x%hx", encoding_info->name,
+ unicode_major(encoding_info->version),
+ unicode_minor(encoding_info->version),
+ unicode_rev(encoding_info->version),
+ encoding_flags);
- encoding_info = ext4_sb_read_encoding(es);
- if (!encoding_info) {
- ext4_msg(sb, KERN_ERR,
- "Encoding requested by superblock is unknown");
- goto failed_mount;
- }
+ sb->s_encoding = encoding;
+ sb->s_encoding_flags = encoding_flags;
- encoding = utf8_load(encoding_info->version);
- if (IS_ERR(encoding)) {
- ext4_msg(sb, KERN_ERR,
- "can't mount with superblock charset: %s-%u.%u.%u "
- "not supported by the kernel. flags: 0x%x.",
- encoding_info->name,
- unicode_major(encoding_info->version),
- unicode_minor(encoding_info->version),
- unicode_rev(encoding_info->version),
- encoding_flags);
- goto failed_mount;
- }
- ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: "
- "%s-%u.%u.%u with flags 0x%hx", encoding_info->name,
- unicode_major(encoding_info->version),
- unicode_minor(encoding_info->version),
- unicode_rev(encoding_info->version),
- encoding_flags);
+ return 0;
+}
+#else
+static inline int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es)
+{
+ return 0;
+}
+#endif
+
+static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_block *es)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ /* Warn if metadata_csum and gdt_csum are both set. */
+ if (ext4_has_feature_metadata_csum(sb) &&
+ ext4_has_feature_gdt_csum(sb))
+ ext4_warning(sb, "metadata_csum and uninit_bg are "
+ "redundant flags; please run fsck.");
- sb->s_encoding = encoding;
- sb->s_encoding_flags = encoding_flags;
+ /* Check for a known checksum algorithm */
+ if (!ext4_verify_csum_type(sb, es)) {
+ ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
+ "unknown checksum algorithm.");
+ return -EINVAL;
}
-#endif
+ ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE,
+ ext4_orphan_file_block_trigger);
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
- printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, O_DIRECT and fast_commit support!\n");
- /* can't mount with both data=journal and dioread_nolock. */
- clear_opt(sb, DIOREAD_NOLOCK);
- clear_opt2(sb, JOURNAL_FAST_COMMIT);
- if (test_opt2(sb, EXPLICIT_DELALLOC)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "both data=journal and delalloc");
- goto failed_mount;
- }
- if (test_opt(sb, DAX_ALWAYS)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "both data=journal and dax");
- goto failed_mount;
- }
- if (ext4_has_feature_encrypt(sb)) {
- ext4_msg(sb, KERN_WARNING,
- "encrypted files will use data=ordered "
- "instead of data journaling mode");
- }
- if (test_opt(sb, DELALLOC))
- clear_opt(sb, DELALLOC);
- } else {
- sb->s_iflags |= SB_I_CGROUPWB;
+ /* Load the checksum driver */
+ sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+ if (IS_ERR(sbi->s_chksum_driver)) {
+ int ret = PTR_ERR(sbi->s_chksum_driver);
+ ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
+ sbi->s_chksum_driver = NULL;
+ return ret;
}
- sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
- (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
+ /* Check superblock checksum */
+ if (!ext4_superblock_csum_verify(sb, es)) {
+ ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
+ "invalid superblock checksum. Run e2fsck?");
+ return -EFSBADCRC;
+ }
+ /* Precompute checksum seed for all metadata */
+ if (ext4_has_feature_csum_seed(sb))
+ sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
+ else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
+ sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
+ sizeof(es->s_uuid));
+ return 0;
+}
+
+static int ext4_check_feature_compatibility(struct super_block *sb,
+ struct ext4_super_block *es,
+ int silent)
+{
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
(ext4_has_compat_features(sb) ||
ext4_has_ro_compat_features(sb) ||
@@ -4653,7 +4598,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (ext4_has_feature_64bit(sb)) {
ext4_msg(sb, KERN_ERR,
"The Hurd can't support 64-bit file systems");
- goto failed_mount;
+ return -EINVAL;
}
/*
@@ -4663,7 +4608,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (ext4_has_feature_ea_inode(sb)) {
ext4_msg(sb, KERN_ERR,
"ea_inode feature is not supported for Hurd");
- goto failed_mount;
+ return -EINVAL;
}
}
@@ -4677,10 +4622,10 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
* it's actually an ext[34] filesystem.
*/
if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
- goto failed_mount;
+ return -EINVAL;
ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
"to feature incompatibilities");
- goto failed_mount;
+ return -EINVAL;
}
}
@@ -4694,10 +4639,10 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
* it's actually an ext4 filesystem.
*/
if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
- goto failed_mount;
+ return -EINVAL;
ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
"to feature incompatibilities");
- goto failed_mount;
+ return -EINVAL;
}
}
@@ -4707,9 +4652,463 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
* so there is a chance incompat flags are set on a rev 0 filesystem.
*/
if (!ext4_feature_set_ok(sb, (sb_rdonly(sb))))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ext4_geometry_check(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ __u64 blocks_count;
+
+ /* check blocks count against device size */
+ blocks_count = sb_bdev_nr_blocks(sb);
+ if (blocks_count && ext4_blocks_count(es) > blocks_count) {
+ ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
+ "exceeds size of device (%llu blocks)",
+ ext4_blocks_count(es), blocks_count);
+ return -EINVAL;
+ }
+
+ /*
+ * It makes no sense for the first data block to be beyond the end
+ * of the filesystem.
+ */
+ if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
+ ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
+ "block %u is beyond end of filesystem (%llu)",
+ le32_to_cpu(es->s_first_data_block),
+ ext4_blocks_count(es));
+ return -EINVAL;
+ }
+ if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) &&
+ (sbi->s_cluster_ratio == 1)) {
+ ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
+ "block is 0 with a 1k block and cluster size");
+ return -EINVAL;
+ }
+
+ blocks_count = (ext4_blocks_count(es) -
+ le32_to_cpu(es->s_first_data_block) +
+ EXT4_BLOCKS_PER_GROUP(sb) - 1);
+ do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
+ if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
+ ext4_msg(sb, KERN_WARNING, "groups count too large: %llu "
+ "(block count %llu, first data block %u, "
+ "blocks per group %lu)", blocks_count,
+ ext4_blocks_count(es),
+ le32_to_cpu(es->s_first_data_block),
+ EXT4_BLOCKS_PER_GROUP(sb));
+ return -EINVAL;
+ }
+ sbi->s_groups_count = blocks_count;
+ sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+ (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
+ if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) !=
+ le32_to_cpu(es->s_inodes_count)) {
+ ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu",
+ le32_to_cpu(es->s_inodes_count),
+ ((u64)sbi->s_groups_count * sbi->s_inodes_per_group));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void ext4_group_desc_free(struct ext4_sb_info *sbi)
+{
+ struct buffer_head **group_desc;
+ int i;
+
+ rcu_read_lock();
+ group_desc = rcu_dereference(sbi->s_group_desc);
+ for (i = 0; i < sbi->s_gdb_count; i++)
+ brelse(group_desc[i]);
+ kvfree(group_desc);
+ rcu_read_unlock();
+}
+
+static int ext4_group_desc_init(struct super_block *sb,
+ struct ext4_super_block *es,
+ ext4_fsblk_t logical_sb_block,
+ ext4_group_t *first_not_zeroed)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ unsigned int db_count;
+ ext4_fsblk_t block;
+ int ret;
+ int i;
+
+ db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
+ EXT4_DESC_PER_BLOCK(sb);
+ if (ext4_has_feature_meta_bg(sb)) {
+ if (le32_to_cpu(es->s_first_meta_bg) > db_count) {
+ ext4_msg(sb, KERN_WARNING,
+ "first meta block group too large: %u "
+ "(group descriptor block count %u)",
+ le32_to_cpu(es->s_first_meta_bg), db_count);
+ return -EINVAL;
+ }
+ }
+ rcu_assign_pointer(sbi->s_group_desc,
+ kvmalloc_array(db_count,
+ sizeof(struct buffer_head *),
+ GFP_KERNEL));
+ if (sbi->s_group_desc == NULL) {
+ ext4_msg(sb, KERN_ERR, "not enough memory");
+ return -ENOMEM;
+ }
+
+ bgl_lock_init(sbi->s_blockgroup_lock);
+
+ /* Pre-read the descriptors into the buffer cache */
+ for (i = 0; i < db_count; i++) {
+ block = descriptor_loc(sb, logical_sb_block, i);
+ ext4_sb_breadahead_unmovable(sb, block);
+ }
+
+ for (i = 0; i < db_count; i++) {
+ struct buffer_head *bh;
+
+ block = descriptor_loc(sb, logical_sb_block, i);
+ bh = ext4_sb_bread_unmovable(sb, block);
+ if (IS_ERR(bh)) {
+ ext4_msg(sb, KERN_ERR,
+ "can't read group descriptor %d", i);
+ sbi->s_gdb_count = i;
+ ret = PTR_ERR(bh);
+ goto out;
+ }
+ rcu_read_lock();
+ rcu_dereference(sbi->s_group_desc)[i] = bh;
+ rcu_read_unlock();
+ }
+ sbi->s_gdb_count = db_count;
+ if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) {
+ ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
+ ret = -EFSCORRUPTED;
+ goto out;
+ }
+ return 0;
+out:
+ ext4_group_desc_free(sbi);
+ return ret;
+}
+
+static int ext4_load_and_init_journal(struct super_block *sb,
+ struct ext4_super_block *es,
+ struct ext4_fs_context *ctx)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err;
+
+ err = ext4_load_journal(sb, es, ctx->journal_devnum);
+ if (err)
+ return err;
+
+ if (ext4_has_feature_64bit(sb) &&
+ !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
+ JBD2_FEATURE_INCOMPAT_64BIT)) {
+ ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
+ goto out;
+ }
+
+ if (!set_journal_csum_feature_set(sb)) {
+ ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
+ "feature set");
+ goto out;
+ }
+
+ if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
+ !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
+ JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) {
+ ext4_msg(sb, KERN_ERR,
+ "Failed to set fast commit journal feature");
+ goto out;
+ }
+
+ /* We have now updated the journal if required, so we can
+ * validate the data journaling mode. */
+ switch (test_opt(sb, DATA_FLAGS)) {
+ case 0:
+ /* No mode set, assume a default based on the journal
+ * capabilities: ORDERED_DATA if the journal can
+ * cope, else JOURNAL_DATA
+ */
+ if (jbd2_journal_check_available_features
+ (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
+ set_opt(sb, ORDERED_DATA);
+ sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
+ } else {
+ set_opt(sb, JOURNAL_DATA);
+ sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
+ }
+ break;
+
+ case EXT4_MOUNT_ORDERED_DATA:
+ case EXT4_MOUNT_WRITEBACK_DATA:
+ if (!jbd2_journal_check_available_features
+ (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
+ ext4_msg(sb, KERN_ERR, "Journal does not support "
+ "requested data journaling mode");
+ goto out;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
+ test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "journal_async_commit in data=ordered mode");
+ goto out;
+ }
+
+ set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);
+
+ sbi->s_journal->j_submit_inode_data_buffers =
+ ext4_journal_submit_inode_data_buffers;
+ sbi->s_journal->j_finish_inode_data_buffers =
+ ext4_journal_finish_inode_data_buffers;
+
+ return 0;
+
+out:
+ /* flush s_error_work before journal destroy. */
+ flush_work(&sbi->s_error_work);
+ jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
+ return -EINVAL;
+}
+
+static int ext4_journal_data_mode_check(struct super_block *sb)
+{
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with "
+ "data=journal disables delayed allocation, "
+ "dioread_nolock, O_DIRECT and fast_commit support!\n");
+ /* can't mount with both data=journal and dioread_nolock. */
+ clear_opt(sb, DIOREAD_NOLOCK);
+ clear_opt2(sb, JOURNAL_FAST_COMMIT);
+ if (test_opt2(sb, EXPLICIT_DELALLOC)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and delalloc");
+ return -EINVAL;
+ }
+ if (test_opt(sb, DAX_ALWAYS)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and dax");
+ return -EINVAL;
+ }
+ if (ext4_has_feature_encrypt(sb)) {
+ ext4_msg(sb, KERN_WARNING,
+ "encrypted files will use data=ordered "
+ "instead of data journaling mode");
+ }
+ if (test_opt(sb, DELALLOC))
+ clear_opt(sb, DELALLOC);
+ } else {
+ sb->s_iflags |= SB_I_CGROUPWB;
+ }
+
+ return 0;
+}
+
+static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb,
+ int silent)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es;
+ ext4_fsblk_t logical_sb_block;
+ unsigned long offset = 0;
+ struct buffer_head *bh;
+ int ret = -EINVAL;
+ int blocksize;
+
+ blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
+ if (!blocksize) {
+ ext4_msg(sb, KERN_ERR, "unable to set blocksize");
+ return -EINVAL;
+ }
+
+ /*
+ * The ext4 superblock will not be buffer aligned for other than 1kB
+ * block sizes. We need to calculate the offset from buffer start.
+ */
+ if (blocksize != EXT4_MIN_BLOCK_SIZE) {
+ logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
+ offset = do_div(logical_sb_block, blocksize);
+ } else {
+ logical_sb_block = sbi->s_sb_block;
+ }
+
+ bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
+ if (IS_ERR(bh)) {
+ ext4_msg(sb, KERN_ERR, "unable to read superblock");
+ return PTR_ERR(bh);
+ }
+ /*
+ * Note: s_es must be initialized as soon as possible because
+ * some ext4 macro-instructions depend on its value
+ */
+ es = (struct ext4_super_block *) (bh->b_data + offset);
+ sbi->s_es = es;
+ sb->s_magic = le16_to_cpu(es->s_magic);
+ if (sb->s_magic != EXT4_SUPER_MAGIC) {
+ if (!silent)
+ ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
+ goto out;
+ }
+
+ if (le32_to_cpu(es->s_log_block_size) >
+ (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
+ ext4_msg(sb, KERN_ERR,
+ "Invalid log block size: %u",
+ le32_to_cpu(es->s_log_block_size));
+ goto out;
+ }
+ if (le32_to_cpu(es->s_log_cluster_size) >
+ (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
+ ext4_msg(sb, KERN_ERR,
+ "Invalid log cluster size: %u",
+ le32_to_cpu(es->s_log_cluster_size));
+ goto out;
+ }
+
+ blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+
+ /*
+ * If the default block size is not the same as the real block size,
+ * we need to reload it.
+ */
+ if (sb->s_blocksize == blocksize) {
+ *lsb = logical_sb_block;
+ sbi->s_sbh = bh;
+ return 0;
+ }
+
+ /*
+ * bh must be released before kill_bdev(), otherwise
+ * it won't be freed and its page also. kill_bdev()
+ * is called by sb_set_blocksize().
+ */
+ brelse(bh);
+ /* Validate the filesystem blocksize */
+ if (!sb_set_blocksize(sb, blocksize)) {
+ ext4_msg(sb, KERN_ERR, "bad block size %d",
+ blocksize);
+ bh = NULL;
+ goto out;
+ }
+
+ logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
+ offset = do_div(logical_sb_block, blocksize);
+ bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
+ if (IS_ERR(bh)) {
+ ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try");
+ ret = PTR_ERR(bh);
+ bh = NULL;
+ goto out;
+ }
+ es = (struct ext4_super_block *)(bh->b_data + offset);
+ sbi->s_es = es;
+ if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
+ ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!");
+ goto out;
+ }
+ *lsb = logical_sb_block;
+ sbi->s_sbh = bh;
+ return 0;
+out:
+ brelse(bh);
+ return ret;
+}
+
+static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
+{
+ struct ext4_super_block *es = NULL;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct flex_groups **flex_groups;
+ ext4_fsblk_t block;
+ ext4_fsblk_t logical_sb_block;
+ struct inode *root;
+ int ret = -ENOMEM;
+ unsigned int i;
+ int needs_recovery, has_huge_files;
+ int err = 0;
+ ext4_group_t first_not_zeroed;
+ struct ext4_fs_context *ctx = fc->fs_private;
+ int silent = fc->sb_flags & SB_SILENT;
+
+ /* Set defaults for the variables that will be set during parsing */
+ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO))
+ ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+
+ sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
+ sbi->s_sectors_written_start =
+ part_stat_read(sb->s_bdev, sectors[STAT_WRITE]);
+
+ /* -EINVAL is default */
+ ret = -EINVAL;
+ err = ext4_load_super(sb, &logical_sb_block, silent);
+ if (err)
+ goto out_fail;
+
+ es = sbi->s_es;
+ sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
+
+ err = ext4_init_metadata_csum(sb, es);
+ if (err)
goto failed_mount;
- if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) {
+ ext4_set_def_opts(sb, es);
+
+ sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
+ sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
+ sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
+ sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
+ sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
+
+ /*
+ * set default s_li_wait_mult for lazyinit, for the case there is
+ * no mount option specified.
+ */
+ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+
+ if (ext4_inode_info_init(sb, es))
+ goto failed_mount;
+
+ err = parse_apply_sb_mount_options(sb, ctx);
+ if (err < 0)
+ goto failed_mount;
+
+ sbi->s_def_mount_opt = sbi->s_mount_opt;
+
+ err = ext4_check_opt_consistency(fc, sb);
+ if (err < 0)
+ goto failed_mount;
+
+ ext4_apply_options(fc, sb);
+
+ if (ext4_encoding_init(sb, es))
+ goto failed_mount;
+
+ if (ext4_journal_data_mode_check(sb))
+ goto failed_mount;
+
+ sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
+ (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
+
+ /* i_version is always enabled now */
+ sb->s_flags |= SB_I_VERSION;
+
+ if (ext4_check_feature_compatibility(sb, es, silent))
+ goto failed_mount;
+
+ if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) {
ext4_msg(sb, KERN_ERR,
"Number of reserved GDT blocks insanely large: %d",
le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks));
@@ -4717,7 +5116,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
}
if (sbi->s_daxdev) {
- if (blocksize == PAGE_SIZE)
+ if (sb->s_blocksize == PAGE_SIZE)
set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
else
ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n");
@@ -4742,40 +5141,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
goto failed_mount;
}
- if (sb->s_blocksize != blocksize) {
- /*
- * bh must be released before kill_bdev(), otherwise
- * it won't be freed and its page also. kill_bdev()
- * is called by sb_set_blocksize().
- */
- brelse(bh);
- /* Validate the filesystem blocksize */
- if (!sb_set_blocksize(sb, blocksize)) {
- ext4_msg(sb, KERN_ERR, "bad block size %d",
- blocksize);
- bh = NULL;
- goto failed_mount;
- }
-
- logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
- offset = do_div(logical_sb_block, blocksize);
- bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
- if (IS_ERR(bh)) {
- ext4_msg(sb, KERN_ERR,
- "Can't read superblock on 2nd try");
- ret = PTR_ERR(bh);
- bh = NULL;
- goto failed_mount;
- }
- es = (struct ext4_super_block *)(bh->b_data + offset);
- sbi->s_es = es;
- if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
- ext4_msg(sb, KERN_ERR,
- "Magic mismatch, very weird!");
- goto failed_mount;
- }
- }
-
has_huge_files = ext4_has_feature_huge_file(sb);
sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
has_huge_files);
@@ -4797,19 +5162,21 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
- sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
- if (sbi->s_inodes_per_block == 0)
- goto cantfind_ext4;
+ sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb);
+ if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) {
+ if (!silent)
+ ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
+ goto failed_mount;
+ }
if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
- sbi->s_inodes_per_group > blocksize * 8) {
+ sbi->s_inodes_per_group > sb->s_blocksize * 8) {
ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
sbi->s_inodes_per_group);
goto failed_mount;
}
sbi->s_itb_per_group = sbi->s_inodes_per_group /
sbi->s_inodes_per_block;
- sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
- sbi->s_sbh = bh;
+ sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb);
sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY;
sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
@@ -4835,54 +5202,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
}
}
- /* Handle clustersize */
- clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
- if (ext4_has_feature_bigalloc(sb)) {
- if (clustersize < blocksize) {
- ext4_msg(sb, KERN_ERR,
- "cluster size (%d) smaller than "
- "block size (%d)", clustersize, blocksize);
- goto failed_mount;
- }
- sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
- le32_to_cpu(es->s_log_block_size);
- sbi->s_clusters_per_group =
- le32_to_cpu(es->s_clusters_per_group);
- if (sbi->s_clusters_per_group > blocksize * 8) {
- ext4_msg(sb, KERN_ERR,
- "#clusters per group too big: %lu",
- sbi->s_clusters_per_group);
- goto failed_mount;
- }
- if (sbi->s_blocks_per_group !=
- (sbi->s_clusters_per_group * (clustersize / blocksize))) {
- ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
- "clusters per group (%lu) inconsistent",
- sbi->s_blocks_per_group,
- sbi->s_clusters_per_group);
- goto failed_mount;
- }
- } else {
- if (clustersize != blocksize) {
- ext4_msg(sb, KERN_ERR,
- "fragment/cluster size (%d) != "
- "block size (%d)", clustersize, blocksize);
- goto failed_mount;
- }
- if (sbi->s_blocks_per_group > blocksize * 8) {
- ext4_msg(sb, KERN_ERR,
- "#blocks per group too big: %lu",
- sbi->s_blocks_per_group);
- goto failed_mount;
- }
- sbi->s_clusters_per_group = sbi->s_blocks_per_group;
- sbi->s_cluster_bits = 0;
- }
- sbi->s_cluster_ratio = clustersize / blocksize;
-
- /* Do we have standard group size of clustersize * 8 blocks ? */
- if (sbi->s_blocks_per_group == clustersize << 3)
- set_opt2(sb, STD_GROUP_SIZE);
+ if (ext4_handle_clustersize(sb))
+ goto failed_mount;
/*
* Test whether we have more sectors than will fit in sector_t,
@@ -4896,111 +5217,12 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
goto failed_mount;
}
- if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
- goto cantfind_ext4;
-
- /* check blocks count against device size */
- blocks_count = sb_bdev_nr_blocks(sb);
- if (blocks_count && ext4_blocks_count(es) > blocks_count) {
- ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
- "exceeds size of device (%llu blocks)",
- ext4_blocks_count(es), blocks_count);
+ if (ext4_geometry_check(sb, es))
goto failed_mount;
- }
- /*
- * It makes no sense for the first data block to be beyond the end
- * of the filesystem.
- */
- if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
- ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
- "block %u is beyond end of filesystem (%llu)",
- le32_to_cpu(es->s_first_data_block),
- ext4_blocks_count(es));
- goto failed_mount;
- }
- if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) &&
- (sbi->s_cluster_ratio == 1)) {
- ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
- "block is 0 with a 1k block and cluster size");
- goto failed_mount;
- }
-
- blocks_count = (ext4_blocks_count(es) -
- le32_to_cpu(es->s_first_data_block) +
- EXT4_BLOCKS_PER_GROUP(sb) - 1);
- do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
- if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
- ext4_msg(sb, KERN_WARNING, "groups count too large: %llu "
- "(block count %llu, first data block %u, "
- "blocks per group %lu)", blocks_count,
- ext4_blocks_count(es),
- le32_to_cpu(es->s_first_data_block),
- EXT4_BLOCKS_PER_GROUP(sb));
- goto failed_mount;
- }
- sbi->s_groups_count = blocks_count;
- sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
- (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
- if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) !=
- le32_to_cpu(es->s_inodes_count)) {
- ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu",
- le32_to_cpu(es->s_inodes_count),
- ((u64)sbi->s_groups_count * sbi->s_inodes_per_group));
- ret = -EINVAL;
- goto failed_mount;
- }
- db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
- EXT4_DESC_PER_BLOCK(sb);
- if (ext4_has_feature_meta_bg(sb)) {
- if (le32_to_cpu(es->s_first_meta_bg) > db_count) {
- ext4_msg(sb, KERN_WARNING,
- "first meta block group too large: %u "
- "(group descriptor block count %u)",
- le32_to_cpu(es->s_first_meta_bg), db_count);
- goto failed_mount;
- }
- }
- rcu_assign_pointer(sbi->s_group_desc,
- kvmalloc_array(db_count,
- sizeof(struct buffer_head *),
- GFP_KERNEL));
- if (sbi->s_group_desc == NULL) {
- ext4_msg(sb, KERN_ERR, "not enough memory");
- ret = -ENOMEM;
+ err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed);
+ if (err)
goto failed_mount;
- }
-
- bgl_lock_init(sbi->s_blockgroup_lock);
-
- /* Pre-read the descriptors into the buffer cache */
- for (i = 0; i < db_count; i++) {
- block = descriptor_loc(sb, logical_sb_block, i);
- ext4_sb_breadahead_unmovable(sb, block);
- }
-
- for (i = 0; i < db_count; i++) {
- struct buffer_head *bh;
-
- block = descriptor_loc(sb, logical_sb_block, i);
- bh = ext4_sb_bread_unmovable(sb, block);
- if (IS_ERR(bh)) {
- ext4_msg(sb, KERN_ERR,
- "can't read group descriptor %d", i);
- db_count = i;
- ret = PTR_ERR(bh);
- goto failed_mount2;
- }
- rcu_read_lock();
- rcu_dereference(sbi->s_group_desc)[i] = bh;
- rcu_read_unlock();
- }
- sbi->s_gdb_count = db_count;
- if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
- ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
- ret = -EFSCORRUPTED;
- goto failed_mount2;
- }
timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
spin_lock_init(&sbi->s_error_lock);
@@ -5038,24 +5260,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
mutex_init(&sbi->s_orphan_lock);
- /* Initialize fast commit stuff */
- atomic_set(&sbi->s_fc_subtid, 0);
- INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]);
- INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]);
- INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
- INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
- sbi->s_fc_bytes = 0;
- ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
- sbi->s_fc_ineligible_tid = 0;
- spin_lock_init(&sbi->s_fc_lock);
- memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
- sbi->s_fc_replay_state.fc_regions = NULL;
- sbi->s_fc_replay_state.fc_regions_size = 0;
- sbi->s_fc_replay_state.fc_regions_used = 0;
- sbi->s_fc_replay_state.fc_regions_valid = 0;
- sbi->s_fc_replay_state.fc_modified_inodes = NULL;
- sbi->s_fc_replay_state.fc_modified_inodes_size = 0;
- sbi->s_fc_replay_state.fc_modified_inodes_used = 0;
+ ext4_fast_commit_init(sb);
sb->s_root = NULL;
@@ -5072,37 +5277,38 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
* root first: it may be modified in the journal!
*/
if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
- err = ext4_load_journal(sb, es, ctx->journal_devnum);
+ err = ext4_load_and_init_journal(sb, es, ctx);
if (err)
goto failed_mount3a;
} else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) &&
ext4_has_feature_journal_needs_recovery(sb)) {
ext4_msg(sb, KERN_ERR, "required journal recovery "
"suppressed and not mounted read-only");
- goto failed_mount_wq;
+ goto failed_mount3a;
} else {
/* Nojournal mode, all journal mount options are illegal */
- if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "journal_checksum, fs mounted w/o journal");
- goto failed_mount_wq;
- }
if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"journal_async_commit, fs mounted w/o journal");
- goto failed_mount_wq;
+ goto failed_mount3a;
+ }
+
+ if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "journal_checksum, fs mounted w/o journal");
+ goto failed_mount3a;
}
if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"commit=%lu, fs mounted w/o journal",
sbi->s_commit_interval / HZ);
- goto failed_mount_wq;
+ goto failed_mount3a;
}
if (EXT4_MOUNT_DATA_FLAGS &
(sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"data=, fs mounted w/o journal");
- goto failed_mount_wq;
+ goto failed_mount3a;
}
sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
clear_opt(sb, JOURNAL_CHECKSUM);
@@ -5110,76 +5316,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
clear_opt2(sb, JOURNAL_FAST_COMMIT);
sbi->s_journal = NULL;
needs_recovery = 0;
- goto no_journal;
}
- if (ext4_has_feature_64bit(sb) &&
- !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
- JBD2_FEATURE_INCOMPAT_64BIT)) {
- ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
- goto failed_mount_wq;
- }
-
- if (!set_journal_csum_feature_set(sb)) {
- ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
- "feature set");
- goto failed_mount_wq;
- }
-
- if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
- !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
- JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) {
- ext4_msg(sb, KERN_ERR,
- "Failed to set fast commit journal feature");
- goto failed_mount_wq;
- }
-
- /* We have now updated the journal if required, so we can
- * validate the data journaling mode. */
- switch (test_opt(sb, DATA_FLAGS)) {
- case 0:
- /* No mode set, assume a default based on the journal
- * capabilities: ORDERED_DATA if the journal can
- * cope, else JOURNAL_DATA
- */
- if (jbd2_journal_check_available_features
- (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
- set_opt(sb, ORDERED_DATA);
- sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
- } else {
- set_opt(sb, JOURNAL_DATA);
- sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
- }
- break;
-
- case EXT4_MOUNT_ORDERED_DATA:
- case EXT4_MOUNT_WRITEBACK_DATA:
- if (!jbd2_journal_check_available_features
- (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
- ext4_msg(sb, KERN_ERR, "Journal does not support "
- "requested data journaling mode");
- goto failed_mount_wq;
- }
- break;
- default:
- break;
- }
-
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
- test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "journal_async_commit in data=ordered mode");
- goto failed_mount_wq;
- }
-
- set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);
-
- sbi->s_journal->j_submit_inode_data_buffers =
- ext4_journal_submit_inode_data_buffers;
- sbi->s_journal->j_finish_inode_data_buffers =
- ext4_journal_finish_inode_data_buffers;
-
-no_journal:
if (!test_opt(sb, NO_MBCACHE)) {
sbi->s_ea_block_cache = ext4_xattr_create_cache();
if (!sbi->s_ea_block_cache) {
@@ -5198,7 +5336,7 @@ no_journal:
}
}
- if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) {
+ if (ext4_has_feature_verity(sb) && sb->s_blocksize != PAGE_SIZE) {
ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity");
goto failed_mount_wq;
}
@@ -5408,11 +5546,6 @@ no_journal:
return 0;
-cantfind_ext4:
- if (!silent)
- ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
- goto failed_mount;
-
failed_mount9:
ext4_release_orphan_info(sb);
failed_mount8:
@@ -5466,13 +5599,7 @@ failed_mount3:
flush_work(&sbi->s_error_work);
del_timer_sync(&sbi->s_err_report);
ext4_stop_mmpd(sbi);
-failed_mount2:
- rcu_read_lock();
- group_desc = rcu_dereference(sbi->s_group_desc);
- for (i = 0; i < db_count; i++)
- brelse(group_desc[i]);
- kvfree(group_desc);
- rcu_read_unlock();
+ ext4_group_desc_free(sbi);
failed_mount:
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
@@ -5487,7 +5614,7 @@ failed_mount:
#endif
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
/* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */
- brelse(bh);
+ brelse(sbi->s_sbh);
ext4_blkdev_remove(sbi);
out_fail:
sb->s_fs_info = NULL;
@@ -5529,8 +5656,9 @@ static int ext4_fill_super(struct super_block *sb, struct fs_context *fc)
descr = "out journal";
if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
- ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
- "Quota mode: %s.", descr, ext4_quota_mode(sb));
+ ext4_msg(sb, KERN_INFO, "mounted filesystem %pU with%s. "
+ "Quota mode: %s.", &sb->s_uuid, descr,
+ ext4_quota_mode(sb));
/* Update the s_overhead_clusters if necessary */
ext4_update_overhead(sb, false);
@@ -5597,7 +5725,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
ext4_debug("Journal inode found at %p: %lld bytes\n",
journal_inode, journal_inode->i_size);
- if (!S_ISREG(journal_inode->i_mode)) {
+ if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) {
ext4_msg(sb, KERN_ERR, "invalid journal inode");
iput(journal_inode);
return NULL;
@@ -6485,8 +6613,8 @@ static int ext4_reconfigure(struct fs_context *fc)
if (ret < 0)
return ret;
- ext4_msg(sb, KERN_INFO, "re-mounted. Quota mode: %s.",
- ext4_quota_mode(sb));
+ ext4_msg(sb, KERN_INFO, "re-mounted %pU. Quota mode: %s.",
+ &sb->s_uuid, ext4_quota_mode(sb));
return 0;
}
@@ -6653,7 +6781,7 @@ static int ext4_write_info(struct super_block *sb, int type)
handle_t *handle;
/* Data block + inode block */
- handle = ext4_journal_start(d_inode(sb->s_root), EXT4_HT_QUOTA, 2);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2);
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_commit_info(sb, type);
@@ -6760,6 +6888,20 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
return err;
}
+static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum)
+{
+ switch (type) {
+ case USRQUOTA:
+ return qf_inum == EXT4_USR_QUOTA_INO;
+ case GRPQUOTA:
+ return qf_inum == EXT4_GRP_QUOTA_INO;
+ case PRJQUOTA:
+ return qf_inum >= EXT4_GOOD_OLD_FIRST_INO;
+ default:
+ BUG();
+ }
+}
+
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
unsigned int flags)
{
@@ -6776,9 +6918,16 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
if (!qf_inums[type])
return -EPERM;
+ if (!ext4_check_quota_inum(type, qf_inums[type])) {
+ ext4_error(sb, "Bad quota inum: %lu, type: %d",
+ qf_inums[type], type);
+ return -EUCLEAN;
+ }
+
qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL);
if (IS_ERR(qf_inode)) {
- ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]);
+ ext4_error(sb, "Bad quota inode: %lu, type: %d",
+ qf_inums[type], type);
return PTR_ERR(qf_inode);
}
@@ -6817,8 +6966,9 @@ int ext4_enable_quotas(struct super_block *sb)
if (err) {
ext4_warning(sb,
"Failed to enable quota tracking "
- "(type=%d, err=%d). Please run "
- "e2fsck to fix.", type, err);
+ "(type=%d, err=%d, ino=%lu). "
+ "Please run e2fsck to fix.", type,
+ err, qf_inums[type]);
for (type--; type >= 0; type--) {
struct inode *inode;
@@ -6905,8 +7055,7 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
len = i_size-off;
toread = len;
while (toread > 0) {
- tocopy = sb->s_blocksize - offset < toread ?
- sb->s_blocksize - offset : toread;
+ tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread);
bh = ext4_bread(NULL, inode, blk, 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index b051d19b5c8a..30e3b65798b5 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -79,7 +79,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count,
size_t n = min_t(size_t, count,
PAGE_SIZE - offset_in_page(pos));
struct page *page;
- void *fsdata;
+ void *fsdata = NULL;
int res;
res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata);
@@ -298,16 +298,14 @@ static int ext4_get_verity_descriptor_location(struct inode *inode,
last_extent = path[path->p_depth].p_ext;
if (!last_extent) {
EXT4_ERROR_INODE(inode, "verity file has no extents");
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return -EFSCORRUPTED;
}
end_lblk = le32_to_cpu(last_extent->ee_block) +
ext4_ext_get_actual_len(last_extent);
desc_size_pos = (u64)end_lblk << inode->i_blkbits;
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
if (desc_size_pos < sizeof(desc_size_disk))
goto bad;
@@ -365,13 +363,14 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
unsigned long num_ra_pages)
{
- DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
struct page *page;
index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
if (!page || !PageUptodate(page)) {
+ DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
+
if (page)
put_page(page);
else if (num_ra_pages > 1)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 533216e80fa2..7decaaf27e82 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1281,7 +1281,7 @@ retry_ref:
ce = mb_cache_entry_get(ea_block_cache, hash,
bh->b_blocknr);
if (ce) {
- ce->e_reusable = 1;
+ set_bit(MBE_REUSABLE_B, &ce->e_flags);
mb_cache_entry_put(ea_block_cache, ce);
}
}
@@ -1441,6 +1441,9 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
if (!err)
err = ext4_inode_attach_jinode(ea_inode);
if (err) {
+ if (ext4_xattr_inode_dec_ref(handle, ea_inode))
+ ext4_warning_inode(ea_inode,
+ "cleanup dec ref error %d", err);
iput(ea_inode);
return ERR_PTR(err);
}
@@ -1540,7 +1543,8 @@ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
if (err) {
- ext4_xattr_inode_dec_ref(handle, ea_inode);
+ if (ext4_xattr_inode_dec_ref(handle, ea_inode))
+ ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err);
iput(ea_inode);
return err;
}
@@ -2042,7 +2046,7 @@ inserted:
}
BHDR(new_bh)->h_refcount = cpu_to_le32(ref);
if (ref == EXT4_XATTR_REFCOUNT_MAX)
- ce->e_reusable = 0;
+ clear_bit(MBE_REUSABLE_B, &ce->e_flags);
ea_bdebug(new_bh, "reusing; refcount now=%d",
ref);
ext4_xattr_block_csum_set(inode, new_bh);
@@ -2070,19 +2074,11 @@ inserted:
goal = ext4_group_first_block_no(sb,
EXT4_I(inode)->i_block_group);
-
- /* non-extent files can't have physical blocks past 2^32 */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
-
block = ext4_new_meta_blocks(handle, inode, goal, 0,
NULL, &error);
if (error)
goto cleanup;
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
-
ea_idebug(inode, "creating block %llu",
(unsigned long long)block);
@@ -2412,6 +2408,7 @@ retry_inode:
if (!error) {
ext4_xattr_update_super_block(handle, inode->i_sb);
inode->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
if (!value)
no_expand = 0;
error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
@@ -2554,7 +2551,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS);
bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS);
- buffer = kmalloc(value_size, GFP_NOFS);
+ buffer = kvmalloc(value_size, GFP_NOFS);
b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS);
if (!is || !bs || !buffer || !b_entry_name) {
error = -ENOMEM;
@@ -2606,7 +2603,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
error = 0;
out:
kfree(b_entry_name);
- kfree(buffer);
+ kvfree(buffer);
if (is)
brelse(is->iloc.bh);
if (bs)
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index eaa240b21f07..c1c74aa658ae 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -219,7 +219,7 @@ static int f2fs_acl_update_mode(struct user_namespace *mnt_userns,
return error;
if (error == 0)
*acl = NULL;
- if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) &&
+ if (!vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)) &&
!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
mode &= ~S_ISGID;
*mode_p = mode;
@@ -276,9 +276,11 @@ static int __f2fs_set_acl(struct user_namespace *mnt_userns,
return error;
}
-int f2fs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int f2fs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
+ struct inode *inode = d_inode(dentry);
+
if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
return -EIO;
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
index a26e33cab4ff..ea2bbb3f264b 100644
--- a/fs/f2fs/acl.h
+++ b/fs/f2fs/acl.h
@@ -34,7 +34,7 @@ struct f2fs_acl_header {
#ifdef CONFIG_F2FS_FS_POSIX_ACL
extern struct posix_acl *f2fs_get_acl(struct inode *, int, bool);
-extern int f2fs_set_acl(struct user_namespace *, struct inode *,
+extern int f2fs_set_acl(struct user_namespace *, struct dentry *,
struct posix_acl *, int);
extern int f2fs_init_acl(struct inode *, struct inode *, struct page *,
struct page *);
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 8259e0fa97e1..56f7d0d6a8b2 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -26,12 +26,16 @@
static struct kmem_cache *ino_entry_slab;
struct kmem_cache *f2fs_inode_entry_slab;
-void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io)
+void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io,
+ unsigned char reason)
{
f2fs_build_fault_attr(sbi, 0, 0);
set_ckpt_flags(sbi, CP_ERROR_FLAG);
- if (!end_io)
+ if (!end_io) {
f2fs_flush_merged_writes(sbi);
+
+ f2fs_handle_stop(sbi, reason);
+ }
}
/*
@@ -89,7 +93,7 @@ repeat:
return ERR_PTR(err);
}
- f2fs_update_iostat(sbi, FS_META_READ_IO, F2FS_BLKSIZE);
+ f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, F2FS_BLKSIZE);
lock_page(page);
if (unlikely(page->mapping != mapping)) {
@@ -122,7 +126,7 @@ retry:
if (PTR_ERR(page) == -EIO &&
++count <= DEFAULT_RETRY_IO_COUNT)
goto retry;
- f2fs_stop_checkpoint(sbi, false);
+ f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_META_PAGE);
}
return page;
}
@@ -140,7 +144,7 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr,
unsigned int segno, offset;
bool exist;
- if (type != DATA_GENERIC_ENHANCE && type != DATA_GENERIC_ENHANCE_READ)
+ if (type == DATA_GENERIC)
return true;
segno = GET_SEGNO(sbi, blkaddr);
@@ -148,6 +152,13 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr,
se = get_seg_entry(sbi, segno);
exist = f2fs_test_bit(offset, se->cur_valid_map);
+ if (exist && type == DATA_GENERIC_ENHANCE_UPDATE) {
+ f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d",
+ blkaddr, exist);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ return exist;
+ }
+
if (!exist && type == DATA_GENERIC_ENHANCE) {
f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d",
blkaddr, exist);
@@ -160,6 +171,11 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr,
bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
block_t blkaddr, int type)
{
+ if (time_to_inject(sbi, FAULT_BLKADDR)) {
+ f2fs_show_injection_info(sbi, FAULT_BLKADDR);
+ return false;
+ }
+
switch (type) {
case META_NAT:
break;
@@ -185,6 +201,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
case DATA_GENERIC:
case DATA_GENERIC_ENHANCE:
case DATA_GENERIC_ENHANCE_READ:
+ case DATA_GENERIC_ENHANCE_UPDATE:
if (unlikely(blkaddr >= MAX_BLKADDR(sbi) ||
blkaddr < MAIN_BLKADDR(sbi))) {
f2fs_warn(sbi, "access invalid blkaddr:%u",
@@ -276,7 +293,8 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
f2fs_put_page(page, err ? 1 : 0);
if (!err)
- f2fs_update_iostat(sbi, FS_META_READ_IO, F2FS_BLKSIZE);
+ f2fs_update_iostat(sbi, NULL, FS_META_READ_IO,
+ F2FS_BLKSIZE);
}
out:
blk_finish_plug(&plug);
@@ -448,8 +466,7 @@ static bool f2fs_dirty_meta_folio(struct address_space *mapping,
if (!folio_test_uptodate(folio))
folio_mark_uptodate(folio);
- if (!folio_test_dirty(folio)) {
- filemap_dirty_folio(mapping, folio);
+ if (filemap_dirty_folio(mapping, folio)) {
inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_META);
set_page_private_reference(&folio->page);
return true;
@@ -1053,7 +1070,8 @@ void f2fs_remove_dirty_inode(struct inode *inode)
spin_unlock(&sbi->inode_lock[type]);
}
-int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type)
+int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type,
+ bool from_cp)
{
struct list_head *head;
struct inode *inode;
@@ -1088,11 +1106,15 @@ retry:
if (inode) {
unsigned long cur_ino = inode->i_ino;
- F2FS_I(inode)->cp_task = current;
+ if (from_cp)
+ F2FS_I(inode)->cp_task = current;
+ F2FS_I(inode)->wb_task = current;
filemap_fdatawrite(inode->i_mapping);
- F2FS_I(inode)->cp_task = NULL;
+ F2FS_I(inode)->wb_task = NULL;
+ if (from_cp)
+ F2FS_I(inode)->cp_task = NULL;
iput(inode);
/* We need to give cpu to another writers. */
@@ -1221,7 +1243,7 @@ retry_flush_dents:
/* write all the dirty dentry pages */
if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
f2fs_unlock_all(sbi);
- err = f2fs_sync_dirty_inodes(sbi, DIR_INODE);
+ err = f2fs_sync_dirty_inodes(sbi, DIR_INODE, true);
if (err)
return err;
cond_resched();
@@ -1880,8 +1902,10 @@ int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi)
cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi,
"f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(cprc->f2fs_issue_ckpt)) {
+ int err = PTR_ERR(cprc->f2fs_issue_ckpt);
+
cprc->f2fs_issue_ckpt = NULL;
- return -ENOMEM;
+ return err;
}
set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio);
@@ -1892,15 +1916,27 @@ int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi)
void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi)
{
struct ckpt_req_control *cprc = &sbi->cprc_info;
+ struct task_struct *ckpt_task;
- if (cprc->f2fs_issue_ckpt) {
- struct task_struct *ckpt_task = cprc->f2fs_issue_ckpt;
+ if (!cprc->f2fs_issue_ckpt)
+ return;
- cprc->f2fs_issue_ckpt = NULL;
- kthread_stop(ckpt_task);
+ ckpt_task = cprc->f2fs_issue_ckpt;
+ cprc->f2fs_issue_ckpt = NULL;
+ kthread_stop(ckpt_task);
- flush_remained_ckpt_reqs(sbi, NULL);
- }
+ f2fs_flush_ckpt_thread(sbi);
+}
+
+void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi)
+{
+ struct ckpt_req_control *cprc = &sbi->cprc_info;
+
+ flush_remained_ckpt_reqs(sbi, NULL);
+
+ /* Let's wait for the previous dispatched checkpoint. */
+ while (atomic_read(&cprc->queued_ckpt))
+ io_schedule_timeout(DEFAULT_IO_TIMEOUT);
}
void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 70e97075e535..2532f369cb10 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -346,7 +346,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc)
if (!level)
level = F2FS_ZSTD_DEFAULT_CLEVEL;
- params = zstd_get_params(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen);
+ params = zstd_get_params(level, cc->rlen);
workspace_size = zstd_cstream_workspace_bound(&params.cParams);
workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode),
@@ -567,10 +567,7 @@ MODULE_PARM_DESC(num_compress_pages,
int f2fs_init_compress_mempool(void)
{
compress_page_pool = mempool_create_page_pool(num_compress_pages, 0);
- if (!compress_page_pool)
- return -ENOMEM;
-
- return 0;
+ return compress_page_pool ? 0 : -ENOMEM;
}
void f2fs_destroy_compress_mempool(void)
@@ -762,6 +759,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task)
if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) {
ret = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_FAIL_DECOMPRESSION);
goto out_release;
}
@@ -912,17 +910,15 @@ bool f2fs_sanity_check_cluster(struct dnode_of_data *dn)
reason = "[C|*|C|*]";
goto out;
}
- if (compressed) {
- if (!__is_valid_data_blkaddr(blkaddr)) {
- if (!cluster_end)
- cluster_end = i;
- continue;
- }
- /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */
- if (cluster_end) {
- reason = "[C|N|N|V]";
- goto out;
- }
+ if (!__is_valid_data_blkaddr(blkaddr)) {
+ if (!cluster_end)
+ cluster_end = i;
+ continue;
+ }
+ /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */
+ if (cluster_end) {
+ reason = "[C|N|N|V]";
+ goto out;
}
}
return false;
@@ -952,6 +948,7 @@ static int __f2fs_cluster_blocks(struct inode *inode,
if (f2fs_sanity_check_cluster(&dn)) {
ret = -EFSCORRUPTED;
+ f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_CLUSTER);
goto fail;
}
@@ -1568,12 +1565,8 @@ static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic,
if (!dic->cbuf)
return -ENOMEM;
- if (cops->init_decompress_ctx) {
- int ret = cops->init_decompress_ctx(dic);
-
- if (ret)
- return ret;
- }
+ if (cops->init_decompress_ctx)
+ return cops->init_decompress_ctx(dic);
return 0;
}
@@ -1715,50 +1708,27 @@ static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task)
}
}
-/*
- * Update and unlock the cluster's pagecache pages, and release the reference to
- * the decompress_io_ctx that was being held for I/O completion.
- */
-static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
- bool in_task)
+static void f2fs_verify_cluster(struct work_struct *work)
{
+ struct decompress_io_ctx *dic =
+ container_of(work, struct decompress_io_ctx, verity_work);
int i;
+ /* Verify, update, and unlock the decompressed pages. */
for (i = 0; i < dic->cluster_size; i++) {
struct page *rpage = dic->rpages[i];
if (!rpage)
continue;
- /* PG_error was set if verity failed. */
- if (failed || PageError(rpage)) {
- ClearPageUptodate(rpage);
- /* will re-read again later */
- ClearPageError(rpage);
- } else {
+ if (fsverity_verify_page(rpage))
SetPageUptodate(rpage);
- }
+ else
+ ClearPageUptodate(rpage);
unlock_page(rpage);
}
- f2fs_put_dic(dic, in_task);
-}
-
-static void f2fs_verify_cluster(struct work_struct *work)
-{
- struct decompress_io_ctx *dic =
- container_of(work, struct decompress_io_ctx, verity_work);
- int i;
-
- /* Verify the cluster's decompressed pages with fs-verity. */
- for (i = 0; i < dic->cluster_size; i++) {
- struct page *rpage = dic->rpages[i];
-
- if (rpage && !fsverity_verify_page(rpage))
- SetPageError(rpage);
- }
-
- __f2fs_decompress_end_io(dic, false, true);
+ f2fs_put_dic(dic, true);
}
/*
@@ -1768,6 +1738,8 @@ static void f2fs_verify_cluster(struct work_struct *work)
void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
bool in_task)
{
+ int i;
+
if (!failed && dic->need_verity) {
/*
* Note that to avoid deadlocks, the verity work can't be done
@@ -1777,9 +1749,28 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
*/
INIT_WORK(&dic->verity_work, f2fs_verify_cluster);
fsverity_enqueue_verify_work(&dic->verity_work);
- } else {
- __f2fs_decompress_end_io(dic, failed, in_task);
+ return;
}
+
+ /* Update and unlock the cluster's pagecache pages. */
+ for (i = 0; i < dic->cluster_size; i++) {
+ struct page *rpage = dic->rpages[i];
+
+ if (!rpage)
+ continue;
+
+ if (failed)
+ ClearPageUptodate(rpage);
+ else
+ SetPageUptodate(rpage);
+ unlock_page(rpage);
+ }
+
+ /*
+ * Release the reference to the decompress_io_ctx that was being held
+ * for I/O completion.
+ */
+ f2fs_put_dic(dic, in_task);
}
/*
@@ -1905,7 +1896,7 @@ bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page,
void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino)
{
- struct address_space *mapping = sbi->compress_inode->i_mapping;
+ struct address_space *mapping = COMPRESS_MAPPING(sbi);
struct folio_batch fbatch;
pgoff_t index = 0;
pgoff_t end = MAX_BLKADDR(sbi);
@@ -1987,9 +1978,7 @@ int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi)
sbi->page_array_slab = f2fs_kmem_cache_create(slab_name,
sbi->page_array_slab_size);
- if (!sbi->page_array_slab)
- return -ENOMEM;
- return 0;
+ return sbi->page_array_slab ? 0 : -ENOMEM;
}
void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi)
@@ -1997,53 +1986,24 @@ void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi)
kmem_cache_destroy(sbi->page_array_slab);
}
-static int __init f2fs_init_cic_cache(void)
+int __init f2fs_init_compress_cache(void)
{
cic_entry_slab = f2fs_kmem_cache_create("f2fs_cic_entry",
sizeof(struct compress_io_ctx));
if (!cic_entry_slab)
return -ENOMEM;
- return 0;
-}
-
-static void f2fs_destroy_cic_cache(void)
-{
- kmem_cache_destroy(cic_entry_slab);
-}
-
-static int __init f2fs_init_dic_cache(void)
-{
dic_entry_slab = f2fs_kmem_cache_create("f2fs_dic_entry",
sizeof(struct decompress_io_ctx));
if (!dic_entry_slab)
- return -ENOMEM;
- return 0;
-}
-
-static void f2fs_destroy_dic_cache(void)
-{
- kmem_cache_destroy(dic_entry_slab);
-}
-
-int __init f2fs_init_compress_cache(void)
-{
- int err;
-
- err = f2fs_init_cic_cache();
- if (err)
- goto out;
- err = f2fs_init_dic_cache();
- if (err)
goto free_cic;
return 0;
free_cic:
- f2fs_destroy_cic_cache();
-out:
+ kmem_cache_destroy(cic_entry_slab);
return -ENOMEM;
}
void f2fs_destroy_compress_cache(void)
{
- f2fs_destroy_dic_cache();
- f2fs_destroy_cic_cache();
+ kmem_cache_destroy(dic_entry_slab);
+ kmem_cache_destroy(cic_entry_slab);
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index aa3ccddfa037..6e43e19c7d1c 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -39,10 +39,8 @@ static struct bio_set f2fs_bioset;
int __init f2fs_init_bioset(void)
{
- if (bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE,
- 0, BIOSET_NEED_BVECS))
- return -ENOMEM;
- return 0;
+ return bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE,
+ 0, BIOSET_NEED_BVECS);
}
void f2fs_destroy_bioset(void)
@@ -116,43 +114,56 @@ struct bio_post_read_ctx {
struct f2fs_sb_info *sbi;
struct work_struct work;
unsigned int enabled_steps;
+ /*
+ * decompression_attempted keeps track of whether
+ * f2fs_end_read_compressed_page() has been called on the pages in the
+ * bio that belong to a compressed cluster yet.
+ */
+ bool decompression_attempted;
block_t fs_blkaddr;
};
+/*
+ * Update and unlock a bio's pages, and free the bio.
+ *
+ * This marks pages up-to-date only if there was no error in the bio (I/O error,
+ * decryption error, or verity error), as indicated by bio->bi_status.
+ *
+ * "Compressed pages" (pagecache pages backed by a compressed cluster on-disk)
+ * aren't marked up-to-date here, as decompression is done on a per-compression-
+ * cluster basis rather than a per-bio basis. Instead, we only must do two
+ * things for each compressed page here: call f2fs_end_read_compressed_page()
+ * with failed=true if an error occurred before it would have normally gotten
+ * called (i.e., I/O error or decryption error, but *not* verity error), and
+ * release the bio's reference to the decompress_io_ctx of the page's cluster.
+ */
static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
{
struct bio_vec *bv;
struct bvec_iter_all iter_all;
+ struct bio_post_read_ctx *ctx = bio->bi_private;
- /*
- * Update and unlock the bio's pagecache pages, and put the
- * decompression context for any compressed pages.
- */
bio_for_each_segment_all(bv, bio, iter_all) {
struct page *page = bv->bv_page;
if (f2fs_is_compressed_page(page)) {
- if (bio->bi_status)
+ if (ctx && !ctx->decompression_attempted)
f2fs_end_read_compressed_page(page, true, 0,
in_task);
f2fs_put_page_dic(page, in_task);
continue;
}
- /* PG_error was set if decryption or verity failed. */
- if (bio->bi_status || PageError(page)) {
+ if (bio->bi_status)
ClearPageUptodate(page);
- /* will re-read again later */
- ClearPageError(page);
- } else {
+ else
SetPageUptodate(page);
- }
dec_page_count(F2FS_P_SB(page), __read_io_type(page));
unlock_page(page);
}
- if (bio->bi_private)
- mempool_free(bio->bi_private, bio_post_read_ctx_pool);
+ if (ctx)
+ mempool_free(ctx, bio_post_read_ctx_pool);
bio_put(bio);
}
@@ -185,8 +196,10 @@ static void f2fs_verify_bio(struct work_struct *work)
struct page *page = bv->bv_page;
if (!f2fs_is_compressed_page(page) &&
- !PageError(page) && !fsverity_verify_page(page))
- SetPageError(page);
+ !fsverity_verify_page(page)) {
+ bio->bi_status = BLK_STS_IOERR;
+ break;
+ }
}
} else {
fsverity_verify_bio(bio);
@@ -236,16 +249,17 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx,
bio_for_each_segment_all(bv, ctx->bio, iter_all) {
struct page *page = bv->bv_page;
- /* PG_error was set if decryption failed. */
if (f2fs_is_compressed_page(page))
- f2fs_end_read_compressed_page(page, PageError(page),
- blkaddr, in_task);
+ f2fs_end_read_compressed_page(page, false, blkaddr,
+ in_task);
else
all_compressed = false;
blkaddr++;
}
+ ctx->decompression_attempted = true;
+
/*
* Optimization: if all the bio's pages are compressed, then scheduling
* the per-bio verity work is unnecessary, as verity will be fully
@@ -259,14 +273,17 @@ static void f2fs_post_read_work(struct work_struct *work)
{
struct bio_post_read_ctx *ctx =
container_of(work, struct bio_post_read_ctx, work);
+ struct bio *bio = ctx->bio;
- if (ctx->enabled_steps & STEP_DECRYPT)
- fscrypt_decrypt_bio(ctx->bio);
+ if ((ctx->enabled_steps & STEP_DECRYPT) && !fscrypt_decrypt_bio(bio)) {
+ f2fs_finish_read_bio(bio, true);
+ return;
+ }
if (ctx->enabled_steps & STEP_DECOMPRESS)
f2fs_handle_step_decompress(ctx, true);
- f2fs_verify_and_finish_bio(ctx->bio, true);
+ f2fs_verify_and_finish_bio(bio, true);
}
static void f2fs_read_end_io(struct bio *bio)
@@ -333,7 +350,8 @@ static void f2fs_write_end_io(struct bio *bio)
mempool_free(page, sbi->write_io_dummy);
if (unlikely(bio->bi_status))
- f2fs_stop_checkpoint(sbi, true);
+ f2fs_stop_checkpoint(sbi, true,
+ STOP_CP_REASON_WRITE_FAIL);
continue;
}
@@ -349,7 +367,8 @@ static void f2fs_write_end_io(struct bio *bio)
if (unlikely(bio->bi_status)) {
mapping_set_error(page->mapping, -EIO);
if (type == F2FS_WB_CP_DATA)
- f2fs_stop_checkpoint(sbi, true);
+ f2fs_stop_checkpoint(sbi, true,
+ STOP_CP_REASON_WRITE_FAIL);
}
f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) &&
@@ -703,8 +722,10 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr,
fio->is_por ? META_POR : (__is_meta_io(fio) ?
- META_GENERIC : DATA_GENERIC_ENHANCE)))
+ META_GENERIC : DATA_GENERIC_ENHANCE))) {
+ f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR);
return -EFSCORRUPTED;
+ }
trace_f2fs_submit_page_bio(page, fio);
@@ -723,7 +744,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE);
inc_page_count(fio->sbi, is_read_io(fio->op) ?
- __read_io_type(page): WB_DATA_TYPE(fio->page));
+ __read_io_type(page) : WB_DATA_TYPE(fio->page));
__submit_bio(fio->sbi, bio, fio->type);
return 0;
@@ -904,8 +925,10 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
fio->encrypted_page : fio->page;
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr,
- __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC))
+ __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) {
+ f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR);
return -EFSCORRUPTED;
+ }
trace_f2fs_submit_page_bio(page, fio);
@@ -1054,6 +1077,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
ctx->sbi = sbi;
ctx->enabled_steps = post_read_steps;
ctx->fs_blkaddr = blkaddr;
+ ctx->decompression_attempted = false;
bio->bi_private = ctx;
}
iostat_alloc_and_bind_ctx(sbi, bio, ctx);
@@ -1081,9 +1105,8 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page,
bio_put(bio);
return -EFAULT;
}
- ClearPageError(page);
inc_page_count(sbi, F2FS_RD_DATA);
- f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE);
+ f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE);
__submit_bio(sbi, bio, DATA);
return 0;
}
@@ -1120,7 +1143,7 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
{
dn->data_blkaddr = blkaddr;
f2fs_set_data_blkaddr(dn);
- f2fs_update_extent_cache(dn);
+ f2fs_update_read_extent_cache(dn);
}
/* dn->ofs_in_node will be returned with up-to-date last block pointer */
@@ -1189,7 +1212,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index)
struct extent_info ei = {0, };
struct inode *inode = dn->inode;
- if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ if (f2fs_lookup_read_extent_cache(inode, index, &ei)) {
dn->data_blkaddr = ei.blk + index - ei.fofs;
return 0;
}
@@ -1198,7 +1221,8 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index)
}
struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
- blk_opf_t op_flags, bool for_write)
+ blk_opf_t op_flags, bool for_write,
+ pgoff_t *next_pgofs)
{
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
@@ -1210,11 +1234,13 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
if (!page)
return ERR_PTR(-ENOMEM);
- if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ if (f2fs_lookup_read_extent_cache(inode, index, &ei)) {
dn.data_blkaddr = ei.blk + index - ei.fofs;
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
DATA_GENERIC_ENHANCE_READ)) {
err = -EFSCORRUPTED;
+ f2fs_handle_error(F2FS_I_SB(inode),
+ ERROR_INVALID_BLKADDR);
goto put_err;
}
goto got_it;
@@ -1222,12 +1248,17 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
- if (err)
+ if (err) {
+ if (err == -ENOENT && next_pgofs)
+ *next_pgofs = f2fs_get_next_page_offset(&dn, index);
goto put_err;
+ }
f2fs_put_dnode(&dn);
if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
err = -ENOENT;
+ if (next_pgofs)
+ *next_pgofs = index + 1;
goto put_err;
}
if (dn.data_blkaddr != NEW_ADDR &&
@@ -1235,6 +1266,8 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
dn.data_blkaddr,
DATA_GENERIC_ENHANCE)) {
err = -EFSCORRUPTED;
+ f2fs_handle_error(F2FS_I_SB(inode),
+ ERROR_INVALID_BLKADDR);
goto put_err;
}
got_it:
@@ -1269,7 +1302,8 @@ put_err:
return ERR_PTR(err);
}
-struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index)
+struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index,
+ pgoff_t *next_pgofs)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
@@ -1279,7 +1313,7 @@ struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index)
return page;
f2fs_put_page(page, 0);
- page = f2fs_get_read_data_page(inode, index, 0, false);
+ page = f2fs_get_read_data_page(inode, index, 0, false, next_pgofs);
if (IS_ERR(page))
return page;
@@ -1305,7 +1339,7 @@ struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct page *page;
repeat:
- page = f2fs_get_read_data_page(inode, index, 0, for_write);
+ page = f2fs_get_read_data_page(inode, index, 0, for_write, NULL);
if (IS_ERR(page))
return page;
@@ -1468,7 +1502,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
pgofs = (pgoff_t)map->m_lblk;
end = pgofs + maxblocks;
- if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
+ if (!create && f2fs_lookup_read_extent_cache(inode, pgofs, &ei)) {
if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO &&
map->m_may_create)
goto next_dnode;
@@ -1548,6 +1582,7 @@ next_block:
if (__is_valid_data_blkaddr(blkaddr) &&
!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) {
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto sync_out;
}
@@ -1593,6 +1628,8 @@ next_block:
(flag != F2FS_GET_BLOCK_FIEMAP ||
IS_ENABLED(CONFIG_F2FS_CHECK_FS))) {
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi,
+ ERROR_CORRUPTED_CLUSTER);
goto sync_out;
}
if (flag == F2FS_GET_BLOCK_BMAP) {
@@ -1675,7 +1712,7 @@ skip:
if (map->m_flags & F2FS_MAP_MAPPED) {
unsigned int ofs = start_pgofs - map->m_lblk;
- f2fs_update_extent_cache_range(&dn,
+ f2fs_update_read_extent_cache_range(&dn,
start_pgofs, map->m_pblk + ofs,
map->m_len - ofs);
}
@@ -1720,7 +1757,7 @@ sync_out:
if (map->m_flags & F2FS_MAP_MAPPED) {
unsigned int ofs = start_pgofs - map->m_lblk;
- f2fs_update_extent_cache_range(&dn,
+ f2fs_update_read_extent_cache_range(&dn,
start_pgofs, map->m_pblk + ofs,
map->m_len - ofs);
}
@@ -1816,7 +1853,7 @@ static int f2fs_xattr_fiemap(struct inode *inode,
err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags);
trace_f2fs_fiemap(inode, 0, phys, len, flags, err);
- if (err || err == 1)
+ if (err)
return err;
}
@@ -2074,6 +2111,8 @@ got_it:
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
DATA_GENERIC_ENHANCE_READ)) {
ret = -EFSCORRUPTED;
+ f2fs_handle_error(F2FS_I_SB(inode),
+ ERROR_INVALID_BLKADDR);
goto out;
}
} else {
@@ -2122,8 +2161,8 @@ submit_and_realloc:
goto submit_and_realloc;
inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
- f2fs_update_iostat(F2FS_I_SB(inode), FS_DATA_READ_IO, F2FS_BLKSIZE);
- ClearPageError(page);
+ f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO,
+ F2FS_BLKSIZE);
*last_block_in_bio = block_nr;
goto out;
out:
@@ -2178,7 +2217,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
if (f2fs_cluster_is_empty(cc))
goto out;
- if (f2fs_lookup_extent_cache(inode, start_idx, &ei))
+ if (f2fs_lookup_read_extent_cache(inode, start_idx, &ei))
from_dnode = false;
if (!from_dnode)
@@ -2270,9 +2309,7 @@ submit_and_realloc:
refcount_inc(&dic->refcnt);
inc_page_count(sbi, F2FS_RD_DATA);
- f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE);
- f2fs_update_iostat(sbi, FS_CDATA_READ_IO, F2FS_BLKSIZE);
- ClearPageError(page);
+ f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE);
*last_block_in_bio = blkaddr;
}
@@ -2289,7 +2326,6 @@ out:
for (i = 0; i < cc->cluster_size; i++) {
if (cc->rpages[i]) {
ClearPageUptodate(cc->rpages[i]);
- ClearPageError(cc->rpages[i]);
unlock_page(cc->rpages[i]);
}
}
@@ -2386,7 +2422,6 @@ read_single_page:
#ifdef CONFIG_F2FS_FS_COMPRESSION
set_error_page:
#endif
- SetPageError(page);
zero_user_segment(page, 0, PAGE_SIZE);
unlock_page(page);
}
@@ -2543,7 +2578,7 @@ bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio)
return true;
/* if this is cold file, we should overwrite to avoid fragmentation */
- if (file_is_cold(inode))
+ if (file_is_cold(inode) && !is_inode_flag_set(inode, FI_OPU_WRITE))
return true;
return check_inplace_update_policy(inode, fio);
@@ -2613,12 +2648,15 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
set_new_dnode(&dn, inode, NULL, NULL, 0);
if (need_inplace_update(fio) &&
- f2fs_lookup_extent_cache(inode, page->index, &ei)) {
+ f2fs_lookup_read_extent_cache(inode, page->index, &ei)) {
fio->old_blkaddr = ei.blk + page->index - ei.fofs;
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr,
- DATA_GENERIC_ENHANCE))
+ DATA_GENERIC_ENHANCE)) {
+ f2fs_handle_error(fio->sbi,
+ ERROR_INVALID_BLKADDR);
return -EFSCORRUPTED;
+ }
ipu_force = true;
fio->need_lock = LOCK_DONE;
@@ -2646,6 +2684,7 @@ got_it:
!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr,
DATA_GENERIC_ENHANCE)) {
err = -EFSCORRUPTED;
+ f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR);
goto out_writepage;
}
@@ -2856,7 +2895,7 @@ out:
}
unlock_page(page);
if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) &&
- !F2FS_I(inode)->cp_task && allow_balance)
+ !F2FS_I(inode)->wb_task && allow_balance)
f2fs_balance_fs(sbi, need_balance_fs);
if (unlikely(f2fs_cp_error(sbi))) {
@@ -3156,7 +3195,7 @@ static inline bool __should_serialize_io(struct inode *inode,
struct writeback_control *wbc)
{
/* to avoid deadlock in path of data flush */
- if (F2FS_I(inode)->cp_task)
+ if (F2FS_I(inode)->wb_task)
return false;
if (!S_ISREG(inode->i_mode))
@@ -3333,7 +3372,7 @@ restart:
} else if (locked) {
err = f2fs_get_block(&dn, index);
} else {
- if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ if (f2fs_lookup_read_extent_cache(inode, index, &ei)) {
dn.data_blkaddr = ei.blk + index - ei.fofs;
} else {
/* hole case */
@@ -3374,7 +3413,7 @@ static int __find_data_block(struct inode *inode, pgoff_t index,
set_new_dnode(&dn, inode, ipage, ipage, 0);
- if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ if (f2fs_lookup_read_extent_cache(inode, index, &ei)) {
dn.data_blkaddr = ei.blk + index - ei.fofs;
} else {
/* hole case */
@@ -3438,6 +3477,9 @@ static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi,
else if (*blk_addr != NULL_ADDR)
return 0;
+ if (is_inode_flag_set(inode, FI_ATOMIC_REPLACE))
+ goto reserve_block;
+
/* Look for the block in the original inode */
err = __find_data_block(inode, index, &ori_blk_addr);
if (err)
@@ -3559,6 +3601,7 @@ repeat:
if (!f2fs_is_valid_blkaddr(sbi, blkaddr,
DATA_GENERIC_ENHANCE_READ)) {
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto fail;
}
err = f2fs_submit_page_read(inode, page, blkaddr, 0, true);
@@ -3697,8 +3740,7 @@ static bool f2fs_dirty_data_folio(struct address_space *mapping,
folio_mark_uptodate(folio);
BUG_ON(folio_test_swapcache(folio));
- if (!folio_test_dirty(folio)) {
- filemap_dirty_folio(mapping, folio);
+ if (filemap_dirty_folio(mapping, folio)) {
f2fs_update_dirty_folio(inode, folio);
return true;
}
@@ -3970,6 +4012,7 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
if (ret < 0)
return ret;
+ stat_inc_swapfile_inode(inode);
set_inode_flag(inode, FI_PIN_FILE);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return ret;
@@ -3979,6 +4022,7 @@ static void f2fs_swap_deactivate(struct file *file)
{
struct inode *inode = file_inode(file);
+ stat_dec_swapfile_inode(inode);
clear_inode_flag(inode, FI_PIN_FILE);
}
#else
@@ -4057,9 +4101,7 @@ int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi)
sbi->post_read_wq = alloc_workqueue("f2fs_post_read_wq",
WQ_UNBOUND | WQ_HIGHPRI,
num_online_cpus());
- if (!sbi->post_read_wq)
- return -ENOMEM;
- return 0;
+ return sbi->post_read_wq ? 0 : -ENOMEM;
}
void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi)
@@ -4072,9 +4114,7 @@ int __init f2fs_init_bio_entry_cache(void)
{
bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab",
sizeof(struct bio_entry));
- if (!bio_entry_slab)
- return -ENOMEM;
- return 0;
+ return bio_entry_slab ? 0 : -ENOMEM;
}
void f2fs_destroy_bio_entry_cache(void)
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index c01471573977..32af4f0c5735 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -72,15 +72,26 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->main_area_zones = si->main_area_sections /
le32_to_cpu(raw_super->secs_per_zone);
- /* validation check of the segment numbers */
+ /* general extent cache stats */
+ for (i = 0; i < NR_EXTENT_CACHES; i++) {
+ struct extent_tree_info *eti = &sbi->extent_tree[i];
+
+ si->hit_cached[i] = atomic64_read(&sbi->read_hit_cached[i]);
+ si->hit_rbtree[i] = atomic64_read(&sbi->read_hit_rbtree[i]);
+ si->total_ext[i] = atomic64_read(&sbi->total_hit_ext[i]);
+ si->hit_total[i] = si->hit_cached[i] + si->hit_rbtree[i];
+ si->ext_tree[i] = atomic_read(&eti->total_ext_tree);
+ si->zombie_tree[i] = atomic_read(&eti->total_zombie_tree);
+ si->ext_node[i] = atomic_read(&eti->total_ext_node);
+ }
+ /* read extent_cache only */
si->hit_largest = atomic64_read(&sbi->read_hit_largest);
- si->hit_cached = atomic64_read(&sbi->read_hit_cached);
- si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree);
- si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree;
- si->total_ext = atomic64_read(&sbi->total_hit_ext);
- si->ext_tree = atomic_read(&sbi->total_ext_tree);
- si->zombie_tree = atomic_read(&sbi->total_zombie_tree);
- si->ext_node = atomic_read(&sbi->total_ext_node);
+ si->hit_total[EX_READ] += si->hit_largest;
+
+ /* block age extent_cache only */
+ si->allocated_data_blocks = atomic64_read(&sbi->allocated_data_blocks);
+
+ /* validation check of the segment numbers */
si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
@@ -91,7 +102,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->ndirty_files = sbi->ndirty_inode[FILE_INODE];
si->nquota_files = sbi->nquota_files;
si->ndirty_all = sbi->ndirty_inode[DIRTY_META];
- si->aw_cnt = sbi->atomic_files;
+ si->aw_cnt = atomic_read(&sbi->atomic_files);
si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt);
si->nr_dio_read = get_pages(sbi, F2FS_DIO_READ);
si->nr_dio_write = get_pages(sbi, F2FS_DIO_WRITE);
@@ -135,6 +146,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->inline_inode = atomic_read(&sbi->inline_inode);
si->inline_dir = atomic_read(&sbi->inline_dir);
si->compr_inode = atomic_read(&sbi->compr_inode);
+ si->swapfile_inode = atomic_read(&sbi->swapfile_inode);
si->compr_blocks = atomic64_read(&sbi->compr_blocks);
si->append = sbi->im[APPEND_INO].ino_num;
si->update = sbi->im[UPDATE_INO].ino_num;
@@ -293,25 +305,32 @@ get_cache:
sizeof(struct nat_entry_set);
for (i = 0; i < MAX_INO_ENTRY; i++)
si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
- si->cache_mem += atomic_read(&sbi->total_ext_tree) *
+
+ for (i = 0; i < NR_EXTENT_CACHES; i++) {
+ struct extent_tree_info *eti = &sbi->extent_tree[i];
+
+ si->ext_mem[i] = atomic_read(&eti->total_ext_tree) *
sizeof(struct extent_tree);
- si->cache_mem += atomic_read(&sbi->total_ext_node) *
+ si->ext_mem[i] += atomic_read(&eti->total_ext_node) *
sizeof(struct extent_node);
+ si->cache_mem += si->ext_mem[i];
+ }
si->page_mem = 0;
if (sbi->node_inode) {
- unsigned npages = NODE_MAPPING(sbi)->nrpages;
+ unsigned long npages = NODE_MAPPING(sbi)->nrpages;
si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
}
if (sbi->meta_inode) {
- unsigned npages = META_MAPPING(sbi)->nrpages;
+ unsigned long npages = META_MAPPING(sbi)->nrpages;
si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (sbi->compress_inode) {
- unsigned npages = COMPRESS_MAPPING(sbi)->nrpages;
+ unsigned long npages = COMPRESS_MAPPING(sbi)->nrpages;
+
si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
}
#endif
@@ -347,7 +366,7 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n",
si->sbi->sb->s_bdev, i++,
- f2fs_readonly(si->sbi->sb) ? "RO": "RW",
+ f2fs_readonly(si->sbi->sb) ? "RO" : "RW",
is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ?
"Disabled" : (f2fs_cp_error(si->sbi) ? "Error" : "Good"));
if (si->sbi->s_flag) {
@@ -385,6 +404,8 @@ static int stat_show(struct seq_file *s, void *v)
si->inline_dir);
seq_printf(s, " - Compressed Inode: %u, Blocks: %llu\n",
si->compr_inode, si->compr_blocks);
+ seq_printf(s, " - Swapfile Inode: %u\n",
+ si->swapfile_inode);
seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n",
si->orphans, si->append, si->update);
seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
@@ -457,28 +478,28 @@ static int stat_show(struct seq_file *s, void *v)
si->meta_count[META_NAT]);
seq_printf(s, " - ssa blocks : %u\n",
si->meta_count[META_SSA]);
- seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, "
- "Cur time: %4d(ms), Peak time: %4d(ms))\n",
- si->nr_queued_ckpt, si->nr_issued_ckpt,
- si->nr_total_ckpt, si->cur_ckpt_time,
- si->peak_ckpt_time);
+ seq_puts(s, "CP merge:\n");
+ seq_printf(s, " - Queued : %4d\n", si->nr_queued_ckpt);
+ seq_printf(s, " - Issued : %4d\n", si->nr_issued_ckpt);
+ seq_printf(s, " - Total : %4d\n", si->nr_total_ckpt);
+ seq_printf(s, " - Cur time : %4d(ms)\n", si->cur_ckpt_time);
+ seq_printf(s, " - Peak time : %4d(ms)\n", si->peak_ckpt_time);
seq_printf(s, "GC calls: %d (BG: %d)\n",
si->call_count, si->bg_gc);
seq_printf(s, " - data segments : %d (%d)\n",
si->data_segs, si->bg_data_segs);
seq_printf(s, " - node segments : %d (%d)\n",
si->node_segs, si->bg_node_segs);
- seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), "
- "Idle Greedy (%d), Idle AT (%d), "
- "Urgent High (%d), Urgent Mid (%d), "
- "Urgent Low (%d)\n",
- si->sbi->gc_reclaimed_segs[GC_NORMAL],
- si->sbi->gc_reclaimed_segs[GC_IDLE_CB],
- si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY],
- si->sbi->gc_reclaimed_segs[GC_IDLE_AT],
- si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH],
- si->sbi->gc_reclaimed_segs[GC_URGENT_MID],
- si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]);
+ seq_puts(s, " - Reclaimed segs :\n");
+ seq_printf(s, " - Normal : %d\n", si->sbi->gc_reclaimed_segs[GC_NORMAL]);
+ seq_printf(s, " - Idle CB : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_CB]);
+ seq_printf(s, " - Idle Greedy : %d\n",
+ si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY]);
+ seq_printf(s, " - Idle AT : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_AT]);
+ seq_printf(s, " - Urgent High : %d\n",
+ si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH]);
+ seq_printf(s, " - Urgent Mid : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_MID]);
+ seq_printf(s, " - Urgent Low : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]);
seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks,
si->bg_data_blks + si->bg_node_blks);
seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks,
@@ -487,26 +508,44 @@ static int stat_show(struct seq_file *s, void *v)
si->bg_node_blks);
seq_printf(s, "BG skip : IO: %u, Other: %u\n",
si->io_skip_bggc, si->other_skip_bggc);
- seq_puts(s, "\nExtent Cache:\n");
+ seq_puts(s, "\nExtent Cache (Read):\n");
seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n",
- si->hit_largest, si->hit_cached,
- si->hit_rbtree);
+ si->hit_largest, si->hit_cached[EX_READ],
+ si->hit_rbtree[EX_READ]);
+ seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n",
+ !si->total_ext[EX_READ] ? 0 :
+ div64_u64(si->hit_total[EX_READ] * 100,
+ si->total_ext[EX_READ]),
+ si->hit_total[EX_READ], si->total_ext[EX_READ]);
+ seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n",
+ si->ext_tree[EX_READ], si->zombie_tree[EX_READ],
+ si->ext_node[EX_READ]);
+ seq_puts(s, "\nExtent Cache (Block Age):\n");
+ seq_printf(s, " - Allocated Data Blocks: %llu\n",
+ si->allocated_data_blocks);
+ seq_printf(s, " - Hit Count: L1:%llu L2:%llu\n",
+ si->hit_cached[EX_BLOCK_AGE],
+ si->hit_rbtree[EX_BLOCK_AGE]);
seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n",
- !si->total_ext ? 0 :
- div64_u64(si->hit_total * 100, si->total_ext),
- si->hit_total, si->total_ext);
+ !si->total_ext[EX_BLOCK_AGE] ? 0 :
+ div64_u64(si->hit_total[EX_BLOCK_AGE] * 100,
+ si->total_ext[EX_BLOCK_AGE]),
+ si->hit_total[EX_BLOCK_AGE],
+ si->total_ext[EX_BLOCK_AGE]);
seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n",
- si->ext_tree, si->zombie_tree, si->ext_node);
+ si->ext_tree[EX_BLOCK_AGE],
+ si->zombie_tree[EX_BLOCK_AGE],
+ si->ext_node[EX_BLOCK_AGE]);
seq_puts(s, "\nBalancing F2FS Async:\n");
seq_printf(s, " - DIO (R: %4d, W: %4d)\n",
si->nr_dio_read, si->nr_dio_write);
seq_printf(s, " - IO_R (Data: %4d, Node: %4d, Meta: %4d\n",
si->nr_rd_data, si->nr_rd_node, si->nr_rd_meta);
- seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), "
- "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n",
+ seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), ",
si->nr_wb_cp_data, si->nr_wb_data,
si->nr_flushing, si->nr_flushed,
- si->flush_list_empty,
+ si->flush_list_empty);
+ seq_printf(s, "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n",
si->nr_discarding, si->nr_discarded,
si->nr_discard_cmd, si->undiscard_blks);
seq_printf(s, " - atomic IO: %4d (Max. %4d)\n",
@@ -563,8 +602,12 @@ static int stat_show(struct seq_file *s, void *v)
(si->base_mem + si->cache_mem + si->page_mem) >> 10);
seq_printf(s, " - static: %llu KB\n",
si->base_mem >> 10);
- seq_printf(s, " - cached: %llu KB\n",
+ seq_printf(s, " - cached all: %llu KB\n",
si->cache_mem >> 10);
+ seq_printf(s, " - read extent cache: %llu KB\n",
+ si->ext_mem[EX_READ] >> 10);
+ seq_printf(s, " - block age extent cache: %llu KB\n",
+ si->ext_mem[EX_BLOCK_AGE] >> 10);
seq_printf(s, " - paged : %llu KB\n",
si->page_mem >> 10);
}
@@ -597,16 +640,23 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
si->sbi = sbi;
sbi->stat_info = si;
- atomic64_set(&sbi->total_hit_ext, 0);
- atomic64_set(&sbi->read_hit_rbtree, 0);
+ /* general extent cache stats */
+ for (i = 0; i < NR_EXTENT_CACHES; i++) {
+ atomic64_set(&sbi->total_hit_ext[i], 0);
+ atomic64_set(&sbi->read_hit_rbtree[i], 0);
+ atomic64_set(&sbi->read_hit_cached[i], 0);
+ }
+
+ /* read extent_cache only */
atomic64_set(&sbi->read_hit_largest, 0);
- atomic64_set(&sbi->read_hit_cached, 0);
atomic_set(&sbi->inline_xattr, 0);
atomic_set(&sbi->inline_inode, 0);
atomic_set(&sbi->inline_dir, 0);
atomic_set(&sbi->compr_inode, 0);
atomic64_set(&sbi->compr_blocks, 0);
+ atomic_set(&sbi->swapfile_inode, 0);
+ atomic_set(&sbi->atomic_files, 0);
atomic_set(&sbi->inplace_count, 0);
for (i = META_CP; i < META_MAX; i++)
atomic_set(&sbi->meta_count[i], 0);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index d5bd7932fb64..8e025157f35c 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -340,6 +340,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
unsigned int bidx, end_block;
struct page *dentry_page;
struct f2fs_dir_entry *de = NULL;
+ pgoff_t next_pgofs;
bool room = false;
int max_slots;
@@ -350,12 +351,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
le32_to_cpu(fname->hash) % nbucket);
end_block = bidx + nblock;
- for (; bidx < end_block; bidx++) {
+ while (bidx < end_block) {
/* no need to allocate new dentry pages to all the indices */
- dentry_page = f2fs_find_data_page(dir, bidx);
+ dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs);
if (IS_ERR(dentry_page)) {
if (PTR_ERR(dentry_page) == -ENOENT) {
room = true;
+ bidx = next_pgofs;
continue;
} else {
*res_page = dentry_page;
@@ -376,6 +378,8 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
if (max_slots >= s)
room = true;
f2fs_put_page(dentry_page, 0);
+
+ bidx++;
}
if (!de && room && F2FS_I(dir)->chash != fname->hash) {
@@ -956,7 +960,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
bool f2fs_empty_dir(struct inode *dir)
{
- unsigned long bidx;
+ unsigned long bidx = 0;
struct page *dentry_page;
unsigned int bit_pos;
struct f2fs_dentry_block *dentry_blk;
@@ -965,13 +969,17 @@ bool f2fs_empty_dir(struct inode *dir)
if (f2fs_has_inline_dentry(dir))
return f2fs_empty_inline_dir(dir);
- for (bidx = 0; bidx < nblock; bidx++) {
- dentry_page = f2fs_get_lock_data_page(dir, bidx, false);
+ while (bidx < nblock) {
+ pgoff_t next_pgofs;
+
+ dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs);
if (IS_ERR(dentry_page)) {
- if (PTR_ERR(dentry_page) == -ENOENT)
+ if (PTR_ERR(dentry_page) == -ENOENT) {
+ bidx = next_pgofs;
continue;
- else
+ } else {
return false;
+ }
}
dentry_blk = page_address(dentry_page);
@@ -983,10 +991,12 @@ bool f2fs_empty_dir(struct inode *dir)
NR_DENTRY_IN_BLOCK,
bit_pos);
- f2fs_put_page(dentry_page, 1);
+ f2fs_put_page(dentry_page, 0);
if (bit_pos < NR_DENTRY_IN_BLOCK)
return false;
+
+ bidx++;
}
return true;
}
@@ -1000,7 +1010,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
struct fscrypt_str de_name = FSTR_INIT(NULL, 0);
struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode);
struct blk_plug plug;
- bool readdir_ra = sbi->readdir_ra == 1;
+ bool readdir_ra = sbi->readdir_ra;
bool found_valid_dirent = false;
int err = 0;
@@ -1041,6 +1051,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
__func__, le16_to_cpu(de->name_len));
set_sbi_flag(sbi, SBI_NEED_FSCK);
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_CORRUPTED_DIRENT);
goto out;
}
@@ -1103,7 +1114,8 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
goto out_free;
}
- for (; n < npages; n++, ctx->pos = n * NR_DENTRY_IN_BLOCK) {
+ for (; n < npages; ctx->pos = n * NR_DENTRY_IN_BLOCK) {
+ pgoff_t next_pgofs;
/* allow readdir() to be interrupted */
if (fatal_signal_pending(current)) {
@@ -1117,11 +1129,12 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
page_cache_sync_readahead(inode->i_mapping, ra, file, n,
min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES));
- dentry_page = f2fs_find_data_page(inode, n);
+ dentry_page = f2fs_find_data_page(inode, n, &next_pgofs);
if (IS_ERR(dentry_page)) {
err = PTR_ERR(dentry_page);
if (err == -ENOENT) {
err = 0;
+ n = next_pgofs;
continue;
} else {
goto out_free;
@@ -1140,6 +1153,8 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
}
f2fs_put_page(dentry_page, 0);
+
+ n++;
}
out_free:
fscrypt_fname_free_buffer(&fstr);
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 866e72b29bd5..1bd38a78ebba 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -6,6 +6,10 @@
* Copyright (c) 2015 Samsung Electronics
* Authors: Jaegeuk Kim <jaegeuk@kernel.org>
* Chao Yu <chao2.yu@samsung.com>
+ *
+ * block_age-based extent cache added by:
+ * Copyright (c) 2022 xiaomi Co., Ltd.
+ * http://www.xiaomi.com/
*/
#include <linux/fs.h>
@@ -15,6 +19,123 @@
#include "node.h"
#include <trace/events/f2fs.h>
+static void __set_extent_info(struct extent_info *ei,
+ unsigned int fofs, unsigned int len,
+ block_t blk, bool keep_clen,
+ unsigned long age, unsigned long last_blocks,
+ enum extent_type type)
+{
+ ei->fofs = fofs;
+ ei->len = len;
+
+ if (type == EX_READ) {
+ ei->blk = blk;
+ if (keep_clen)
+ return;
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ ei->c_len = 0;
+#endif
+ } else if (type == EX_BLOCK_AGE) {
+ ei->age = age;
+ ei->last_blocks = last_blocks;
+ }
+}
+
+static bool __may_read_extent_tree(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ if (!test_opt(sbi, READ_EXTENT_CACHE))
+ return false;
+ if (is_inode_flag_set(inode, FI_NO_EXTENT))
+ return false;
+ if (is_inode_flag_set(inode, FI_COMPRESSED_FILE) &&
+ !f2fs_sb_has_readonly(sbi))
+ return false;
+ return S_ISREG(inode->i_mode);
+}
+
+static bool __may_age_extent_tree(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ if (!test_opt(sbi, AGE_EXTENT_CACHE))
+ return false;
+ /* don't cache block age info for cold file */
+ if (is_inode_flag_set(inode, FI_COMPRESSED_FILE))
+ return false;
+ if (file_is_cold(inode))
+ return false;
+
+ return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode);
+}
+
+static bool __init_may_extent_tree(struct inode *inode, enum extent_type type)
+{
+ if (type == EX_READ)
+ return __may_read_extent_tree(inode);
+ else if (type == EX_BLOCK_AGE)
+ return __may_age_extent_tree(inode);
+ return false;
+}
+
+static bool __may_extent_tree(struct inode *inode, enum extent_type type)
+{
+ /*
+ * for recovered files during mount do not create extents
+ * if shrinker is not registered.
+ */
+ if (list_empty(&F2FS_I_SB(inode)->s_list))
+ return false;
+
+ return __init_may_extent_tree(inode, type);
+}
+
+static void __try_update_largest_extent(struct extent_tree *et,
+ struct extent_node *en)
+{
+ if (et->type != EX_READ)
+ return;
+ if (en->ei.len <= et->largest.len)
+ return;
+
+ et->largest = en->ei;
+ et->largest_updated = true;
+}
+
+static bool __is_extent_mergeable(struct extent_info *back,
+ struct extent_info *front, enum extent_type type)
+{
+ if (type == EX_READ) {
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (back->c_len && back->len != back->c_len)
+ return false;
+ if (front->c_len && front->len != front->c_len)
+ return false;
+#endif
+ return (back->fofs + back->len == front->fofs &&
+ back->blk + back->len == front->blk);
+ } else if (type == EX_BLOCK_AGE) {
+ return (back->fofs + back->len == front->fofs &&
+ abs(back->age - front->age) <= SAME_AGE_REGION &&
+ abs(back->last_blocks - front->last_blocks) <=
+ SAME_AGE_REGION);
+ }
+ return false;
+}
+
+static bool __is_back_mergeable(struct extent_info *cur,
+ struct extent_info *back, enum extent_type type)
+{
+ return __is_extent_mergeable(back, cur, type);
+}
+
+static bool __is_front_mergeable(struct extent_info *cur,
+ struct extent_info *front, enum extent_type type)
+{
+ return __is_extent_mergeable(cur, front, type);
+}
+
static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re,
unsigned int ofs)
{
@@ -237,6 +358,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
struct rb_node *parent, struct rb_node **p,
bool leftmost)
{
+ struct extent_tree_info *eti = &sbi->extent_tree[et->type];
struct extent_node *en;
en = f2fs_kmem_cache_alloc(extent_node_slab, GFP_ATOMIC, false, sbi);
@@ -250,16 +372,18 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
rb_link_node(&en->rb_node, parent, p);
rb_insert_color_cached(&en->rb_node, &et->root, leftmost);
atomic_inc(&et->node_cnt);
- atomic_inc(&sbi->total_ext_node);
+ atomic_inc(&eti->total_ext_node);
return en;
}
static void __detach_extent_node(struct f2fs_sb_info *sbi,
struct extent_tree *et, struct extent_node *en)
{
+ struct extent_tree_info *eti = &sbi->extent_tree[et->type];
+
rb_erase_cached(&en->rb_node, &et->root);
atomic_dec(&et->node_cnt);
- atomic_dec(&sbi->total_ext_node);
+ atomic_dec(&eti->total_ext_node);
if (et->cached_en == en)
et->cached_en = NULL;
@@ -275,61 +399,51 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi,
static void __release_extent_node(struct f2fs_sb_info *sbi,
struct extent_tree *et, struct extent_node *en)
{
- spin_lock(&sbi->extent_lock);
+ struct extent_tree_info *eti = &sbi->extent_tree[et->type];
+
+ spin_lock(&eti->extent_lock);
f2fs_bug_on(sbi, list_empty(&en->list));
list_del_init(&en->list);
- spin_unlock(&sbi->extent_lock);
+ spin_unlock(&eti->extent_lock);
__detach_extent_node(sbi, et, en);
}
-static struct extent_tree *__grab_extent_tree(struct inode *inode)
+static struct extent_tree *__grab_extent_tree(struct inode *inode,
+ enum extent_type type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree_info *eti = &sbi->extent_tree[type];
struct extent_tree *et;
nid_t ino = inode->i_ino;
- mutex_lock(&sbi->extent_tree_lock);
- et = radix_tree_lookup(&sbi->extent_tree_root, ino);
+ mutex_lock(&eti->extent_tree_lock);
+ et = radix_tree_lookup(&eti->extent_tree_root, ino);
if (!et) {
et = f2fs_kmem_cache_alloc(extent_tree_slab,
GFP_NOFS, true, NULL);
- f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et);
+ f2fs_radix_tree_insert(&eti->extent_tree_root, ino, et);
memset(et, 0, sizeof(struct extent_tree));
et->ino = ino;
+ et->type = type;
et->root = RB_ROOT_CACHED;
et->cached_en = NULL;
rwlock_init(&et->lock);
INIT_LIST_HEAD(&et->list);
atomic_set(&et->node_cnt, 0);
- atomic_inc(&sbi->total_ext_tree);
+ atomic_inc(&eti->total_ext_tree);
} else {
- atomic_dec(&sbi->total_zombie_tree);
+ atomic_dec(&eti->total_zombie_tree);
list_del_init(&et->list);
}
- mutex_unlock(&sbi->extent_tree_lock);
+ mutex_unlock(&eti->extent_tree_lock);
/* never died until evict_inode */
- F2FS_I(inode)->extent_tree = et;
+ F2FS_I(inode)->extent_tree[type] = et;
return et;
}
-static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi,
- struct extent_tree *et, struct extent_info *ei)
-{
- struct rb_node **p = &et->root.rb_root.rb_node;
- struct extent_node *en;
-
- en = __attach_extent_node(sbi, et, ei, NULL, p, true);
- if (!en)
- return NULL;
-
- et->largest = en->ei;
- et->cached_en = en;
- return en;
-}
-
static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
struct extent_tree *et)
{
@@ -358,70 +472,88 @@ static void __drop_largest_extent(struct extent_tree *et,
}
}
-/* return true, if inode page is changed */
-static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage)
+void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct f2fs_extent *i_ext = ipage ? &F2FS_INODE(ipage)->i_ext : NULL;
+ struct extent_tree_info *eti = &sbi->extent_tree[EX_READ];
+ struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext;
struct extent_tree *et;
struct extent_node *en;
struct extent_info ei;
- if (!f2fs_may_extent_tree(inode)) {
- /* drop largest extent */
+ if (!__may_extent_tree(inode, EX_READ)) {
+ /* drop largest read extent */
if (i_ext && i_ext->len) {
f2fs_wait_on_page_writeback(ipage, NODE, true, true);
i_ext->len = 0;
set_page_dirty(ipage);
- return;
}
- return;
+ goto out;
}
- et = __grab_extent_tree(inode);
+ et = __grab_extent_tree(inode, EX_READ);
if (!i_ext || !i_ext->len)
- return;
+ goto out;
- get_extent_info(&ei, i_ext);
+ get_read_extent_info(&ei, i_ext);
write_lock(&et->lock);
if (atomic_read(&et->node_cnt))
- goto out;
+ goto unlock_out;
- en = __init_extent_tree(sbi, et, &ei);
+ en = __attach_extent_node(sbi, et, &ei, NULL,
+ &et->root.rb_root.rb_node, true);
if (en) {
- spin_lock(&sbi->extent_lock);
- list_add_tail(&en->list, &sbi->extent_list);
- spin_unlock(&sbi->extent_lock);
+ et->largest = en->ei;
+ et->cached_en = en;
+
+ spin_lock(&eti->extent_lock);
+ list_add_tail(&en->list, &eti->extent_list);
+ spin_unlock(&eti->extent_lock);
}
-out:
+unlock_out:
write_unlock(&et->lock);
+out:
+ if (!F2FS_I(inode)->extent_tree[EX_READ])
+ set_inode_flag(inode, FI_NO_EXTENT);
}
-void f2fs_init_extent_tree(struct inode *inode, struct page *ipage)
+void f2fs_init_age_extent_tree(struct inode *inode)
{
- __f2fs_init_extent_tree(inode, ipage);
+ if (!__init_may_extent_tree(inode, EX_BLOCK_AGE))
+ return;
+ __grab_extent_tree(inode, EX_BLOCK_AGE);
+}
- if (!F2FS_I(inode)->extent_tree)
- set_inode_flag(inode, FI_NO_EXTENT);
+void f2fs_init_extent_tree(struct inode *inode)
+{
+ /* initialize read cache */
+ if (__init_may_extent_tree(inode, EX_READ))
+ __grab_extent_tree(inode, EX_READ);
+
+ /* initialize block age cache */
+ if (__init_may_extent_tree(inode, EX_BLOCK_AGE))
+ __grab_extent_tree(inode, EX_BLOCK_AGE);
}
-static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
- struct extent_info *ei)
+static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
+ struct extent_info *ei, enum extent_type type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_tree_info *eti = &sbi->extent_tree[type];
+ struct extent_tree *et = F2FS_I(inode)->extent_tree[type];
struct extent_node *en;
bool ret = false;
f2fs_bug_on(sbi, !et);
- trace_f2fs_lookup_extent_tree_start(inode, pgofs);
+ trace_f2fs_lookup_extent_tree_start(inode, pgofs, type);
read_lock(&et->lock);
- if (et->largest.fofs <= pgofs &&
+ if (type == EX_READ &&
+ et->largest.fofs <= pgofs &&
et->largest.fofs + et->largest.len > pgofs) {
*ei = et->largest;
ret = true;
@@ -435,23 +567,26 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
goto out;
if (en == et->cached_en)
- stat_inc_cached_node_hit(sbi);
+ stat_inc_cached_node_hit(sbi, type);
else
- stat_inc_rbtree_node_hit(sbi);
+ stat_inc_rbtree_node_hit(sbi, type);
*ei = en->ei;
- spin_lock(&sbi->extent_lock);
+ spin_lock(&eti->extent_lock);
if (!list_empty(&en->list)) {
- list_move_tail(&en->list, &sbi->extent_list);
+ list_move_tail(&en->list, &eti->extent_list);
et->cached_en = en;
}
- spin_unlock(&sbi->extent_lock);
+ spin_unlock(&eti->extent_lock);
ret = true;
out:
- stat_inc_total_hit(sbi);
+ stat_inc_total_hit(sbi, type);
read_unlock(&et->lock);
- trace_f2fs_lookup_extent_tree_end(inode, pgofs, ei);
+ if (type == EX_READ)
+ trace_f2fs_lookup_read_extent_tree_end(inode, pgofs, ei);
+ else if (type == EX_BLOCK_AGE)
+ trace_f2fs_lookup_age_extent_tree_end(inode, pgofs, ei);
return ret;
}
@@ -460,18 +595,20 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
struct extent_node *prev_ex,
struct extent_node *next_ex)
{
+ struct extent_tree_info *eti = &sbi->extent_tree[et->type];
struct extent_node *en = NULL;
- if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) {
+ if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei, et->type)) {
prev_ex->ei.len += ei->len;
ei = &prev_ex->ei;
en = prev_ex;
}
- if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) {
+ if (next_ex && __is_front_mergeable(ei, &next_ex->ei, et->type)) {
next_ex->ei.fofs = ei->fofs;
- next_ex->ei.blk = ei->blk;
next_ex->ei.len += ei->len;
+ if (et->type == EX_READ)
+ next_ex->ei.blk = ei->blk;
if (en)
__release_extent_node(sbi, et, prev_ex);
@@ -483,12 +620,12 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
__try_update_largest_extent(et, en);
- spin_lock(&sbi->extent_lock);
+ spin_lock(&eti->extent_lock);
if (!list_empty(&en->list)) {
- list_move_tail(&en->list, &sbi->extent_list);
+ list_move_tail(&en->list, &eti->extent_list);
et->cached_en = en;
}
- spin_unlock(&sbi->extent_lock);
+ spin_unlock(&eti->extent_lock);
return en;
}
@@ -498,6 +635,7 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi,
struct rb_node *insert_parent,
bool leftmost)
{
+ struct extent_tree_info *eti = &sbi->extent_tree[et->type];
struct rb_node **p;
struct rb_node *parent = NULL;
struct extent_node *en = NULL;
@@ -520,47 +658,54 @@ do_insert:
__try_update_largest_extent(et, en);
/* update in global extent list */
- spin_lock(&sbi->extent_lock);
- list_add_tail(&en->list, &sbi->extent_list);
+ spin_lock(&eti->extent_lock);
+ list_add_tail(&en->list, &eti->extent_list);
et->cached_en = en;
- spin_unlock(&sbi->extent_lock);
+ spin_unlock(&eti->extent_lock);
return en;
}
-static void f2fs_update_extent_tree_range(struct inode *inode,
- pgoff_t fofs, block_t blkaddr, unsigned int len)
+static void __update_extent_tree_range(struct inode *inode,
+ struct extent_info *tei, enum extent_type type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_tree *et = F2FS_I(inode)->extent_tree[type];
struct extent_node *en = NULL, *en1 = NULL;
struct extent_node *prev_en = NULL, *next_en = NULL;
struct extent_info ei, dei, prev;
struct rb_node **insert_p = NULL, *insert_parent = NULL;
+ unsigned int fofs = tei->fofs, len = tei->len;
unsigned int end = fofs + len;
- unsigned int pos = (unsigned int)fofs;
bool updated = false;
bool leftmost = false;
if (!et)
return;
- trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len);
+ if (type == EX_READ)
+ trace_f2fs_update_read_extent_tree_range(inode, fofs, len,
+ tei->blk, 0);
+ else if (type == EX_BLOCK_AGE)
+ trace_f2fs_update_age_extent_tree_range(inode, fofs, len,
+ tei->age, tei->last_blocks);
write_lock(&et->lock);
- if (is_inode_flag_set(inode, FI_NO_EXTENT)) {
- write_unlock(&et->lock);
- return;
- }
+ if (type == EX_READ) {
+ if (is_inode_flag_set(inode, FI_NO_EXTENT)) {
+ write_unlock(&et->lock);
+ return;
+ }
- prev = et->largest;
- dei.len = 0;
+ prev = et->largest;
+ dei.len = 0;
- /*
- * drop largest extent before lookup, in case it's already
- * been shrunk from extent tree
- */
- __drop_largest_extent(et, fofs, len);
+ /*
+ * drop largest extent before lookup, in case it's already
+ * been shrunk from extent tree
+ */
+ __drop_largest_extent(et, fofs, len);
+ }
/* 1. lookup first extent node in range [fofs, fofs + len - 1] */
en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root,
@@ -581,26 +726,32 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
dei = en->ei;
org_end = dei.fofs + dei.len;
- f2fs_bug_on(sbi, pos >= org_end);
+ f2fs_bug_on(sbi, fofs >= org_end);
- if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
- en->ei.len = pos - en->ei.fofs;
+ if (fofs > dei.fofs && (type != EX_READ ||
+ fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN)) {
+ en->ei.len = fofs - en->ei.fofs;
prev_en = en;
parts = 1;
}
- if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) {
+ if (end < org_end && (type != EX_READ ||
+ org_end - end >= F2FS_MIN_EXTENT_LEN)) {
if (parts) {
- set_extent_info(&ei, end,
- end - dei.fofs + dei.blk,
- org_end - end);
+ __set_extent_info(&ei,
+ end, org_end - end,
+ end - dei.fofs + dei.blk, false,
+ dei.age, dei.last_blocks,
+ type);
en1 = __insert_extent_tree(sbi, et, &ei,
NULL, NULL, true);
next_en = en1;
} else {
- en->ei.fofs = end;
- en->ei.blk += end - dei.fofs;
- en->ei.len -= end - dei.fofs;
+ __set_extent_info(&en->ei,
+ end, en->ei.len - (end - dei.fofs),
+ en->ei.blk + (end - dei.fofs), true,
+ dei.age, dei.last_blocks,
+ type);
next_en = en;
}
parts++;
@@ -630,10 +781,15 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
en = next_en;
}
- /* 3. update extent in extent cache */
- if (blkaddr) {
+ if (type == EX_BLOCK_AGE)
+ goto update_age_extent_cache;
+
+ /* 3. update extent in read extent cache */
+ BUG_ON(type != EX_READ);
- set_extent_info(&ei, fofs, blkaddr, len);
+ if (tei->blk) {
+ __set_extent_info(&ei, fofs, len, tei->blk, false,
+ 0, 0, EX_READ);
if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
__insert_extent_tree(sbi, et, &ei,
insert_p, insert_parent, leftmost);
@@ -655,7 +811,17 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
et->largest_updated = false;
updated = true;
}
+ goto out_read_extent_cache;
+update_age_extent_cache:
+ if (!tei->last_blocks)
+ goto out_read_extent_cache;
+ __set_extent_info(&ei, fofs, len, 0, false,
+ tei->age, tei->last_blocks, EX_BLOCK_AGE);
+ if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
+ __insert_extent_tree(sbi, et, &ei,
+ insert_p, insert_parent, leftmost);
+out_read_extent_cache:
write_unlock(&et->lock);
if (updated)
@@ -663,19 +829,20 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
-void f2fs_update_extent_tree_range_compressed(struct inode *inode,
+void f2fs_update_read_extent_tree_range_compressed(struct inode *inode,
pgoff_t fofs, block_t blkaddr, unsigned int llen,
unsigned int c_len)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ];
struct extent_node *en = NULL;
struct extent_node *prev_en = NULL, *next_en = NULL;
struct extent_info ei;
struct rb_node **insert_p = NULL, *insert_parent = NULL;
bool leftmost = false;
- trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen);
+ trace_f2fs_update_read_extent_tree_range(inode, fofs, llen,
+ blkaddr, c_len);
/* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */
if (is_inode_flag_set(inode, FI_NO_EXTENT))
@@ -692,7 +859,7 @@ void f2fs_update_extent_tree_range_compressed(struct inode *inode,
if (en)
goto unlock_out;
- set_extent_info(&ei, fofs, blkaddr, llen);
+ __set_extent_info(&ei, fofs, llen, blkaddr, true, 0, 0, EX_READ);
ei.c_len = c_len;
if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
@@ -703,24 +870,113 @@ unlock_out:
}
#endif
-unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
+static unsigned long long __calculate_block_age(unsigned long long new,
+ unsigned long long old)
+{
+ unsigned long long diff;
+
+ diff = (new >= old) ? new - (new - old) : new + (old - new);
+
+ return div_u64(diff * LAST_AGE_WEIGHT, 100);
+}
+
+/* This returns a new age and allocated blocks in ei */
+static int __get_new_block_age(struct inode *inode, struct extent_info *ei)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ loff_t f_size = i_size_read(inode);
+ unsigned long long cur_blocks =
+ atomic64_read(&sbi->allocated_data_blocks);
+
+ /*
+ * When I/O is not aligned to a PAGE_SIZE, update will happen to the last
+ * file block even in seq write. So don't record age for newly last file
+ * block here.
+ */
+ if ((f_size >> PAGE_SHIFT) == ei->fofs && f_size & (PAGE_SIZE - 1) &&
+ ei->blk == NEW_ADDR)
+ return -EINVAL;
+
+ if (__lookup_extent_tree(inode, ei->fofs, ei, EX_BLOCK_AGE)) {
+ unsigned long long cur_age;
+
+ if (cur_blocks >= ei->last_blocks)
+ cur_age = cur_blocks - ei->last_blocks;
+ else
+ /* allocated_data_blocks overflow */
+ cur_age = ULLONG_MAX - ei->last_blocks + cur_blocks;
+
+ if (ei->age)
+ ei->age = __calculate_block_age(cur_age, ei->age);
+ else
+ ei->age = cur_age;
+ ei->last_blocks = cur_blocks;
+ WARN_ON(ei->age > cur_blocks);
+ return 0;
+ }
+
+ f2fs_bug_on(sbi, ei->blk == NULL_ADDR);
+
+ /* the data block was allocated for the first time */
+ if (ei->blk == NEW_ADDR)
+ goto out;
+
+ if (__is_valid_data_blkaddr(ei->blk) &&
+ !f2fs_is_valid_blkaddr(sbi, ei->blk, DATA_GENERIC_ENHANCE)) {
+ f2fs_bug_on(sbi, 1);
+ return -EINVAL;
+ }
+out:
+ /*
+ * init block age with zero, this can happen when the block age extent
+ * was reclaimed due to memory constraint or system reboot
+ */
+ ei->age = 0;
+ ei->last_blocks = cur_blocks;
+ return 0;
+}
+
+static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type type)
{
+ struct extent_info ei;
+
+ if (!__may_extent_tree(dn->inode, type))
+ return;
+
+ ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
+ dn->ofs_in_node;
+ ei.len = 1;
+
+ if (type == EX_READ) {
+ if (dn->data_blkaddr == NEW_ADDR)
+ ei.blk = NULL_ADDR;
+ else
+ ei.blk = dn->data_blkaddr;
+ } else if (type == EX_BLOCK_AGE) {
+ ei.blk = dn->data_blkaddr;
+ if (__get_new_block_age(dn->inode, &ei))
+ return;
+ }
+ __update_extent_tree_range(dn->inode, &ei, type);
+}
+
+static unsigned int __shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink,
+ enum extent_type type)
+{
+ struct extent_tree_info *eti = &sbi->extent_tree[type];
struct extent_tree *et, *next;
struct extent_node *en;
unsigned int node_cnt = 0, tree_cnt = 0;
int remained;
- if (!test_opt(sbi, EXTENT_CACHE))
- return 0;
-
- if (!atomic_read(&sbi->total_zombie_tree))
+ if (!atomic_read(&eti->total_zombie_tree))
goto free_node;
- if (!mutex_trylock(&sbi->extent_tree_lock))
+ if (!mutex_trylock(&eti->extent_tree_lock))
goto out;
/* 1. remove unreferenced extent tree */
- list_for_each_entry_safe(et, next, &sbi->zombie_list, list) {
+ list_for_each_entry_safe(et, next, &eti->zombie_list, list) {
if (atomic_read(&et->node_cnt)) {
write_lock(&et->lock);
node_cnt += __free_extent_tree(sbi, et);
@@ -728,61 +984,137 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
}
f2fs_bug_on(sbi, atomic_read(&et->node_cnt));
list_del_init(&et->list);
- radix_tree_delete(&sbi->extent_tree_root, et->ino);
+ radix_tree_delete(&eti->extent_tree_root, et->ino);
kmem_cache_free(extent_tree_slab, et);
- atomic_dec(&sbi->total_ext_tree);
- atomic_dec(&sbi->total_zombie_tree);
+ atomic_dec(&eti->total_ext_tree);
+ atomic_dec(&eti->total_zombie_tree);
tree_cnt++;
if (node_cnt + tree_cnt >= nr_shrink)
goto unlock_out;
cond_resched();
}
- mutex_unlock(&sbi->extent_tree_lock);
+ mutex_unlock(&eti->extent_tree_lock);
free_node:
/* 2. remove LRU extent entries */
- if (!mutex_trylock(&sbi->extent_tree_lock))
+ if (!mutex_trylock(&eti->extent_tree_lock))
goto out;
remained = nr_shrink - (node_cnt + tree_cnt);
- spin_lock(&sbi->extent_lock);
+ spin_lock(&eti->extent_lock);
for (; remained > 0; remained--) {
- if (list_empty(&sbi->extent_list))
+ if (list_empty(&eti->extent_list))
break;
- en = list_first_entry(&sbi->extent_list,
+ en = list_first_entry(&eti->extent_list,
struct extent_node, list);
et = en->et;
if (!write_trylock(&et->lock)) {
/* refresh this extent node's position in extent list */
- list_move_tail(&en->list, &sbi->extent_list);
+ list_move_tail(&en->list, &eti->extent_list);
continue;
}
list_del_init(&en->list);
- spin_unlock(&sbi->extent_lock);
+ spin_unlock(&eti->extent_lock);
__detach_extent_node(sbi, et, en);
write_unlock(&et->lock);
node_cnt++;
- spin_lock(&sbi->extent_lock);
+ spin_lock(&eti->extent_lock);
}
- spin_unlock(&sbi->extent_lock);
+ spin_unlock(&eti->extent_lock);
unlock_out:
- mutex_unlock(&sbi->extent_tree_lock);
+ mutex_unlock(&eti->extent_tree_lock);
out:
- trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt);
+ trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt, type);
return node_cnt + tree_cnt;
}
-unsigned int f2fs_destroy_extent_node(struct inode *inode)
+/* read extent cache operations */
+bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs,
+ struct extent_info *ei)
+{
+ if (!__may_extent_tree(inode, EX_READ))
+ return false;
+
+ return __lookup_extent_tree(inode, pgofs, ei, EX_READ);
+}
+
+void f2fs_update_read_extent_cache(struct dnode_of_data *dn)
+{
+ return __update_extent_cache(dn, EX_READ);
+}
+
+void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn,
+ pgoff_t fofs, block_t blkaddr, unsigned int len)
+{
+ struct extent_info ei = {
+ .fofs = fofs,
+ .len = len,
+ .blk = blkaddr,
+ };
+
+ if (!__may_extent_tree(dn->inode, EX_READ))
+ return;
+
+ __update_extent_tree_range(dn->inode, &ei, EX_READ);
+}
+
+unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
+{
+ if (!test_opt(sbi, READ_EXTENT_CACHE))
+ return 0;
+
+ return __shrink_extent_tree(sbi, nr_shrink, EX_READ);
+}
+
+/* block age extent cache operations */
+bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs,
+ struct extent_info *ei)
+{
+ if (!__may_extent_tree(inode, EX_BLOCK_AGE))
+ return false;
+
+ return __lookup_extent_tree(inode, pgofs, ei, EX_BLOCK_AGE);
+}
+
+void f2fs_update_age_extent_cache(struct dnode_of_data *dn)
+{
+ return __update_extent_cache(dn, EX_BLOCK_AGE);
+}
+
+void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn,
+ pgoff_t fofs, unsigned int len)
+{
+ struct extent_info ei = {
+ .fofs = fofs,
+ .len = len,
+ };
+
+ if (!__may_extent_tree(dn->inode, EX_BLOCK_AGE))
+ return;
+
+ __update_extent_tree_range(dn->inode, &ei, EX_BLOCK_AGE);
+}
+
+unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
+{
+ if (!test_opt(sbi, AGE_EXTENT_CACHE))
+ return 0;
+
+ return __shrink_extent_tree(sbi, nr_shrink, EX_BLOCK_AGE);
+}
+
+static unsigned int __destroy_extent_node(struct inode *inode,
+ enum extent_type type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_tree *et = F2FS_I(inode)->extent_tree[type];
unsigned int node_cnt = 0;
if (!et || !atomic_read(&et->node_cnt))
@@ -795,32 +1127,46 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode)
return node_cnt;
}
-void f2fs_drop_extent_tree(struct inode *inode)
+void f2fs_destroy_extent_node(struct inode *inode)
+{
+ __destroy_extent_node(inode, EX_READ);
+ __destroy_extent_node(inode, EX_BLOCK_AGE);
+}
+
+static void __drop_extent_tree(struct inode *inode, enum extent_type type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_tree *et = F2FS_I(inode)->extent_tree[type];
bool updated = false;
- if (!f2fs_may_extent_tree(inode))
+ if (!__may_extent_tree(inode, type))
return;
- set_inode_flag(inode, FI_NO_EXTENT);
-
write_lock(&et->lock);
__free_extent_tree(sbi, et);
- if (et->largest.len) {
- et->largest.len = 0;
- updated = true;
+ if (type == EX_READ) {
+ set_inode_flag(inode, FI_NO_EXTENT);
+ if (et->largest.len) {
+ et->largest.len = 0;
+ updated = true;
+ }
}
write_unlock(&et->lock);
if (updated)
f2fs_mark_inode_dirty_sync(inode, true);
}
-void f2fs_destroy_extent_tree(struct inode *inode)
+void f2fs_drop_extent_tree(struct inode *inode)
+{
+ __drop_extent_tree(inode, EX_READ);
+ __drop_extent_tree(inode, EX_BLOCK_AGE);
+}
+
+static void __destroy_extent_tree(struct inode *inode, enum extent_type type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_tree_info *eti = &sbi->extent_tree[type];
+ struct extent_tree *et = F2FS_I(inode)->extent_tree[type];
unsigned int node_cnt = 0;
if (!et)
@@ -828,76 +1174,56 @@ void f2fs_destroy_extent_tree(struct inode *inode)
if (inode->i_nlink && !is_bad_inode(inode) &&
atomic_read(&et->node_cnt)) {
- mutex_lock(&sbi->extent_tree_lock);
- list_add_tail(&et->list, &sbi->zombie_list);
- atomic_inc(&sbi->total_zombie_tree);
- mutex_unlock(&sbi->extent_tree_lock);
+ mutex_lock(&eti->extent_tree_lock);
+ list_add_tail(&et->list, &eti->zombie_list);
+ atomic_inc(&eti->total_zombie_tree);
+ mutex_unlock(&eti->extent_tree_lock);
return;
}
/* free all extent info belong to this extent tree */
- node_cnt = f2fs_destroy_extent_node(inode);
+ node_cnt = __destroy_extent_node(inode, type);
/* delete extent tree entry in radix tree */
- mutex_lock(&sbi->extent_tree_lock);
+ mutex_lock(&eti->extent_tree_lock);
f2fs_bug_on(sbi, atomic_read(&et->node_cnt));
- radix_tree_delete(&sbi->extent_tree_root, inode->i_ino);
+ radix_tree_delete(&eti->extent_tree_root, inode->i_ino);
kmem_cache_free(extent_tree_slab, et);
- atomic_dec(&sbi->total_ext_tree);
- mutex_unlock(&sbi->extent_tree_lock);
+ atomic_dec(&eti->total_ext_tree);
+ mutex_unlock(&eti->extent_tree_lock);
- F2FS_I(inode)->extent_tree = NULL;
+ F2FS_I(inode)->extent_tree[type] = NULL;
- trace_f2fs_destroy_extent_tree(inode, node_cnt);
+ trace_f2fs_destroy_extent_tree(inode, node_cnt, type);
}
-bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs,
- struct extent_info *ei)
-{
- if (!f2fs_may_extent_tree(inode))
- return false;
-
- return f2fs_lookup_extent_tree(inode, pgofs, ei);
-}
-
-void f2fs_update_extent_cache(struct dnode_of_data *dn)
+void f2fs_destroy_extent_tree(struct inode *inode)
{
- pgoff_t fofs;
- block_t blkaddr;
-
- if (!f2fs_may_extent_tree(dn->inode))
- return;
-
- if (dn->data_blkaddr == NEW_ADDR)
- blkaddr = NULL_ADDR;
- else
- blkaddr = dn->data_blkaddr;
-
- fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
- dn->ofs_in_node;
- f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1);
+ __destroy_extent_tree(inode, EX_READ);
+ __destroy_extent_tree(inode, EX_BLOCK_AGE);
}
-void f2fs_update_extent_cache_range(struct dnode_of_data *dn,
- pgoff_t fofs, block_t blkaddr, unsigned int len)
-
+static void __init_extent_tree_info(struct extent_tree_info *eti)
{
- if (!f2fs_may_extent_tree(dn->inode))
- return;
-
- f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len);
+ INIT_RADIX_TREE(&eti->extent_tree_root, GFP_NOIO);
+ mutex_init(&eti->extent_tree_lock);
+ INIT_LIST_HEAD(&eti->extent_list);
+ spin_lock_init(&eti->extent_lock);
+ atomic_set(&eti->total_ext_tree, 0);
+ INIT_LIST_HEAD(&eti->zombie_list);
+ atomic_set(&eti->total_zombie_tree, 0);
+ atomic_set(&eti->total_ext_node, 0);
}
void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi)
{
- INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO);
- mutex_init(&sbi->extent_tree_lock);
- INIT_LIST_HEAD(&sbi->extent_list);
- spin_lock_init(&sbi->extent_lock);
- atomic_set(&sbi->total_ext_tree, 0);
- INIT_LIST_HEAD(&sbi->zombie_list);
- atomic_set(&sbi->total_zombie_tree, 0);
- atomic_set(&sbi->total_ext_node, 0);
+ __init_extent_tree_info(&sbi->extent_tree[EX_READ]);
+ __init_extent_tree_info(&sbi->extent_tree[EX_BLOCK_AGE]);
+
+ /* initialize for block age extents */
+ atomic64_set(&sbi->allocated_data_blocks, 0);
+ sbi->hot_data_age_threshold = DEF_HOT_DATA_AGE_THRESHOLD;
+ sbi->warm_data_age_threshold = DEF_WARM_DATA_AGE_THRESHOLD;
}
int __init f2fs_create_extent_cache(void)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 3c7cdb70fe2e..e8953c3dc81a 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -60,6 +60,7 @@ enum {
FAULT_SLAB_ALLOC,
FAULT_DQUOT_INIT,
FAULT_LOCK_OP,
+ FAULT_BLKADDR,
FAULT_MAX,
};
@@ -91,7 +92,7 @@ extern const char *f2fs_fault_name[FAULT_MAX];
#define F2FS_MOUNT_FLUSH_MERGE 0x00000400
#define F2FS_MOUNT_NOBARRIER 0x00000800
#define F2FS_MOUNT_FASTBOOT 0x00001000
-#define F2FS_MOUNT_EXTENT_CACHE 0x00002000
+#define F2FS_MOUNT_READ_EXTENT_CACHE 0x00002000
#define F2FS_MOUNT_DATA_FLUSH 0x00008000
#define F2FS_MOUNT_FAULT_INJECTION 0x00010000
#define F2FS_MOUNT_USRQUOTA 0x00080000
@@ -106,6 +107,7 @@ extern const char *f2fs_fault_name[FAULT_MAX];
#define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000
#define F2FS_MOUNT_GC_MERGE 0x20000000
#define F2FS_MOUNT_COMPRESS_CACHE 0x40000000
+#define F2FS_MOUNT_AGE_EXTENT_CACHE 0x80000000
#define F2FS_OPTION(sbi) ((sbi)->mount_opt)
#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
@@ -202,10 +204,6 @@ struct f2fs_mount_info {
#define __F2FS_HAS_FEATURE(raw_super, mask) \
((raw_super->feature & cpu_to_le32(mask)) != 0)
#define F2FS_HAS_FEATURE(sbi, mask) __F2FS_HAS_FEATURE(sbi->raw_super, mask)
-#define F2FS_SET_FEATURE(sbi, mask) \
- (sbi->raw_super->feature |= cpu_to_le32(mask))
-#define F2FS_CLEAR_FEATURE(sbi, mask) \
- (sbi->raw_super->feature &= ~cpu_to_le32(mask))
/*
* Default values for user and/or group using reserved blocks
@@ -266,6 +264,10 @@ enum {
* condition of read on truncated area
* by extent_cache
*/
+ DATA_GENERIC_ENHANCE_UPDATE, /*
+ * strong check on range and segment
+ * bitmap for update case
+ */
META_GENERIC,
};
@@ -274,7 +276,7 @@ enum {
ORPHAN_INO, /* for orphan ino list */
APPEND_INO, /* for append ino list */
UPDATE_INO, /* for update ino list */
- TRANS_DIR_INO, /* for trasactions dir ino list */
+ TRANS_DIR_INO, /* for transactions dir ino list */
FLUSH_INO, /* for multiple device flushing */
MAX_INO_ENTRY, /* max. list */
};
@@ -324,8 +326,12 @@ struct discard_entry {
unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */
};
+/* minimum discard granularity, unit: block count */
+#define MIN_DISCARD_GRANULARITY 1
/* default discard granularity of inner discard thread, unit: block count */
#define DEFAULT_DISCARD_GRANULARITY 16
+/* default maximum discard granularity of ordered discard, unit: block count */
+#define DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY 16
/* max discard pend list number */
#define MAX_PLIST_NUM 512
@@ -404,7 +410,9 @@ struct discard_cmd_control {
unsigned int min_discard_issue_time; /* min. interval between discard issue */
unsigned int mid_discard_issue_time; /* mid. interval between discard issue */
unsigned int max_discard_issue_time; /* max. interval between discard issue */
+ unsigned int discard_urgent_util; /* utilization which issue discard proactively */
unsigned int discard_granularity; /* discard granularity */
+ unsigned int max_ordered_discard; /* maximum discard granularity issued by lba order */
unsigned int undiscard_blks; /* # of undiscard blocks */
unsigned int next_pos; /* next discard position */
atomic_t issued_discard; /* # of issued discard */
@@ -589,16 +597,35 @@ enum {
/* dirty segments threshold for triggering CP */
#define DEFAULT_DIRTY_THRESHOLD 4
+#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS
+#define RECOVERY_MIN_RA_BLOCKS 1
+
+#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */
+
/* for in-memory extent cache entry */
#define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */
/* number of extent info in extent cache we try to shrink */
-#define EXTENT_CACHE_SHRINK_NUMBER 128
+#define READ_EXTENT_CACHE_SHRINK_NUMBER 128
-#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS
-#define RECOVERY_MIN_RA_BLOCKS 1
+/* number of age extent info in extent cache we try to shrink */
+#define AGE_EXTENT_CACHE_SHRINK_NUMBER 128
+#define LAST_AGE_WEIGHT 30
+#define SAME_AGE_REGION 1024
-#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */
+/*
+ * Define data block with age less than 1GB as hot data
+ * define data block with age less than 10GB but more than 1GB as warm data
+ */
+#define DEF_HOT_DATA_AGE_THRESHOLD 262144
+#define DEF_WARM_DATA_AGE_THRESHOLD 2621440
+
+/* extent cache type */
+enum extent_type {
+ EX_READ,
+ EX_BLOCK_AGE,
+ NR_EXTENT_CACHES,
+};
struct rb_entry {
struct rb_node rb_node; /* rb node located in rb-tree */
@@ -614,10 +641,24 @@ struct rb_entry {
struct extent_info {
unsigned int fofs; /* start offset in a file */
unsigned int len; /* length of the extent */
- u32 blk; /* start block address of the extent */
+ union {
+ /* read extent_cache */
+ struct {
+ /* start block address of the extent */
+ block_t blk;
#ifdef CONFIG_F2FS_FS_COMPRESSION
- unsigned int c_len; /* physical extent length of compressed blocks */
+ /* physical extent length of compressed blocks */
+ unsigned int c_len;
#endif
+ };
+ /* block age extent_cache */
+ struct {
+ /* block age of the extent */
+ unsigned long long age;
+ /* last total blocks allocated */
+ unsigned long long last_blocks;
+ };
+ };
};
struct extent_node {
@@ -629,13 +670,25 @@ struct extent_node {
struct extent_tree {
nid_t ino; /* inode number */
+ enum extent_type type; /* keep the extent tree type */
struct rb_root_cached root; /* root of extent info rb-tree */
struct extent_node *cached_en; /* recently accessed extent node */
- struct extent_info largest; /* largested extent info */
struct list_head list; /* to be used by sbi->zombie_list */
rwlock_t lock; /* protect extent info rb-tree */
atomic_t node_cnt; /* # of extent node in rb-tree*/
bool largest_updated; /* largest extent updated */
+ struct extent_info largest; /* largest cached extent for EX_READ */
+};
+
+struct extent_tree_info {
+ struct radix_tree_root extent_tree_root;/* cache extent cache entries */
+ struct mutex extent_tree_lock; /* locking extent radix tree */
+ struct list_head extent_list; /* lru list for shrinker */
+ spinlock_t extent_lock; /* locking extent lru list */
+ atomic_t total_ext_tree; /* extent tree count */
+ struct list_head zombie_list; /* extent zombie tree list */
+ atomic_t total_zombie_tree; /* extent zombie tree count */
+ atomic_t total_ext_node; /* extent info count */
};
/*
@@ -760,6 +813,8 @@ enum {
FI_COMPRESS_RELEASED, /* compressed blocks were released */
FI_ALIGNED_WRITE, /* enable aligned write */
FI_COW_FILE, /* indicate COW file */
+ FI_ATOMIC_COMMITTED, /* indicate atomic commit completed except disk sync */
+ FI_ATOMIC_REPLACE, /* indicate atomic replace */
FI_MAX, /* max flag, never be used */
};
@@ -782,6 +837,7 @@ struct f2fs_inode_info {
unsigned int clevel; /* maximum level of given file name */
struct task_struct *task; /* lookup and create consistency */
struct task_struct *cp_task; /* separate cp/wb IO stats*/
+ struct task_struct *wb_task; /* indicate inode is in context of writeback */
nid_t i_xattr_nid; /* node id that contains xattrs */
loff_t last_disk_size; /* lastly written file size */
spinlock_t i_size_lock; /* protect last_disk_size */
@@ -795,7 +851,8 @@ struct f2fs_inode_info {
struct list_head dirty_list; /* dirty list for dirs and files */
struct list_head gdirty_list; /* linked in global dirty list */
struct task_struct *atomic_write_task; /* store atomic write task */
- struct extent_tree *extent_tree; /* cached extent_tree entry */
+ struct extent_tree *extent_tree[NR_EXTENT_CACHES];
+ /* cached extent_tree entry */
struct inode *cow_inode; /* copy-on-write inode for atomic write */
/* avoid racing between foreground op and gc */
@@ -817,9 +874,10 @@ struct f2fs_inode_info {
unsigned int i_cluster_size; /* cluster size */
unsigned int atomic_write_cnt;
+ loff_t original_i_size; /* original i_size before atomic write */
};
-static inline void get_extent_info(struct extent_info *ext,
+static inline void get_read_extent_info(struct extent_info *ext,
struct f2fs_extent *i_ext)
{
ext->fofs = le32_to_cpu(i_ext->fofs);
@@ -827,7 +885,7 @@ static inline void get_extent_info(struct extent_info *ext,
ext->len = le32_to_cpu(i_ext->len);
}
-static inline void set_raw_extent(struct extent_info *ext,
+static inline void set_raw_read_extent(struct extent_info *ext,
struct f2fs_extent *i_ext)
{
i_ext->fofs = cpu_to_le32(ext->fofs);
@@ -835,17 +893,6 @@ static inline void set_raw_extent(struct extent_info *ext,
i_ext->len = cpu_to_le32(ext->len);
}
-static inline void set_extent_info(struct extent_info *ei, unsigned int fofs,
- u32 blk, unsigned int len)
-{
- ei->fofs = fofs;
- ei->blk = blk;
- ei->len = len;
-#ifdef CONFIG_F2FS_FS_COMPRESSION
- ei->c_len = 0;
-#endif
-}
-
static inline bool __is_discard_mergeable(struct discard_info *back,
struct discard_info *front, unsigned int max_len)
{
@@ -865,41 +912,6 @@ static inline bool __is_discard_front_mergeable(struct discard_info *cur,
return __is_discard_mergeable(cur, front, max_len);
}
-static inline bool __is_extent_mergeable(struct extent_info *back,
- struct extent_info *front)
-{
-#ifdef CONFIG_F2FS_FS_COMPRESSION
- if (back->c_len && back->len != back->c_len)
- return false;
- if (front->c_len && front->len != front->c_len)
- return false;
-#endif
- return (back->fofs + back->len == front->fofs &&
- back->blk + back->len == front->blk);
-}
-
-static inline bool __is_back_mergeable(struct extent_info *cur,
- struct extent_info *back)
-{
- return __is_extent_mergeable(back, cur);
-}
-
-static inline bool __is_front_mergeable(struct extent_info *cur,
- struct extent_info *front)
-{
- return __is_extent_mergeable(cur, front);
-}
-
-extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync);
-static inline void __try_update_largest_extent(struct extent_tree *et,
- struct extent_node *en)
-{
- if (en->ei.len > et->largest.len) {
- et->largest = en->ei;
- et->largest_updated = true;
- }
-}
-
/*
* For free nid management
*/
@@ -1057,9 +1069,6 @@ struct f2fs_sm_info {
/* a threshold to reclaim prefree segments */
unsigned int rec_prefree_segments;
- /* for batched trimming */
- unsigned int trim_sections; /* # of sections to trim */
-
struct list_head sit_entry_set; /* sit entry set list */
unsigned int ipu_policy; /* in-place-update policy */
@@ -1158,7 +1167,10 @@ enum iostat_type {
APP_BUFFERED_IO, /* app buffered write IOs */
APP_WRITE_IO, /* app write IOs */
APP_MAPPED_IO, /* app mapped IOs */
+ APP_BUFFERED_CDATA_IO, /* app buffered write IOs on compressed file */
+ APP_MAPPED_CDATA_IO, /* app mapped write IOs on compressed file */
FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */
+ FS_CDATA_IO, /* data IOs from kworker/fsync/reclaimer on compressed file */
FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */
FS_META_IO, /* meta IOs from kworker/reclaimer */
FS_GC_DATA_IO, /* data IOs from forground gc */
@@ -1172,6 +1184,8 @@ enum iostat_type {
APP_BUFFERED_READ_IO, /* app buffered read IOs */
APP_READ_IO, /* app read IOs */
APP_MAPPED_READ_IO, /* app mapped read IOs */
+ APP_BUFFERED_CDATA_READ_IO, /* app buffered read IOs on compressed file */
+ APP_MAPPED_CDATA_READ_IO, /* app mapped read IOs on compressed file */
FS_DATA_READ_IO, /* data read IOs */
FS_GDATA_READ_IO, /* data read IOs from background gc */
FS_CDATA_READ_IO, /* compressed data read IOs */
@@ -1247,7 +1261,6 @@ enum inode_type {
DIR_INODE, /* for dirty dir inode */
FILE_INODE, /* for dirty regular/symlink inode */
DIRTY_META, /* for all dirtied inode metadata */
- ATOMIC_FILE, /* for all atomic files */
NR_INODE_TYPE,
};
@@ -1309,6 +1322,7 @@ enum {
MAX_TIME,
};
+/* Note that you need to keep synchronization with this gc_mode_names array */
enum {
GC_NORMAL,
GC_IDLE_CB,
@@ -1659,14 +1673,12 @@ struct f2fs_sb_info {
struct mutex flush_lock; /* for flush exclusion */
/* for extent tree cache */
- struct radix_tree_root extent_tree_root;/* cache extent cache entries */
- struct mutex extent_tree_lock; /* locking extent radix tree */
- struct list_head extent_list; /* lru list for shrinker */
- spinlock_t extent_lock; /* locking extent lru list */
- atomic_t total_ext_tree; /* extent tree count */
- struct list_head zombie_list; /* extent zombie tree list */
- atomic_t total_zombie_tree; /* extent zombie tree count */
- atomic_t total_ext_node; /* extent info count */
+ struct extent_tree_info extent_tree[NR_EXTENT_CACHES];
+ atomic64_t allocated_data_blocks; /* for block age extent_cache */
+
+ /* The threshold used for hot and warm data seperation*/
+ unsigned int hot_data_age_threshold;
+ unsigned int warm_data_age_threshold;
/* basic filesystem units */
unsigned int log_sectors_per_block; /* log2 sectors per block */
@@ -1684,7 +1696,7 @@ struct f2fs_sb_info {
unsigned int total_node_count; /* total node block count */
unsigned int total_valid_node_count; /* valid node block count */
int dir_level; /* directory level */
- int readdir_ra; /* readahead inode in readdir */
+ bool readdir_ra; /* readahead inode in readdir */
u64 max_io_bytes; /* max io bytes to merge IOs */
block_t user_block_count; /* # of user blocks */
@@ -1725,12 +1737,11 @@ struct f2fs_sb_info {
unsigned int cur_victim_sec; /* current victim section num */
unsigned int gc_mode; /* current GC state */
unsigned int next_victim_seg[2]; /* next segment in victim section */
- spinlock_t gc_urgent_high_lock;
- bool gc_urgent_high_limited; /* indicates having limited trial count */
- unsigned int gc_urgent_high_remaining; /* remaining trial count for GC_URGENT_HIGH */
+ spinlock_t gc_remaining_trials_lock;
+ /* remaining trial count for GC_URGENT_* and GC_IDLE_* */
+ unsigned int gc_remaining_trials;
/* for skip statistic */
- unsigned int atomic_files; /* # of opened atomic file */
unsigned long long skipped_gc_rwsem; /* FG_GC only */
/* threshold for gc trials on pinned files */
@@ -1752,15 +1763,21 @@ struct f2fs_sb_info {
unsigned int segment_count[2]; /* # of allocated segments */
unsigned int block_count[2]; /* # of allocated blocks */
atomic_t inplace_count; /* # of inplace update */
- atomic64_t total_hit_ext; /* # of lookup extent cache */
- atomic64_t read_hit_rbtree; /* # of hit rbtree extent node */
- atomic64_t read_hit_largest; /* # of hit largest extent node */
- atomic64_t read_hit_cached; /* # of hit cached extent node */
+ /* # of lookup extent cache */
+ atomic64_t total_hit_ext[NR_EXTENT_CACHES];
+ /* # of hit rbtree extent node */
+ atomic64_t read_hit_rbtree[NR_EXTENT_CACHES];
+ /* # of hit cached extent node */
+ atomic64_t read_hit_cached[NR_EXTENT_CACHES];
+ /* # of hit largest extent node in read extent cache */
+ atomic64_t read_hit_largest;
atomic_t inline_xattr; /* # of inline_xattr inodes */
atomic_t inline_inode; /* # of inline_data inodes */
atomic_t inline_dir; /* # of inline_dentry inodes */
atomic_t compr_inode; /* # of compressed inodes */
atomic64_t compr_blocks; /* # of compressed blocks */
+ atomic_t swapfile_inode; /* # of swapfile inodes */
+ atomic_t atomic_files; /* # of opened atomic file */
atomic_t max_aw_cnt; /* max # of atomic writes */
unsigned int io_skip_bggc; /* skip background gc for in-flight IO */
unsigned int other_skip_bggc; /* skip background gc for other reasons */
@@ -1806,6 +1823,10 @@ struct f2fs_sb_info {
struct workqueue_struct *post_read_wq; /* post read workqueue */
+ unsigned char errors[MAX_F2FS_ERRORS]; /* error flags */
+ spinlock_t error_lock; /* protect errors array */
+ bool error_dirty; /* errors of sb is dirty */
+
struct kmem_cache *inline_xattr_slab; /* inline xattr entry */
unsigned int inline_xattr_slab_size; /* default inline xattr slab size */
@@ -2525,7 +2546,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
if (__cp_payload(sbi) > 0) {
if (flag == NAT_BITMAP)
- return &ckpt->sit_nat_version_bitmap;
+ return tmp_ptr;
else
return (unsigned char *)ckpt + F2FS_BLKSIZE;
} else {
@@ -2563,6 +2584,7 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi)
return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum);
}
+extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync);
static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
struct inode *inode, bool is_inode)
{
@@ -2961,7 +2983,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr)
/* Flags that should be inherited by new inodes from their parent. */
#define F2FS_FL_INHERITED (F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL | \
F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \
- F2FS_CASEFOLD_FL | F2FS_COMPR_FL | F2FS_NOCOMP_FL)
+ F2FS_CASEFOLD_FL)
/* Flags that are appropriate for regular files (all but dir-specific ones). */
#define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \
@@ -3059,6 +3081,8 @@ static inline void f2fs_i_blocks_write(struct inode *inode,
set_inode_flag(inode, FI_AUTO_RECOVER);
}
+static inline bool f2fs_is_atomic_file(struct inode *inode);
+
static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size)
{
bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE);
@@ -3068,6 +3092,10 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size)
return;
i_size_write(inode, i_size);
+
+ if (f2fs_is_atomic_file(inode))
+ return;
+
f2fs_mark_inode_dirty_sync(inode, true);
if (clean || recover)
set_inode_flag(inode, FI_AUTO_RECOVER);
@@ -3547,6 +3575,8 @@ int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly);
int f2fs_quota_sync(struct super_block *sb, int type);
loff_t max_file_blocks(struct inode *inode);
void f2fs_quota_off_umount(struct super_block *sb);
+void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason);
+void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error);
int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover);
int f2fs_sync_fs(struct super_block *sb, int sync);
int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi);
@@ -3706,7 +3736,9 @@ static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi)
/*
* checkpoint.c
*/
-void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io);
+void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io,
+ unsigned char reason);
+void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi);
struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index);
struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index);
struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index);
@@ -3736,7 +3768,8 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi);
int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi);
void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio);
void f2fs_remove_dirty_inode(struct inode *inode);
-int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type);
+int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type,
+ bool from_cp);
void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type);
u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi);
int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc);
@@ -3778,8 +3811,9 @@ int f2fs_reserve_new_block(struct dnode_of_data *dn);
int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index);
int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index);
struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
- blk_opf_t op_flags, bool for_write);
-struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index);
+ blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs);
+struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index,
+ pgoff_t *next_pgofs);
struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index,
bool for_write);
struct page *f2fs_get_new_data_page(struct inode *inode,
@@ -3838,9 +3872,19 @@ struct f2fs_stat_info {
struct f2fs_sb_info *sbi;
int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs;
int main_area_segs, main_area_sections, main_area_zones;
- unsigned long long hit_largest, hit_cached, hit_rbtree;
- unsigned long long hit_total, total_ext;
- int ext_tree, zombie_tree, ext_node;
+ unsigned long long hit_cached[NR_EXTENT_CACHES];
+ unsigned long long hit_rbtree[NR_EXTENT_CACHES];
+ unsigned long long total_ext[NR_EXTENT_CACHES];
+ unsigned long long hit_total[NR_EXTENT_CACHES];
+ int ext_tree[NR_EXTENT_CACHES];
+ int zombie_tree[NR_EXTENT_CACHES];
+ int ext_node[NR_EXTENT_CACHES];
+ /* to count memory footprint */
+ unsigned long long ext_mem[NR_EXTENT_CACHES];
+ /* for read extent cache */
+ unsigned long long hit_largest;
+ /* for block age extent cache */
+ unsigned long long allocated_data_blocks;
int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta;
int ndirty_data, ndirty_qdata;
unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all;
@@ -3858,7 +3902,7 @@ struct f2fs_stat_info {
int nr_issued_ckpt, nr_total_ckpt, nr_queued_ckpt;
unsigned int cur_ckpt_time, peak_ckpt_time;
int inline_xattr, inline_inode, inline_dir, append, update, orphans;
- int compr_inode;
+ int compr_inode, swapfile_inode;
unsigned long long compr_blocks;
int aw_cnt, max_aw_cnt;
unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks;
@@ -3899,10 +3943,10 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
#define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++)
#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++)
#define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--)
-#define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext))
-#define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree))
+#define stat_inc_total_hit(sbi, type) (atomic64_inc(&(sbi)->total_hit_ext[type]))
+#define stat_inc_rbtree_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_rbtree[type]))
#define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest))
-#define stat_inc_cached_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_cached))
+#define stat_inc_cached_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_cached[type]))
#define stat_inc_inline_xattr(inode) \
do { \
if (f2fs_has_inline_xattr(inode)) \
@@ -3947,6 +3991,14 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
(atomic64_add(blocks, &F2FS_I_SB(inode)->compr_blocks))
#define stat_sub_compr_blocks(inode, blocks) \
(atomic64_sub(blocks, &F2FS_I_SB(inode)->compr_blocks))
+#define stat_inc_swapfile_inode(inode) \
+ (atomic_inc(&F2FS_I_SB(inode)->swapfile_inode))
+#define stat_dec_swapfile_inode(inode) \
+ (atomic_dec(&F2FS_I_SB(inode)->swapfile_inode))
+#define stat_inc_atomic_inode(inode) \
+ (atomic_inc(&F2FS_I_SB(inode)->atomic_files))
+#define stat_dec_atomic_inode(inode) \
+ (atomic_dec(&F2FS_I_SB(inode)->atomic_files))
#define stat_inc_meta_count(sbi, blkaddr) \
do { \
if (blkaddr < SIT_I(sbi)->sit_base_addr) \
@@ -3966,7 +4018,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
(atomic_inc(&(sbi)->inplace_count))
#define stat_update_max_atomic_write(inode) \
do { \
- int cur = F2FS_I_SB(inode)->atomic_files; \
+ int cur = atomic_read(&F2FS_I_SB(inode)->atomic_files); \
int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \
if (cur > max) \
atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \
@@ -4017,10 +4069,10 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi);
#define stat_other_skip_bggc_count(sbi) do { } while (0)
#define stat_inc_dirty_inode(sbi, type) do { } while (0)
#define stat_dec_dirty_inode(sbi, type) do { } while (0)
-#define stat_inc_total_hit(sbi) do { } while (0)
-#define stat_inc_rbtree_node_hit(sbi) do { } while (0)
+#define stat_inc_total_hit(sbi, type) do { } while (0)
+#define stat_inc_rbtree_node_hit(sbi, type) do { } while (0)
#define stat_inc_largest_node_hit(sbi) do { } while (0)
-#define stat_inc_cached_node_hit(sbi) do { } while (0)
+#define stat_inc_cached_node_hit(sbi, type) do { } while (0)
#define stat_inc_inline_xattr(inode) do { } while (0)
#define stat_dec_inline_xattr(inode) do { } while (0)
#define stat_inc_inline_inode(inode) do { } while (0)
@@ -4031,6 +4083,10 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi);
#define stat_dec_compr_inode(inode) do { } while (0)
#define stat_add_compr_blocks(inode, blocks) do { } while (0)
#define stat_sub_compr_blocks(inode, blocks) do { } while (0)
+#define stat_inc_swapfile_inode(inode) do { } while (0)
+#define stat_dec_swapfile_inode(inode) do { } while (0)
+#define stat_inc_atomic_inode(inode) do { } while (0)
+#define stat_dec_atomic_inode(inode) do { } while (0)
#define stat_update_max_atomic_write(inode) do { } while (0)
#define stat_inc_meta_count(sbi, blkaddr) do { } while (0)
#define stat_inc_seg_type(sbi, curseg) do { } while (0)
@@ -4122,20 +4178,34 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root,
bool force, bool *leftmost);
bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi,
struct rb_root_cached *root, bool check_key);
-unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink);
-void f2fs_init_extent_tree(struct inode *inode, struct page *ipage);
+void f2fs_init_extent_tree(struct inode *inode);
void f2fs_drop_extent_tree(struct inode *inode);
-unsigned int f2fs_destroy_extent_node(struct inode *inode);
+void f2fs_destroy_extent_node(struct inode *inode);
void f2fs_destroy_extent_tree(struct inode *inode);
-bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs,
- struct extent_info *ei);
-void f2fs_update_extent_cache(struct dnode_of_data *dn);
-void f2fs_update_extent_cache_range(struct dnode_of_data *dn,
- pgoff_t fofs, block_t blkaddr, unsigned int len);
void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi);
int __init f2fs_create_extent_cache(void);
void f2fs_destroy_extent_cache(void);
+/* read extent cache ops */
+void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage);
+bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs,
+ struct extent_info *ei);
+void f2fs_update_read_extent_cache(struct dnode_of_data *dn);
+void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn,
+ pgoff_t fofs, block_t blkaddr, unsigned int len);
+unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi,
+ int nr_shrink);
+
+/* block age extent cache ops */
+void f2fs_init_age_extent_tree(struct inode *inode);
+bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs,
+ struct extent_info *ei);
+void f2fs_update_age_extent_cache(struct dnode_of_data *dn);
+void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn,
+ pgoff_t fofs, unsigned int len);
+unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi,
+ int nr_shrink);
+
/*
* sysfs.c
*/
@@ -4205,9 +4275,9 @@ int f2fs_write_multi_pages(struct compress_ctx *cc,
struct writeback_control *wbc,
enum iostat_type io_type);
int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index);
-void f2fs_update_extent_tree_range_compressed(struct inode *inode,
- pgoff_t fofs, block_t blkaddr, unsigned int llen,
- unsigned int c_len);
+void f2fs_update_read_extent_tree_range_compressed(struct inode *inode,
+ pgoff_t fofs, block_t blkaddr,
+ unsigned int llen, unsigned int c_len);
int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
unsigned nr_pages, sector_t *last_block_in_bio,
bool is_readahead, bool for_write);
@@ -4288,9 +4358,10 @@ static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi,
static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi,
nid_t ino) { }
#define inc_compr_inode_stat(inode) do { } while (0)
-static inline void f2fs_update_extent_tree_range_compressed(struct inode *inode,
- pgoff_t fofs, block_t blkaddr, unsigned int llen,
- unsigned int c_len) { }
+static inline void f2fs_update_read_extent_tree_range_compressed(
+ struct inode *inode,
+ pgoff_t fofs, block_t blkaddr,
+ unsigned int llen, unsigned int c_len) { }
#endif
static inline int set_compress_context(struct inode *inode)
@@ -4341,7 +4412,7 @@ static inline bool f2fs_disable_compressed_file(struct inode *inode)
}
#define F2FS_FEATURE_FUNCS(name, flagname) \
-static inline int f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \
+static inline bool f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \
{ \
return F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_##flagname); \
}
@@ -4361,26 +4432,6 @@ F2FS_FEATURE_FUNCS(casefold, CASEFOLD);
F2FS_FEATURE_FUNCS(compression, COMPRESSION);
F2FS_FEATURE_FUNCS(readonly, RO);
-static inline bool f2fs_may_extent_tree(struct inode *inode)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-
- if (!test_opt(sbi, EXTENT_CACHE) ||
- is_inode_flag_set(inode, FI_NO_EXTENT) ||
- (is_inode_flag_set(inode, FI_COMPRESSED_FILE) &&
- !f2fs_sb_has_readonly(sbi)))
- return false;
-
- /*
- * for recovered files during mount do not create extents
- * if shrinker is not registered.
- */
- if (list_empty(&sbi->s_list))
- return false;
-
- return S_ISREG(inode->i_mode);
-}
-
#ifdef CONFIG_BLK_DEV_ZONED
static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi,
block_t blkaddr)
@@ -4471,17 +4522,6 @@ static inline void f2fs_i_compr_blocks_update(struct inode *inode,
f2fs_mark_inode_dirty_sync(inode, true);
}
-static inline int block_unaligned_IO(struct inode *inode,
- struct kiocb *iocb, struct iov_iter *iter)
-{
- unsigned int i_blkbits = READ_ONCE(inode->i_blkbits);
- unsigned int blocksize_mask = (1 << i_blkbits) - 1;
- loff_t offset = iocb->ki_pos;
- unsigned long align = offset | iov_iter_alignment(iter);
-
- return align & blocksize_mask;
-}
-
static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi,
int flag)
{
@@ -4492,35 +4532,6 @@ static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi,
return sbi->aligned_blksize;
}
-static inline bool f2fs_force_buffered_io(struct inode *inode,
- struct kiocb *iocb, struct iov_iter *iter)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- int rw = iov_iter_rw(iter);
-
- if (!fscrypt_dio_supported(iocb, iter))
- return true;
- if (fsverity_active(inode))
- return true;
- if (f2fs_compressed_file(inode))
- return true;
-
- /* disallow direct IO if any of devices has unaligned blksize */
- if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize)
- return true;
-
- if (f2fs_lfs_mode(sbi) && (rw == WRITE)) {
- if (block_unaligned_IO(inode, iocb, iter))
- return true;
- if (F2FS_IO_ALIGNED(sbi))
- return true;
- }
- if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED))
- return true;
-
- return false;
-}
-
static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
{
return fsverity_active(inode) &&
@@ -4573,6 +4584,11 @@ static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, pgoff_t ofs,
}
}
+static inline bool f2fs_is_readonly(struct f2fs_sb_info *sbi)
+{
+ return f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb);
+}
+
#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index ce4905a073b3..a6c401279886 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -43,8 +43,8 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf)
ret = filemap_fault(vmf);
if (!ret)
- f2fs_update_iostat(F2FS_I_SB(inode), APP_MAPPED_READ_IO,
- F2FS_BLKSIZE);
+ f2fs_update_iostat(F2FS_I_SB(inode), inode,
+ APP_MAPPED_READ_IO, F2FS_BLKSIZE);
trace_f2fs_filemap_fault(inode, vmf->pgoff, (unsigned long)ret);
@@ -154,7 +154,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
if (!PageUptodate(page))
SetPageUptodate(page);
- f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE);
+ f2fs_update_iostat(sbi, inode, APP_MAPPED_IO, F2FS_BLKSIZE);
f2fs_update_time(sbi, REQ_TIME);
trace_f2fs_vm_page_mkwrite(page, DATA);
@@ -571,7 +571,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
raw_node = F2FS_NODE(dn->node_page);
addr = blkaddr_in_node(raw_node) + base + ofs;
- /* Assumption: truncateion starts with cluster */
+ /* Assumption: truncation starts with cluster */
for (; count > 0; count--, addr++, dn->ofs_in_node++, cluster_index++) {
block_t blkaddr = le32_to_cpu(*addr);
@@ -618,7 +618,8 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
*/
fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page),
dn->inode) + ofs;
- f2fs_update_extent_cache_range(dn, fofs, 0, len);
+ f2fs_update_read_extent_cache_range(dn, fofs, 0, len);
+ f2fs_update_age_extent_cache_range(dn, fofs, nr_free);
dec_valid_block_count(sbi, dn->inode, nr_free);
}
dn->ofs_in_node = ofs;
@@ -808,6 +809,34 @@ int f2fs_truncate(struct inode *inode)
return 0;
}
+static bool f2fs_force_buffered_io(struct inode *inode, int rw)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ if (!fscrypt_dio_supported(inode))
+ return true;
+ if (fsverity_active(inode))
+ return true;
+ if (f2fs_compressed_file(inode))
+ return true;
+
+ /* disallow direct IO if any of devices has unaligned blksize */
+ if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize)
+ return true;
+ /*
+ * for blkzoned device, fallback direct IO to buffered IO, so
+ * all IOs can be serialized by log-structured write.
+ */
+ if (f2fs_sb_has_blkzoned(sbi) && (rw == WRITE))
+ return true;
+ if (f2fs_lfs_mode(sbi) && rw == WRITE && F2FS_IO_ALIGNED(sbi))
+ return true;
+ if (is_sbi_flag_set(sbi, SBI_CP_DISABLED))
+ return true;
+
+ return false;
+}
+
int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
@@ -824,6 +853,24 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
stat->btime.tv_nsec = fi->i_crtime.tv_nsec;
}
+ /*
+ * Return the DIO alignment restrictions if requested. We only return
+ * this information when requested, since on encrypted files it might
+ * take a fair bit of work to get if the file wasn't opened recently.
+ *
+ * f2fs sometimes supports DIO reads but not DIO writes. STATX_DIOALIGN
+ * cannot represent that, so in that case we report no DIO support.
+ */
+ if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
+ unsigned int bsize = i_blocksize(inode);
+
+ stat->result_mask |= STATX_DIOALIGN;
+ if (!f2fs_force_buffered_io(inode, WRITE)) {
+ stat->dio_mem_align = bsize;
+ stat->dio_offset_align = bsize;
+ }
+ }
+
flags = fi->i_flags;
if (flags & F2FS_COMPR_FL)
stat->attributes |= STATX_ATTR_COMPRESSED;
@@ -871,9 +918,10 @@ static void __setattr_copy(struct user_namespace *mnt_userns,
inode->i_ctime = attr->ia_ctime;
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
- kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
+ vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
- if (!in_group_p(kgid) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
+ if (!vfsgid_in_group_p(vfsgid) &&
+ !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
mode &= ~S_ISGID;
set_acl_inode(inode, mode);
}
@@ -978,7 +1026,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
__setattr_copy(mnt_userns, inode, attr);
if (attr->ia_valid & ATTR_MODE) {
- err = posix_acl_chmod(mnt_userns, inode, f2fs_get_inode_mode(inode));
+ err = posix_acl_chmod(mnt_userns, dentry, f2fs_get_inode_mode(inode));
if (is_inode_flag_set(inode, FI_ACL_MODE)) {
if (!err)
@@ -999,7 +1047,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
const struct inode_operations f2fs_file_inode_operations = {
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
- .get_acl = f2fs_get_acl,
+ .get_inode_acl = f2fs_get_acl,
.set_acl = f2fs_set_acl,
.listxattr = f2fs_listxattr,
.fiemap = f2fs_fiemap,
@@ -1155,6 +1203,7 @@ next_dnode:
!f2fs_is_valid_blkaddr(sbi, *blkaddr,
DATA_GENERIC_ENHANCE)) {
f2fs_put_dnode(&dn);
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
return -EFSCORRUPTED;
}
@@ -1439,6 +1488,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
if (!f2fs_is_valid_blkaddr(sbi, dn->data_blkaddr,
DATA_GENERIC_ENHANCE)) {
ret = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
break;
}
@@ -1447,7 +1497,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
f2fs_set_data_blkaddr(dn);
}
- f2fs_update_extent_cache_range(dn, start, 0, index - start);
+ f2fs_update_read_extent_cache_range(dn, start, 0, index - start);
return ret;
}
@@ -1866,6 +1916,10 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
if (!f2fs_disable_compressed_file(inode))
return -EINVAL;
} else {
+ /* try to convert inline_data to support compression */
+ int err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
if (!f2fs_may_compress(inode))
return -EINVAL;
if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))
@@ -1981,13 +2035,14 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg)
return put_user(inode->i_generation, (int __user *)arg);
}
-static int f2fs_ioc_start_atomic_write(struct file *filp)
+static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate)
{
struct inode *inode = file_inode(filp);
struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct inode *pinode;
+ loff_t isize;
int ret;
if (!inode_owner_or_capable(mnt_userns, inode))
@@ -2046,15 +2101,25 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
goto out;
}
- f2fs_i_size_write(fi->cow_inode, i_size_read(inode));
- spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
- sbi->atomic_files++;
- spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
+ f2fs_write_inode(inode, NULL);
+
+ stat_inc_atomic_inode(inode);
set_inode_flag(inode, FI_ATOMIC_FILE);
set_inode_flag(fi->cow_inode, FI_COW_FILE);
clear_inode_flag(fi->cow_inode, FI_INLINE_DATA);
+
+ isize = i_size_read(inode);
+ fi->original_i_size = isize;
+ if (truncate) {
+ set_inode_flag(inode, FI_ATOMIC_REPLACE);
+ truncate_inode_pages_final(inode->i_mapping);
+ f2fs_i_size_write(inode, 0);
+ isize = 0;
+ }
+ f2fs_i_size_write(fi->cow_inode, isize);
+
f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
f2fs_update_time(sbi, REQ_TIME);
@@ -2086,16 +2151,14 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
if (f2fs_is_atomic_file(inode)) {
ret = f2fs_commit_atomic_write(inode);
- if (ret)
- goto unlock_out;
-
- ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
if (!ret)
- f2fs_abort_atomic_write(inode, false);
+ ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
+
+ f2fs_abort_atomic_write(inode, ret);
} else {
ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false);
}
-unlock_out:
+
inode_unlock(inode);
mnt_drop_write_file(filp);
return ret;
@@ -2144,7 +2207,8 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
if (ret) {
if (ret == -EROFS) {
ret = 0;
- f2fs_stop_checkpoint(sbi, false);
+ f2fs_stop_checkpoint(sbi, false,
+ STOP_CP_REASON_SHUTDOWN);
set_sbi_flag(sbi, SBI_IS_SHUTDOWN);
trace_f2fs_shutdown(sbi, in, ret);
}
@@ -2157,7 +2221,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
ret = freeze_bdev(sb->s_bdev);
if (ret)
goto out;
- f2fs_stop_checkpoint(sbi, false);
+ f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN);
set_sbi_flag(sbi, SBI_IS_SHUTDOWN);
thaw_bdev(sb->s_bdev);
break;
@@ -2166,16 +2230,16 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
ret = f2fs_sync_fs(sb, 1);
if (ret)
goto out;
- f2fs_stop_checkpoint(sbi, false);
+ f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN);
set_sbi_flag(sbi, SBI_IS_SHUTDOWN);
break;
case F2FS_GOING_DOWN_NOSYNC:
- f2fs_stop_checkpoint(sbi, false);
+ f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN);
set_sbi_flag(sbi, SBI_IS_SHUTDOWN);
break;
case F2FS_GOING_DOWN_METAFLUSH:
f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO);
- f2fs_stop_checkpoint(sbi, false);
+ f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN);
set_sbi_flag(sbi, SBI_IS_SHUTDOWN);
break;
case F2FS_GOING_DOWN_NEED_FSCK:
@@ -2495,7 +2559,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
struct f2fs_map_blocks map = { .m_next_extent = NULL,
.m_seg_type = NO_CHECK_TYPE,
.m_may_create = false };
- struct extent_info ei = {0, 0, 0};
+ struct extent_info ei = {0, };
pgoff_t pg_start, pg_end, next_pgofs;
unsigned int blk_per_seg = sbi->blocks_per_seg;
unsigned int total = 0, sec_num;
@@ -2527,7 +2591,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
* lookup mapping info in extent cache, skip defragmenting if physical
* block addresses are continuous.
*/
- if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) {
+ if (f2fs_lookup_read_extent_cache(inode, pg_start, &ei)) {
if (ei.fofs + ei.len >= pg_end)
goto out;
}
@@ -3321,8 +3385,10 @@ static int release_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
if (!__is_valid_data_blkaddr(blkaddr))
continue;
if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr,
- DATA_GENERIC_ENHANCE)))
+ DATA_GENERIC_ENHANCE))) {
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
return -EFSCORRUPTED;
+ }
}
while (count) {
@@ -3483,8 +3549,10 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
if (!__is_valid_data_blkaddr(blkaddr))
continue;
if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr,
- DATA_GENERIC_ENHANCE)))
+ DATA_GENERIC_ENHANCE))) {
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
return -EFSCORRUPTED;
+ }
}
while (count) {
@@ -3756,6 +3824,8 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
DATA_GENERIC_ENHANCE)) {
ret = -EFSCORRUPTED;
f2fs_put_dnode(&dn);
+ f2fs_handle_error(sbi,
+ ERROR_INVALID_BLKADDR);
goto out;
}
@@ -4077,7 +4147,9 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case FS_IOC_GETVERSION:
return f2fs_ioc_getversion(filp, arg);
case F2FS_IOC_START_ATOMIC_WRITE:
- return f2fs_ioc_start_atomic_write(filp);
+ return f2fs_ioc_start_atomic_write(filp, false);
+ case F2FS_IOC_START_ATOMIC_REPLACE:
+ return f2fs_ioc_start_atomic_write(filp, true);
case F2FS_IOC_COMMIT_ATOMIC_WRITE:
return f2fs_ioc_commit_atomic_write(filp);
case F2FS_IOC_ABORT_ATOMIC_WRITE:
@@ -4182,7 +4254,7 @@ static bool f2fs_should_use_dio(struct inode *inode, struct kiocb *iocb,
if (!(iocb->ki_flags & IOCB_DIRECT))
return false;
- if (f2fs_force_buffered_io(inode, iocb, iter))
+ if (f2fs_force_buffered_io(inode, iov_iter_rw(iter)))
return false;
/*
@@ -4212,7 +4284,7 @@ static int f2fs_dio_read_end_io(struct kiocb *iocb, ssize_t size, int error,
dec_page_count(sbi, F2FS_DIO_READ);
if (error)
return error;
- f2fs_update_iostat(sbi, APP_DIRECT_READ_IO, size);
+ f2fs_update_iostat(sbi, NULL, APP_DIRECT_READ_IO, size);
return 0;
}
@@ -4301,7 +4373,8 @@ skip_read_trace:
} else {
ret = filemap_read(iocb, to, 0);
if (ret > 0)
- f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret);
+ f2fs_update_iostat(F2FS_I_SB(inode), inode,
+ APP_BUFFERED_READ_IO, ret);
}
if (trace_f2fs_dataread_end_enabled())
trace_f2fs_dataread_end(inode, pos, ret);
@@ -4418,7 +4491,8 @@ static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb,
if (ret > 0) {
iocb->ki_pos += ret;
- f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_IO, ret);
+ f2fs_update_iostat(F2FS_I_SB(inode), inode,
+ APP_BUFFERED_IO, ret);
}
return ret;
}
@@ -4431,7 +4505,7 @@ static int f2fs_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error,
dec_page_count(sbi, F2FS_DIO_WRITE);
if (error)
return error;
- f2fs_update_iostat(sbi, APP_DIRECT_IO, size);
+ f2fs_update_iostat(sbi, NULL, APP_DIRECT_IO, size);
return 0;
}
@@ -4619,7 +4693,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
skip_write_trace:
/* Do the actual write. */
ret = dio ?
- f2fs_dio_write_iter(iocb, from, &may_need_sync):
+ f2fs_dio_write_iter(iocb, from, &may_need_sync) :
f2fs_buffered_write_iter(iocb, from);
if (trace_f2fs_datawrite_end_enabled())
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 6da21d405ce1..6e2cae3d2e71 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -74,7 +74,8 @@ static int gc_thread_func(void *data)
if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
f2fs_show_injection_info(sbi, FAULT_CHECKPOINT);
- f2fs_stop_checkpoint(sbi, false);
+ f2fs_stop_checkpoint(sbi, false,
+ STOP_CP_REASON_FAULT_INJECT);
}
if (!sb_start_write_trylock(sbi->sb)) {
@@ -95,20 +96,6 @@ static int gc_thread_func(void *data)
* invalidated soon after by user update or deletion.
* So, I'd like to wait some time to collect dirty segments.
*/
- if (sbi->gc_mode == GC_URGENT_HIGH) {
- spin_lock(&sbi->gc_urgent_high_lock);
- if (sbi->gc_urgent_high_limited) {
- if (!sbi->gc_urgent_high_remaining) {
- sbi->gc_urgent_high_limited = false;
- spin_unlock(&sbi->gc_urgent_high_lock);
- sbi->gc_mode = GC_NORMAL;
- continue;
- }
- sbi->gc_urgent_high_remaining--;
- }
- spin_unlock(&sbi->gc_urgent_high_lock);
- }
-
if (sbi->gc_mode == GC_URGENT_HIGH ||
sbi->gc_mode == GC_URGENT_MID) {
wait_ms = gc_th->urgent_sleep_time;
@@ -154,6 +141,10 @@ do_gc:
/* don't bother wait_ms by foreground gc */
if (!foreground)
wait_ms = gc_th->no_gc_sleep_time;
+ } else {
+ /* reset wait_ms to default sleep time */
+ if (wait_ms == gc_th->no_gc_sleep_time)
+ wait_ms = gc_th->min_sleep_time;
}
if (foreground)
@@ -165,6 +156,15 @@ do_gc:
/* balancing f2fs's metadata periodically */
f2fs_balance_fs_bg(sbi, true);
next:
+ if (sbi->gc_mode != GC_NORMAL) {
+ spin_lock(&sbi->gc_remaining_trials_lock);
+ if (sbi->gc_remaining_trials) {
+ sbi->gc_remaining_trials--;
+ if (!sbi->gc_remaining_trials)
+ sbi->gc_mode = GC_NORMAL;
+ }
+ spin_unlock(&sbi->gc_remaining_trials_lock);
+ }
sb_end_write(sbi->sb);
} while (!kthread_should_stop());
@@ -175,13 +175,10 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
{
struct f2fs_gc_kthread *gc_th;
dev_t dev = sbi->sb->s_bdev->bd_dev;
- int err = 0;
gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
- if (!gc_th) {
- err = -ENOMEM;
- goto out;
- }
+ if (!gc_th)
+ return -ENOMEM;
gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME;
gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
@@ -196,12 +193,14 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(gc_th->f2fs_gc_task)) {
- err = PTR_ERR(gc_th->f2fs_gc_task);
+ int err = PTR_ERR(gc_th->f2fs_gc_task);
+
kfree(gc_th);
sbi->gc_thread = NULL;
+ return err;
}
-out:
- return err;
+
+ return 0;
}
void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi)
@@ -285,7 +284,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
/* let's select beginning hot/small space first in no_heap mode*/
if (f2fs_need_rand_seg(sbi))
- p->offset = prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec);
+ p->offset = get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec);
else if (test_opt(sbi, NOHEAP) &&
(type == CURSEG_HOT_DATA || IS_NODESEG(type)))
p->offset = 0;
@@ -1082,7 +1081,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
{
struct page *node_page;
nid_t nid;
- unsigned int ofs_in_node;
+ unsigned int ofs_in_node, max_addrs, base;
block_t source_blkaddr;
nid = le32_to_cpu(sum->nid);
@@ -1108,6 +1107,21 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
return false;
}
+ if (IS_INODE(node_page)) {
+ base = offset_in_addr(F2FS_INODE(node_page));
+ max_addrs = DEF_ADDRS_PER_INODE;
+ } else {
+ base = 0;
+ max_addrs = DEF_ADDRS_PER_BLOCK;
+ }
+
+ if (base + ofs_in_node >= max_addrs) {
+ f2fs_err(sbi, "Inconsistent blkaddr offset: base:%u, ofs_in_node:%u, max:%u, ino:%u, nid:%u",
+ base, ofs_in_node, max_addrs, dni->ino, dni->nid);
+ f2fs_put_page(node_page, 1);
+ return false;
+ }
+
*nofs = ofs_of_node(node_page);
source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node);
f2fs_put_page(node_page, 1);
@@ -1136,7 +1150,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
struct page *page;
- struct extent_info ei = {0, 0, 0};
+ struct extent_info ei = {0, };
struct f2fs_io_info fio = {
.sbi = sbi,
.ino = inode->i_ino,
@@ -1154,11 +1168,12 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
if (!page)
return -ENOMEM;
- if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ if (f2fs_lookup_read_extent_cache(inode, index, &ei)) {
dn.data_blkaddr = ei.blk + index - ei.fofs;
if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
DATA_GENERIC_ENHANCE_READ))) {
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto put_page;
}
goto got_it;
@@ -1177,6 +1192,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
DATA_GENERIC_ENHANCE))) {
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto put_page;
}
got_it:
@@ -1206,8 +1222,8 @@ got_it:
f2fs_put_page(fio.encrypted_page, 0);
f2fs_put_page(page, 1);
- f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE);
- f2fs_update_iostat(sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE);
+ f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE);
+ f2fs_update_iostat(sbi, NULL, FS_GDATA_READ_IO, F2FS_BLKSIZE);
return 0;
put_encrypted_page:
@@ -1307,8 +1323,10 @@ static int move_data_block(struct inode *inode, block_t bidx,
goto up_out;
}
- f2fs_update_iostat(fio.sbi, FS_DATA_READ_IO, F2FS_BLKSIZE);
- f2fs_update_iostat(fio.sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE);
+ f2fs_update_iostat(fio.sbi, inode, FS_DATA_READ_IO,
+ F2FS_BLKSIZE);
+ f2fs_update_iostat(fio.sbi, NULL, FS_GDATA_READ_IO,
+ F2FS_BLKSIZE);
lock_page(mpage);
if (unlikely(mpage->mapping != META_MAPPING(fio.sbi) ||
@@ -1360,7 +1378,7 @@ static int move_data_block(struct inode *inode, block_t bidx,
goto put_page_out;
}
- f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE);
+ f2fs_update_iostat(fio.sbi, NULL, FS_GC_DATA_IO, F2FS_BLKSIZE);
f2fs_update_data_blkaddr(&dn, newaddr);
set_inode_flag(inode, FI_APPEND_WRITE);
@@ -1554,8 +1572,8 @@ next_step:
continue;
}
- data_page = f2fs_get_read_data_page(inode,
- start_bidx, REQ_RAHEAD, true);
+ data_page = f2fs_get_read_data_page(inode, start_bidx,
+ REQ_RAHEAD, true, NULL);
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (IS_ERR(data_page)) {
iput(inode);
@@ -1706,7 +1724,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SSA and SIT",
segno, type, GET_SUM_TYPE((&sum->footer)));
set_sbi_flag(sbi, SBI_NEED_FSCK);
- f2fs_stop_checkpoint(sbi, false);
+ f2fs_stop_checkpoint(sbi, false,
+ STOP_CP_REASON_CORRUPTED_SUMMARY);
goto skip;
}
@@ -1734,8 +1753,9 @@ freed:
get_valid_blocks(sbi, segno, false) == 0)
seg_freed++;
- if (__is_large_section(sbi) && segno + 1 < end_segno)
- sbi->next_victim_seg[gc_type] = segno + 1;
+ if (__is_large_section(sbi))
+ sbi->next_victim_seg[gc_type] =
+ (segno + 1 < end_segno) ? segno + 1 : NULL_SEGNO;
skip:
f2fs_put_page(sum_page, 0);
}
@@ -1888,9 +1908,7 @@ int __init f2fs_create_garbage_collection_cache(void)
{
victim_entry_slab = f2fs_kmem_cache_create("f2fs_victim_entry",
sizeof(struct victim_entry));
- if (!victim_entry_slab)
- return -ENOMEM;
- return 0;
+ return victim_entry_slab ? 0 : -ENOMEM;
}
void f2fs_destroy_garbage_collection_cache(void)
@@ -2123,8 +2141,6 @@ out_unlock:
if (err)
return err;
- set_sbi_flag(sbi, SBI_IS_RESIZEFS);
-
freeze_super(sbi->sb);
f2fs_down_write(&sbi->gc_lock);
f2fs_down_write(&sbi->cp_global_sem);
@@ -2140,6 +2156,7 @@ out_unlock:
if (err)
goto out_err;
+ set_sbi_flag(sbi, SBI_IS_RESIZEFS);
err = free_segment_range(sbi, secs, false);
if (err)
goto recover_out;
@@ -2163,6 +2180,7 @@ out_unlock:
f2fs_commit_super(sbi, false);
}
recover_out:
+ clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
if (err) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_err(sbi, "resize_fs failed, should run fsck to repair!");
@@ -2175,6 +2193,5 @@ out_err:
f2fs_up_write(&sbi->cp_global_sem);
f2fs_up_write(&sbi->gc_lock);
thaw_super(sbi->sb);
- clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
return err;
}
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index bf46a7dfbea2..21a495234ffd 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -64,7 +64,6 @@ bool f2fs_may_inline_dentry(struct inode *inode)
void f2fs_do_read_inline_data(struct page *page, struct page *ipage)
{
struct inode *inode = page->mapping->host;
- void *src_addr, *dst_addr;
if (PageUptodate(page))
return;
@@ -74,11 +73,8 @@ void f2fs_do_read_inline_data(struct page *page, struct page *ipage)
zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE);
/* Copy the whole inline data block */
- src_addr = inline_data_addr(inode, ipage);
- dst_addr = kmap_atomic(page);
- memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode));
- flush_dcache_page(page);
- kunmap_atomic(dst_addr);
+ memcpy_to_page(page, 0, inline_data_addr(inode, ipage),
+ MAX_INLINE_DATA(inode));
if (!PageUptodate(page))
SetPageUptodate(page);
}
@@ -164,6 +160,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
set_sbi_flag(fio.sbi, SBI_NEED_FSCK);
f2fs_warn(fio.sbi, "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.",
__func__, dn->inode->i_ino, dn->data_blkaddr);
+ f2fs_handle_error(fio.sbi, ERROR_INVALID_BLKADDR);
return -EFSCORRUPTED;
}
@@ -246,7 +243,6 @@ out:
int f2fs_write_inline_data(struct inode *inode, struct page *page)
{
- void *src_addr, *dst_addr;
struct dnode_of_data dn;
int err;
@@ -263,10 +259,8 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
f2fs_bug_on(F2FS_I_SB(inode), page->index);
f2fs_wait_on_page_writeback(dn.inode_page, NODE, true, true);
- src_addr = kmap_atomic(page);
- dst_addr = inline_data_addr(inode, dn.inode_page);
- memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode));
- kunmap_atomic(src_addr);
+ memcpy_from_page(inline_data_addr(inode, dn.inode_page),
+ page, 0, MAX_INLINE_DATA(inode));
set_page_dirty(dn.inode_page);
f2fs_clear_page_cache_dirty_tag(page);
@@ -419,6 +413,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK);
f2fs_warn(F2FS_P_SB(page), "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.",
__func__, dir->i_ino, dn.data_blkaddr);
+ f2fs_handle_error(F2FS_P_SB(page), ERROR_INVALID_BLKADDR);
err = -EFSCORRUPTED;
goto out;
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 6d11c365d7b4..ff6cf66ed46b 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -81,8 +81,10 @@ static int __written_first_block(struct f2fs_sb_info *sbi,
if (!__is_valid_data_blkaddr(addr))
return 1;
- if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC_ENHANCE))
+ if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC_ENHANCE)) {
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
return -EFSCORRUPTED;
+ }
return 0;
}
@@ -260,8 +262,8 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
return false;
}
- if (fi->extent_tree) {
- struct extent_info *ei = &fi->extent_tree->largest;
+ if (fi->extent_tree[EX_READ]) {
+ struct extent_info *ei = &fi->extent_tree[EX_READ]->largest;
if (ei->len &&
(!f2fs_is_valid_blkaddr(sbi, ei->blk,
@@ -333,6 +335,16 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
return true;
}
+static void init_idisk_time(struct inode *inode)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+
+ fi->i_disk_time[0] = inode->i_atime;
+ fi->i_disk_time[1] = inode->i_ctime;
+ fi->i_disk_time[2] = inode->i_mtime;
+ fi->i_disk_time[3] = fi->i_crtime;
+}
+
static int do_read_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -380,8 +392,6 @@ static int do_read_inode(struct inode *inode)
fi->i_pino = le32_to_cpu(ri->i_pino);
fi->i_dir_level = ri->i_dir_level;
- f2fs_init_extent_tree(inode, node_page);
-
get_inline_info(inode, ri);
fi->i_extra_isize = f2fs_has_extra_attr(inode) ?
@@ -405,6 +415,7 @@ static int do_read_inode(struct inode *inode)
if (!sanity_check_inode(inode, node_page)) {
f2fs_put_page(node_page, 1);
+ f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE);
return -EFSCORRUPTED;
}
@@ -465,10 +476,12 @@ static int do_read_inode(struct inode *inode)
}
}
- fi->i_disk_time[0] = inode->i_atime;
- fi->i_disk_time[1] = inode->i_ctime;
- fi->i_disk_time[2] = inode->i_mtime;
- fi->i_disk_time[3] = fi->i_crtime;
+ init_idisk_time(inode);
+
+ /* Need all the flag bits */
+ f2fs_init_read_extent_tree(inode, node_page);
+ f2fs_init_age_extent_tree(inode);
+
f2fs_put_page(node_page, 1);
stat_inc_inline_xattr(inode);
@@ -480,6 +493,12 @@ static int do_read_inode(struct inode *inode)
return 0;
}
+static bool is_meta_ino(struct f2fs_sb_info *sbi, unsigned int ino)
+{
+ return ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi) ||
+ ino == F2FS_COMPRESS_INO(sbi);
+}
+
struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
@@ -491,16 +510,22 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW)) {
+ if (is_meta_ino(sbi, ino)) {
+ f2fs_err(sbi, "inaccessible inode: %lu, run fsck to repair", ino);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ ret = -EFSCORRUPTED;
+ trace_f2fs_iget_exit(inode, ret);
+ iput(inode);
+ f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE);
+ return ERR_PTR(ret);
+ }
+
trace_f2fs_iget(inode);
return inode;
}
- if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi))
- goto make_now;
-#ifdef CONFIG_F2FS_FS_COMPRESSION
- if (ino == F2FS_COMPRESS_INO(sbi))
+ if (is_meta_ino(sbi, ino))
goto make_now;
-#endif
ret = do_read_inode(inode);
if (ret)
@@ -585,7 +610,7 @@ retry:
void f2fs_update_inode(struct inode *inode, struct page *node_page)
{
struct f2fs_inode *ri;
- struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ];
f2fs_wait_on_page_writeback(node_page, NODE, true, true);
set_page_dirty(node_page);
@@ -599,12 +624,15 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
ri->i_uid = cpu_to_le32(i_uid_read(inode));
ri->i_gid = cpu_to_le32(i_gid_read(inode));
ri->i_links = cpu_to_le32(inode->i_nlink);
- ri->i_size = cpu_to_le64(i_size_read(inode));
ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks) + 1);
+ if (!f2fs_is_atomic_file(inode) ||
+ is_inode_flag_set(inode, FI_ATOMIC_COMMITTED))
+ ri->i_size = cpu_to_le64(i_size_read(inode));
+
if (et) {
read_lock(&et->lock);
- set_raw_extent(&et->largest, &ri->i_ext);
+ set_raw_read_extent(&et->largest, &ri->i_ext);
read_unlock(&et->lock);
} else {
memset(&ri->i_ext, 0, sizeof(ri->i_ext));
@@ -676,11 +704,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
if (inode->i_nlink == 0)
clear_page_private_inline(node_page);
- F2FS_I(inode)->i_disk_time[0] = inode->i_atime;
- F2FS_I(inode)->i_disk_time[1] = inode->i_ctime;
- F2FS_I(inode)->i_disk_time[2] = inode->i_mtime;
- F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime;
-
+ init_idisk_time(inode);
#ifdef CONFIG_F2FS_CHECK_FS
f2fs_inode_chksum_set(F2FS_I_SB(inode), node_page);
#endif
@@ -699,7 +723,8 @@ retry:
cond_resched();
goto retry;
} else if (err != -ENOENT) {
- f2fs_stop_checkpoint(sbi, false);
+ f2fs_stop_checkpoint(sbi, false,
+ STOP_CP_REASON_UPDATE_INODE);
}
return;
}
diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c
index d84c5f6cc09d..3166a8939ed4 100644
--- a/fs/f2fs/iostat.c
+++ b/fs/f2fs/iostat.c
@@ -31,55 +31,65 @@ int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset)
/* print app write IOs */
seq_puts(seq, "[WRITE]\n");
- seq_printf(seq, "app buffered: %-16llu\n",
+ seq_printf(seq, "app buffered data: %-16llu\n",
sbi->rw_iostat[APP_BUFFERED_IO]);
- seq_printf(seq, "app direct: %-16llu\n",
+ seq_printf(seq, "app direct data: %-16llu\n",
sbi->rw_iostat[APP_DIRECT_IO]);
- seq_printf(seq, "app mapped: %-16llu\n",
+ seq_printf(seq, "app mapped data: %-16llu\n",
sbi->rw_iostat[APP_MAPPED_IO]);
+ seq_printf(seq, "app buffered cdata: %-16llu\n",
+ sbi->rw_iostat[APP_BUFFERED_CDATA_IO]);
+ seq_printf(seq, "app mapped cdata: %-16llu\n",
+ sbi->rw_iostat[APP_MAPPED_CDATA_IO]);
/* print fs write IOs */
- seq_printf(seq, "fs data: %-16llu\n",
+ seq_printf(seq, "fs data: %-16llu\n",
sbi->rw_iostat[FS_DATA_IO]);
- seq_printf(seq, "fs node: %-16llu\n",
+ seq_printf(seq, "fs cdata: %-16llu\n",
+ sbi->rw_iostat[FS_CDATA_IO]);
+ seq_printf(seq, "fs node: %-16llu\n",
sbi->rw_iostat[FS_NODE_IO]);
- seq_printf(seq, "fs meta: %-16llu\n",
+ seq_printf(seq, "fs meta: %-16llu\n",
sbi->rw_iostat[FS_META_IO]);
- seq_printf(seq, "fs gc data: %-16llu\n",
+ seq_printf(seq, "fs gc data: %-16llu\n",
sbi->rw_iostat[FS_GC_DATA_IO]);
- seq_printf(seq, "fs gc node: %-16llu\n",
+ seq_printf(seq, "fs gc node: %-16llu\n",
sbi->rw_iostat[FS_GC_NODE_IO]);
- seq_printf(seq, "fs cp data: %-16llu\n",
+ seq_printf(seq, "fs cp data: %-16llu\n",
sbi->rw_iostat[FS_CP_DATA_IO]);
- seq_printf(seq, "fs cp node: %-16llu\n",
+ seq_printf(seq, "fs cp node: %-16llu\n",
sbi->rw_iostat[FS_CP_NODE_IO]);
- seq_printf(seq, "fs cp meta: %-16llu\n",
+ seq_printf(seq, "fs cp meta: %-16llu\n",
sbi->rw_iostat[FS_CP_META_IO]);
/* print app read IOs */
seq_puts(seq, "[READ]\n");
- seq_printf(seq, "app buffered: %-16llu\n",
+ seq_printf(seq, "app buffered data: %-16llu\n",
sbi->rw_iostat[APP_BUFFERED_READ_IO]);
- seq_printf(seq, "app direct: %-16llu\n",
+ seq_printf(seq, "app direct data: %-16llu\n",
sbi->rw_iostat[APP_DIRECT_READ_IO]);
- seq_printf(seq, "app mapped: %-16llu\n",
+ seq_printf(seq, "app mapped data: %-16llu\n",
sbi->rw_iostat[APP_MAPPED_READ_IO]);
+ seq_printf(seq, "app buffered cdata: %-16llu\n",
+ sbi->rw_iostat[APP_BUFFERED_CDATA_READ_IO]);
+ seq_printf(seq, "app mapped cdata: %-16llu\n",
+ sbi->rw_iostat[APP_MAPPED_CDATA_READ_IO]);
/* print fs read IOs */
- seq_printf(seq, "fs data: %-16llu\n",
+ seq_printf(seq, "fs data: %-16llu\n",
sbi->rw_iostat[FS_DATA_READ_IO]);
- seq_printf(seq, "fs gc data: %-16llu\n",
+ seq_printf(seq, "fs gc data: %-16llu\n",
sbi->rw_iostat[FS_GDATA_READ_IO]);
- seq_printf(seq, "fs compr_data: %-16llu\n",
+ seq_printf(seq, "fs cdata: %-16llu\n",
sbi->rw_iostat[FS_CDATA_READ_IO]);
- seq_printf(seq, "fs node: %-16llu\n",
+ seq_printf(seq, "fs node: %-16llu\n",
sbi->rw_iostat[FS_NODE_READ_IO]);
- seq_printf(seq, "fs meta: %-16llu\n",
+ seq_printf(seq, "fs meta: %-16llu\n",
sbi->rw_iostat[FS_META_READ_IO]);
/* print other IOs */
seq_puts(seq, "[OTHER]\n");
- seq_printf(seq, "fs discard: %-16llu\n",
+ seq_printf(seq, "fs discard: %-16llu\n",
sbi->rw_iostat[FS_DISCARD]);
return 0;
@@ -159,7 +169,7 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi)
spin_unlock_irq(&sbi->iostat_lat_lock);
}
-void f2fs_update_iostat(struct f2fs_sb_info *sbi,
+void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode,
enum iostat_type type, unsigned long long io_bytes)
{
unsigned long flags;
@@ -176,6 +186,28 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi,
if (type == APP_BUFFERED_READ_IO || type == APP_DIRECT_READ_IO)
sbi->rw_iostat[APP_READ_IO] += io_bytes;
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (inode && f2fs_compressed_file(inode)) {
+ if (type == APP_BUFFERED_IO)
+ sbi->rw_iostat[APP_BUFFERED_CDATA_IO] += io_bytes;
+
+ if (type == APP_BUFFERED_READ_IO)
+ sbi->rw_iostat[APP_BUFFERED_CDATA_READ_IO] += io_bytes;
+
+ if (type == APP_MAPPED_READ_IO)
+ sbi->rw_iostat[APP_MAPPED_CDATA_READ_IO] += io_bytes;
+
+ if (type == APP_MAPPED_IO)
+ sbi->rw_iostat[APP_MAPPED_CDATA_IO] += io_bytes;
+
+ if (type == FS_DATA_READ_IO)
+ sbi->rw_iostat[FS_CDATA_READ_IO] += io_bytes;
+
+ if (type == FS_DATA_IO)
+ sbi->rw_iostat[FS_CDATA_IO] += io_bytes;
+ }
+#endif
+
spin_unlock_irqrestore(&sbi->iostat_lock, flags);
f2fs_record_iostat(sbi);
diff --git a/fs/f2fs/iostat.h b/fs/f2fs/iostat.h
index 22a2d01f57ef..2c048307b6e0 100644
--- a/fs/f2fs/iostat.h
+++ b/fs/f2fs/iostat.h
@@ -31,7 +31,7 @@ struct iostat_lat_info {
extern int __maybe_unused iostat_info_seq_show(struct seq_file *seq,
void *offset);
extern void f2fs_reset_iostat(struct f2fs_sb_info *sbi);
-extern void f2fs_update_iostat(struct f2fs_sb_info *sbi,
+extern void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode,
enum iostat_type type, unsigned long long io_bytes);
struct bio_iostat_ctx {
@@ -65,7 +65,7 @@ extern void f2fs_destroy_iostat_processing(void);
extern int f2fs_init_iostat(struct f2fs_sb_info *sbi);
extern void f2fs_destroy_iostat(struct f2fs_sb_info *sbi);
#else
-static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi,
+static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode,
enum iostat_type type, unsigned long long io_bytes) {}
static inline void iostat_update_and_unbind_ctx(struct bio *bio, int rw) {}
static inline void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi,
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index bf00d5057abb..6032589099ce 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -22,137 +22,6 @@
#include "acl.h"
#include <trace/events/f2fs.h>
-static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
- struct inode *dir, umode_t mode)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
- nid_t ino;
- struct inode *inode;
- bool nid_free = false;
- bool encrypt = false;
- int xattr_size = 0;
- int err;
-
- inode = new_inode(dir->i_sb);
- if (!inode)
- return ERR_PTR(-ENOMEM);
-
- if (!f2fs_alloc_nid(sbi, &ino)) {
- err = -ENOSPC;
- goto fail;
- }
-
- nid_free = true;
-
- inode_init_owner(mnt_userns, inode, dir, mode);
-
- inode->i_ino = ino;
- inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
- F2FS_I(inode)->i_crtime = inode->i_mtime;
- inode->i_generation = prandom_u32();
-
- if (S_ISDIR(inode->i_mode))
- F2FS_I(inode)->i_current_depth = 1;
-
- err = insert_inode_locked(inode);
- if (err) {
- err = -EINVAL;
- goto fail;
- }
-
- if (f2fs_sb_has_project_quota(sbi) &&
- (F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL))
- F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid;
- else
- F2FS_I(inode)->i_projid = make_kprojid(mnt_userns,
- F2FS_DEF_PROJID);
-
- err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
- if (err)
- goto fail_drop;
-
- err = f2fs_dquot_initialize(inode);
- if (err)
- goto fail_drop;
-
- set_inode_flag(inode, FI_NEW_INODE);
-
- if (encrypt)
- f2fs_set_encrypted_inode(inode);
-
- if (f2fs_sb_has_extra_attr(sbi)) {
- set_inode_flag(inode, FI_EXTRA_ATTR);
- F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE;
- }
-
- if (test_opt(sbi, INLINE_XATTR))
- set_inode_flag(inode, FI_INLINE_XATTR);
-
- if (f2fs_may_inline_dentry(inode))
- set_inode_flag(inode, FI_INLINE_DENTRY);
-
- if (f2fs_sb_has_flexible_inline_xattr(sbi)) {
- f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode));
- if (f2fs_has_inline_xattr(inode))
- xattr_size = F2FS_OPTION(sbi).inline_xattr_size;
- /* Otherwise, will be 0 */
- } else if (f2fs_has_inline_xattr(inode) ||
- f2fs_has_inline_dentry(inode)) {
- xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
- }
- F2FS_I(inode)->i_inline_xattr_size = xattr_size;
-
- f2fs_init_extent_tree(inode, NULL);
-
- F2FS_I(inode)->i_flags =
- f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED);
-
- if (S_ISDIR(inode->i_mode))
- F2FS_I(inode)->i_flags |= F2FS_INDEX_FL;
-
- if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL)
- set_inode_flag(inode, FI_PROJ_INHERIT);
-
- if (f2fs_sb_has_compression(sbi)) {
- /* Inherit the compression flag in directory */
- if ((F2FS_I(dir)->i_flags & F2FS_COMPR_FL) &&
- f2fs_may_compress(inode))
- set_compress_context(inode);
- }
-
- /* Should enable inline_data after compression set */
- if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
- set_inode_flag(inode, FI_INLINE_DATA);
-
- stat_inc_inline_xattr(inode);
- stat_inc_inline_inode(inode);
- stat_inc_inline_dir(inode);
-
- f2fs_set_inode_flags(inode);
-
- trace_f2fs_new_inode(inode, 0);
- return inode;
-
-fail:
- trace_f2fs_new_inode(inode, err);
- make_bad_inode(inode);
- if (nid_free)
- set_inode_flag(inode, FI_FREE_NID);
- iput(inode);
- return ERR_PTR(err);
-fail_drop:
- trace_f2fs_new_inode(inode, err);
- dquot_drop(inode);
- inode->i_flags |= S_NOQUOTA;
- if (nid_free)
- set_inode_flag(inode, FI_FREE_NID);
- clear_nlink(inode);
- unlock_new_inode(inode);
- iput(inode);
- return ERR_PTR(err);
-}
-
static inline int is_extension_exist(const unsigned char *s, const char *sub,
bool tmp_ext)
{
@@ -187,36 +56,6 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub,
return 0;
}
-/*
- * Set file's temperature for hot/cold data separation
- */
-static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode,
- const unsigned char *name)
-{
- __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
- int i, cold_count, hot_count;
-
- f2fs_down_read(&sbi->sb_lock);
-
- cold_count = le32_to_cpu(sbi->raw_super->extension_count);
- hot_count = sbi->raw_super->hot_ext_count;
-
- for (i = 0; i < cold_count + hot_count; i++) {
- if (is_extension_exist(name, extlist[i], true))
- break;
- }
-
- f2fs_up_read(&sbi->sb_lock);
-
- if (i == cold_count + hot_count)
- return;
-
- if (i < cold_count)
- file_set_cold(inode);
- else
- file_set_hot(inode);
-}
-
int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name,
bool hot, bool set)
{
@@ -283,56 +122,215 @@ int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name,
return 0;
}
-static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
- const unsigned char *name)
+static void set_compress_new_inode(struct f2fs_sb_info *sbi, struct inode *dir,
+ struct inode *inode, const unsigned char *name)
{
__u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
- unsigned char (*noext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).noextensions;
+ unsigned char (*noext)[F2FS_EXTENSION_LEN] =
+ F2FS_OPTION(sbi).noextensions;
unsigned char (*ext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).extensions;
unsigned char ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
unsigned char noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt;
int i, cold_count, hot_count;
- if (!f2fs_sb_has_compression(sbi) ||
- F2FS_I(inode)->i_flags & F2FS_NOCOMP_FL ||
- !f2fs_may_compress(inode) ||
- (!ext_cnt && !noext_cnt))
+ if (!f2fs_sb_has_compression(sbi))
return;
- f2fs_down_read(&sbi->sb_lock);
+ if (S_ISDIR(inode->i_mode))
+ goto inherit_comp;
+
+ /* This name comes only from normal files. */
+ if (!name)
+ return;
+ /* Don't compress hot files. */
+ f2fs_down_read(&sbi->sb_lock);
cold_count = le32_to_cpu(sbi->raw_super->extension_count);
hot_count = sbi->raw_super->hot_ext_count;
+ for (i = cold_count; i < cold_count + hot_count; i++)
+ if (is_extension_exist(name, extlist[i], false))
+ break;
+ f2fs_up_read(&sbi->sb_lock);
+ if (i < (cold_count + hot_count))
+ return;
- for (i = cold_count; i < cold_count + hot_count; i++) {
- if (is_extension_exist(name, extlist[i], false)) {
- f2fs_up_read(&sbi->sb_lock);
+ /* Don't compress unallowed extension. */
+ for (i = 0; i < noext_cnt; i++)
+ if (is_extension_exist(name, noext[i], false))
+ return;
+
+ /* Compress wanting extension. */
+ for (i = 0; i < ext_cnt; i++) {
+ if (is_extension_exist(name, ext[i], false)) {
+ set_compress_context(inode);
return;
}
}
+inherit_comp:
+ /* Inherit the {no-}compression flag in directory */
+ if (F2FS_I(dir)->i_flags & F2FS_NOCOMP_FL) {
+ F2FS_I(inode)->i_flags |= F2FS_NOCOMP_FL;
+ f2fs_mark_inode_dirty_sync(inode, true);
+ } else if (F2FS_I(dir)->i_flags & F2FS_COMPR_FL) {
+ set_compress_context(inode);
+ }
+}
+
+/*
+ * Set file's temperature for hot/cold data separation
+ */
+static void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode,
+ const unsigned char *name)
+{
+ __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
+ int i, cold_count, hot_count;
+ f2fs_down_read(&sbi->sb_lock);
+ cold_count = le32_to_cpu(sbi->raw_super->extension_count);
+ hot_count = sbi->raw_super->hot_ext_count;
+ for (i = 0; i < cold_count + hot_count; i++)
+ if (is_extension_exist(name, extlist[i], true))
+ break;
f2fs_up_read(&sbi->sb_lock);
- for (i = 0; i < noext_cnt; i++) {
- if (is_extension_exist(name, noext[i], false)) {
- f2fs_disable_compressed_file(inode);
- return;
- }
+ if (i == cold_count + hot_count)
+ return;
+
+ if (i < cold_count)
+ file_set_cold(inode);
+ else
+ file_set_hot(inode);
+}
+
+static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
+ struct inode *dir, umode_t mode,
+ const char *name)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+ nid_t ino;
+ struct inode *inode;
+ bool nid_free = false;
+ bool encrypt = false;
+ int xattr_size = 0;
+ int err;
+
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ if (!f2fs_alloc_nid(sbi, &ino)) {
+ err = -ENOSPC;
+ goto fail;
}
- if (is_inode_flag_set(inode, FI_COMPRESSED_FILE))
- return;
+ nid_free = true;
- for (i = 0; i < ext_cnt; i++) {
- if (!is_extension_exist(name, ext[i], false))
- continue;
+ inode_init_owner(mnt_userns, inode, dir, mode);
- /* Do not use inline_data with compression */
- stat_dec_inline_inode(inode);
- clear_inode_flag(inode, FI_INLINE_DATA);
- set_compress_context(inode);
- return;
+ inode->i_ino = ino;
+ inode->i_blocks = 0;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ F2FS_I(inode)->i_crtime = inode->i_mtime;
+ inode->i_generation = get_random_u32();
+
+ if (S_ISDIR(inode->i_mode))
+ F2FS_I(inode)->i_current_depth = 1;
+
+ err = insert_inode_locked(inode);
+ if (err) {
+ err = -EINVAL;
+ goto fail;
+ }
+
+ if (f2fs_sb_has_project_quota(sbi) &&
+ (F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL))
+ F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid;
+ else
+ F2FS_I(inode)->i_projid = make_kprojid(mnt_userns,
+ F2FS_DEF_PROJID);
+
+ err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
+ if (err)
+ goto fail_drop;
+
+ err = f2fs_dquot_initialize(inode);
+ if (err)
+ goto fail_drop;
+
+ set_inode_flag(inode, FI_NEW_INODE);
+
+ if (encrypt)
+ f2fs_set_encrypted_inode(inode);
+
+ if (f2fs_sb_has_extra_attr(sbi)) {
+ set_inode_flag(inode, FI_EXTRA_ATTR);
+ F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE;
+ }
+
+ if (test_opt(sbi, INLINE_XATTR))
+ set_inode_flag(inode, FI_INLINE_XATTR);
+
+ if (f2fs_may_inline_dentry(inode))
+ set_inode_flag(inode, FI_INLINE_DENTRY);
+
+ if (f2fs_sb_has_flexible_inline_xattr(sbi)) {
+ f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode));
+ if (f2fs_has_inline_xattr(inode))
+ xattr_size = F2FS_OPTION(sbi).inline_xattr_size;
+ /* Otherwise, will be 0 */
+ } else if (f2fs_has_inline_xattr(inode) ||
+ f2fs_has_inline_dentry(inode)) {
+ xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
}
+ F2FS_I(inode)->i_inline_xattr_size = xattr_size;
+
+ F2FS_I(inode)->i_flags =
+ f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED);
+
+ if (S_ISDIR(inode->i_mode))
+ F2FS_I(inode)->i_flags |= F2FS_INDEX_FL;
+
+ if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL)
+ set_inode_flag(inode, FI_PROJ_INHERIT);
+
+ /* Check compression first. */
+ set_compress_new_inode(sbi, dir, inode, name);
+
+ /* Should enable inline_data after compression set */
+ if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
+ set_inode_flag(inode, FI_INLINE_DATA);
+
+ if (name && !test_opt(sbi, DISABLE_EXT_IDENTIFY))
+ set_file_temperature(sbi, inode, name);
+
+ stat_inc_inline_xattr(inode);
+ stat_inc_inline_inode(inode);
+ stat_inc_inline_dir(inode);
+
+ f2fs_set_inode_flags(inode);
+
+ f2fs_init_extent_tree(inode);
+
+ trace_f2fs_new_inode(inode, 0);
+ return inode;
+
+fail:
+ trace_f2fs_new_inode(inode, err);
+ make_bad_inode(inode);
+ if (nid_free)
+ set_inode_flag(inode, FI_FREE_NID);
+ iput(inode);
+ return ERR_PTR(err);
+fail_drop:
+ trace_f2fs_new_inode(inode, err);
+ dquot_drop(inode);
+ inode->i_flags |= S_NOQUOTA;
+ if (nid_free)
+ set_inode_flag(inode, FI_FREE_NID);
+ clear_nlink(inode);
+ unlock_new_inode(inode);
+ iput(inode);
+ return ERR_PTR(err);
}
static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir,
@@ -352,15 +350,10 @@ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(mnt_userns, dir, mode);
+ inode = f2fs_new_inode(mnt_userns, dir, mode, dentry->d_name.name);
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (!test_opt(sbi, DISABLE_EXT_IDENTIFY))
- set_file_temperature(sbi, inode, dentry->d_name.name);
-
- set_compress_inode(sbi, inode, dentry->d_name.name);
-
inode->i_op = &f2fs_file_inode_operations;
inode->i_fop = &f2fs_file_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
@@ -632,6 +625,8 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
goto fail;
}
f2fs_delete_entry(de, page, dir, inode);
+ f2fs_unlock_op(sbi);
+
#if IS_ENABLED(CONFIG_UNICODE)
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
@@ -642,8 +637,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
if (IS_CASEFOLDED(dir))
d_invalidate(dentry);
#endif
- f2fs_unlock_op(sbi);
-
if (IS_DIRSYNC(dir))
f2fs_sync_fs(sbi->sb, 1);
fail:
@@ -689,7 +682,7 @@ static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO);
+ inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO, NULL);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -760,7 +753,7 @@ static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode);
+ inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode, NULL);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -817,7 +810,7 @@ static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(mnt_userns, dir, mode);
+ inode = f2fs_new_inode(mnt_userns, dir, mode, NULL);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -845,7 +838,7 @@ out:
}
static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode, bool is_whiteout,
+ struct file *file, umode_t mode, bool is_whiteout,
struct inode **new_inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
@@ -856,7 +849,7 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(mnt_userns, dir, mode);
+ inode = f2fs_new_inode(mnt_userns, dir, mode, NULL);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -892,8 +885,8 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
inode->i_state |= I_LINKABLE;
spin_unlock(&inode->i_lock);
} else {
- if (dentry)
- d_tmpfile(dentry, inode);
+ if (file)
+ d_tmpfile(file, inode);
else
f2fs_i_links_write(inode, false);
}
@@ -915,16 +908,19 @@ out:
}
static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+ struct file *file, umode_t mode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+ int err;
if (unlikely(f2fs_cp_error(sbi)))
return -EIO;
if (!f2fs_is_checkpoint_ready(sbi))
return -ENOSPC;
- return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, false, NULL);
+ err = __f2fs_tmpfile(mnt_userns, dir, file, mode, false, NULL);
+
+ return finish_open_simple(file, err);
}
static int f2fs_create_whiteout(struct user_namespace *mnt_userns,
@@ -1376,7 +1372,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
.tmpfile = f2fs_tmpfile,
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
- .get_acl = f2fs_get_acl,
+ .get_inode_acl = f2fs_get_acl,
.set_acl = f2fs_set_acl,
.listxattr = f2fs_listxattr,
.fiemap = f2fs_fiemap,
@@ -1394,7 +1390,7 @@ const struct inode_operations f2fs_symlink_inode_operations = {
const struct inode_operations f2fs_special_inode_operations = {
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
- .get_acl = f2fs_get_acl,
+ .get_inode_acl = f2fs_get_acl,
.set_acl = f2fs_set_acl,
.listxattr = f2fs_listxattr,
};
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index e06a0c478b39..dde4c0458704 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -36,6 +36,7 @@ int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: out-of-range nid=%x, run fsck to fix.",
__func__, nid);
+ f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE);
return -EFSCORRUPTED;
}
return 0;
@@ -59,7 +60,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
avail_ram = val.totalram - val.totalhigh;
/*
- * give 25%, 25%, 50%, 50%, 50% memory for each components respectively
+ * give 25%, 25%, 50%, 50%, 25%, 25% memory for each components respectively
*/
if (type == FREE_NIDS) {
mem_size = (nm_i->nid_cnt[FREE_NID] *
@@ -84,12 +85,16 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
sizeof(struct ino_entry);
mem_size >>= PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
- } else if (type == EXTENT_CACHE) {
- mem_size = (atomic_read(&sbi->total_ext_tree) *
+ } else if (type == READ_EXTENT_CACHE || type == AGE_EXTENT_CACHE) {
+ enum extent_type etype = type == READ_EXTENT_CACHE ?
+ EX_READ : EX_BLOCK_AGE;
+ struct extent_tree_info *eti = &sbi->extent_tree[etype];
+
+ mem_size = (atomic_read(&eti->total_ext_tree) *
sizeof(struct extent_tree) +
- atomic_read(&sbi->total_ext_node) *
+ atomic_read(&eti->total_ext_node) *
sizeof(struct extent_node)) >> PAGE_SHIFT;
- res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
+ res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
} else if (type == DISCARD_CACHE) {
mem_size = (atomic_read(&dcc->discard_cmd_cnt) *
sizeof(struct discard_cmd)) >> PAGE_SHIFT;
@@ -585,7 +590,7 @@ retry:
ne = nat_in_journal(journal, i);
node_info_from_raw_nat(ni, &ne);
}
- up_read(&curseg->journal_rwsem);
+ up_read(&curseg->journal_rwsem);
if (i >= 0) {
f2fs_up_read(&nm_i->nat_tree_lock);
goto cache;
@@ -858,7 +863,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
blkaddr = data_blkaddr(dn->inode, dn->node_page,
dn->ofs_in_node + 1);
- f2fs_update_extent_tree_range_compressed(dn->inode,
+ f2fs_update_read_extent_tree_range_compressed(dn->inode,
index, blkaddr,
F2FS_I(dn->inode)->i_cluster_size,
c_len);
@@ -1295,6 +1300,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs)
if (unlikely(new_ni.blk_addr != NULL_ADDR)) {
err = -EFSCORRUPTED;
set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto fail;
}
#endif
@@ -1358,8 +1364,7 @@ static int read_node_page(struct page *page, blk_opf_t op_flags)
return err;
/* NEW_ADDR can be seen, after cp_error drops some dirty node pages */
- if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR) ||
- is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) {
+ if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR)) {
ClearPageUptodate(page);
return -ENOENT;
}
@@ -1369,7 +1374,7 @@ static int read_node_page(struct page *page, blk_opf_t op_flags)
err = f2fs_submit_page_bio(&fio);
if (!err)
- f2fs_update_iostat(sbi, FS_NODE_READ_IO, F2FS_BLKSIZE);
+ f2fs_update_iostat(sbi, NULL, FS_NODE_READ_IO, F2FS_BLKSIZE);
return err;
}
@@ -2147,8 +2152,7 @@ static bool f2fs_dirty_node_folio(struct address_space *mapping,
if (IS_INODE(&folio->page))
f2fs_inode_chksum_set(F2FS_M_SB(mapping), &folio->page);
#endif
- if (!folio_test_dirty(folio)) {
- filemap_dirty_folio(mapping, folio);
+ if (filemap_dirty_folio(mapping, folio)) {
inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
set_page_private_reference(&folio->page);
return true;
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 3c09cae058b0..99454d46a939 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -146,7 +146,8 @@ enum mem_type {
NAT_ENTRIES, /* indicates the cached nat entry */
DIRTY_DENTS, /* indicates dirty dentry pages */
INO_ENTRIES, /* indicates inode entries */
- EXTENT_CACHE, /* indicates extent cache */
+ READ_EXTENT_CACHE, /* indicates read extent cache */
+ AGE_EXTENT_CACHE, /* indicates age extent cache */
DISCARD_CACHE, /* indicates memory of cached discard cmds */
COMPRESS_PAGE, /* indicates memory of cached compressed pages */
BASE_CHECK, /* check kernel status */
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index dcd0a1e35095..77fd453949b1 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -474,7 +474,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
struct dnode_of_data tdn = *dn;
nid_t ino, nid;
struct inode *inode;
- unsigned int offset;
+ unsigned int offset, ofs_in_node, max_addrs;
block_t bidx;
int i;
@@ -501,15 +501,25 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
got_it:
/* Use the locked dnode page and inode */
nid = le32_to_cpu(sum.nid);
+ ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+
+ max_addrs = ADDRS_PER_PAGE(dn->node_page, dn->inode);
+ if (ofs_in_node >= max_addrs) {
+ f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u",
+ ofs_in_node, dn->inode->i_ino, nid, max_addrs);
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUMMARY);
+ return -EFSCORRUPTED;
+ }
+
if (dn->inode->i_ino == nid) {
tdn.nid = nid;
if (!dn->inode_page_locked)
lock_page(dn->inode_page);
tdn.node_page = dn->inode_page;
- tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+ tdn.ofs_in_node = ofs_in_node;
goto truncate_out;
} else if (dn->nid == nid) {
- tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+ tdn.ofs_in_node = ofs_in_node;
goto truncate_out;
}
@@ -628,6 +638,7 @@ retry_dn:
inode->i_ino, ofs_of_node(dn.node_page),
ofs_of_node(page));
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER);
goto err;
}
@@ -640,12 +651,14 @@ retry_dn:
if (__is_valid_data_blkaddr(src) &&
!f2fs_is_valid_blkaddr(sbi, src, META_POR)) {
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto err;
}
if (__is_valid_data_blkaddr(dest) &&
!f2fs_is_valid_blkaddr(sbi, dest, META_POR)) {
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto err;
}
@@ -698,6 +711,16 @@ retry_prev:
goto err;
}
+ if (f2fs_is_valid_blkaddr(sbi, dest,
+ DATA_GENERIC_ENHANCE_UPDATE)) {
+ f2fs_err(sbi, "Inconsistent dest blkaddr:%u, ino:%lu, ofs:%u",
+ dest, inode->i_ino, dn.ofs_in_node);
+ err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi,
+ ERROR_INVALID_BLKADDR);
+ goto err;
+ }
+
/* write dummy data page */
f2fs_replace_block(sbi, &dn, src, dest,
ni.version, false, false);
@@ -900,9 +923,7 @@ int __init f2fs_create_recovery_cache(void)
{
fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
sizeof(struct fsync_inode_entry));
- if (!fsync_entry_slab)
- return -ENOMEM;
- return 0;
+ return fsync_entry_slab ? 0 : -ENOMEM;
}
void f2fs_destroy_recovery_cache(void)
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 0de21f82d7bc..25ddea478fc1 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -187,23 +187,24 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi)
void f2fs_abort_atomic_write(struct inode *inode, bool clean)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
if (!f2fs_is_atomic_file(inode))
return;
- if (clean)
- truncate_inode_pages_final(inode->i_mapping);
clear_inode_flag(fi->cow_inode, FI_COW_FILE);
iput(fi->cow_inode);
fi->cow_inode = NULL;
release_atomic_write_cnt(inode);
+ clear_inode_flag(inode, FI_ATOMIC_COMMITTED);
+ clear_inode_flag(inode, FI_ATOMIC_REPLACE);
clear_inode_flag(inode, FI_ATOMIC_FILE);
+ stat_dec_atomic_inode(inode);
- spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
- sbi->atomic_files--;
- spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
+ if (clean) {
+ truncate_inode_pages_final(inode->i_mapping);
+ f2fs_i_size_write(inode, fi->original_i_size);
+ }
}
static int __replace_atomic_write_block(struct inode *inode, pgoff_t index,
@@ -261,14 +262,19 @@ static void __complete_revoke_list(struct inode *inode, struct list_head *head,
bool revoke)
{
struct revoke_entry *cur, *tmp;
+ bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE);
list_for_each_entry_safe(cur, tmp, head, list) {
if (revoke)
__replace_atomic_write_block(inode, cur->index,
cur->old_addr, NULL, true);
+
list_del(&cur->list);
kmem_cache_free(revoke_entry_slab, cur);
}
+
+ if (!revoke && truncate)
+ f2fs_do_truncate_blocks(inode, 0, false);
}
static int __f2fs_commit_atomic_write(struct inode *inode)
@@ -312,6 +318,8 @@ static int __f2fs_commit_atomic_write(struct inode *inode)
DATA_GENERIC_ENHANCE)) {
f2fs_put_dnode(&dn);
ret = -EFSCORRUPTED;
+ f2fs_handle_error(sbi,
+ ERROR_INVALID_BLKADDR);
goto out;
}
@@ -337,10 +345,12 @@ next:
}
out:
- if (ret)
+ if (ret) {
sbi->revoked_atomic_block += fi->atomic_write_cnt;
- else
+ } else {
sbi->committed_atomic_block += fi->atomic_write_cnt;
+ set_inode_flag(inode, FI_ATOMIC_COMMITTED);
+ }
__complete_revoke_list(inode, &revoke_list, ret ? true : false);
@@ -376,7 +386,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
{
if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
f2fs_show_injection_info(sbi, FAULT_CHECKPOINT);
- f2fs_stop_checkpoint(sbi, false);
+ f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT);
}
/* balance_fs_bg is able to be pending */
@@ -439,8 +449,14 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg)
return;
/* try to shrink extent cache when there is no enough memory */
- if (!f2fs_available_free_memory(sbi, EXTENT_CACHE))
- f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER);
+ if (!f2fs_available_free_memory(sbi, READ_EXTENT_CACHE))
+ f2fs_shrink_read_extent_tree(sbi,
+ READ_EXTENT_CACHE_SHRINK_NUMBER);
+
+ /* try to shrink age extent cache when there is no enough memory */
+ if (!f2fs_available_free_memory(sbi, AGE_EXTENT_CACHE))
+ f2fs_shrink_age_extent_tree(sbi,
+ AGE_EXTENT_CACHE_SHRINK_NUMBER);
/* check the # of cached NAT entries */
if (!f2fs_available_free_memory(sbi, NAT_ENTRIES))
@@ -476,12 +492,12 @@ do_sync:
mutex_lock(&sbi->flush_lock);
blk_start_plug(&plug);
- f2fs_sync_dirty_inodes(sbi, FILE_INODE);
+ f2fs_sync_dirty_inodes(sbi, FILE_INODE, false);
blk_finish_plug(&plug);
mutex_unlock(&sbi->flush_lock);
}
- f2fs_sync_fs(sbi->sb, true);
+ f2fs_sync_fs(sbi->sb, 1);
stat_inc_bg_cp_count(sbi->stat_info);
}
@@ -622,12 +638,11 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi)
{
dev_t dev = sbi->sb->s_bdev->bd_dev;
struct flush_cmd_control *fcc;
- int err = 0;
if (SM_I(sbi)->fcc_info) {
fcc = SM_I(sbi)->fcc_info;
if (fcc->f2fs_issue_flush)
- return err;
+ return 0;
goto init_thread;
}
@@ -640,19 +655,20 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi)
init_llist_head(&fcc->issue_list);
SM_I(sbi)->fcc_info = fcc;
if (!test_opt(sbi, FLUSH_MERGE))
- return err;
+ return 0;
init_thread:
fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi,
"f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(fcc->f2fs_issue_flush)) {
- err = PTR_ERR(fcc->f2fs_issue_flush);
+ int err = PTR_ERR(fcc->f2fs_issue_flush);
+
kfree(fcc);
SM_I(sbi)->fcc_info = NULL;
return err;
}
- return err;
+ return 0;
}
void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free)
@@ -694,7 +710,8 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi)
} while (ret && --count);
if (ret) {
- f2fs_stop_checkpoint(sbi, false);
+ f2fs_stop_checkpoint(sbi, false,
+ STOP_CP_REASON_FLUSH_FAIL);
break;
}
@@ -857,7 +874,7 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi)
}
mutex_unlock(&dirty_i->seglist_lock);
- unusable = holes[DATA] > holes[NODE] ? holes[DATA] : holes[NODE];
+ unusable = max(holes[DATA], holes[NODE]);
if (unusable > ovp_holes)
return unusable - ovp_holes;
return 0;
@@ -1053,8 +1070,8 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
dpolicy->io_aware = true;
dpolicy->sync = false;
dpolicy->ordered = true;
- if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) {
- dpolicy->granularity = 1;
+ if (utilization(sbi) > dcc->discard_urgent_util) {
+ dpolicy->granularity = MIN_DISCARD_GRANULARITY;
if (atomic_read(&dcc->discard_cmd_cnt))
dpolicy->max_interval =
dcc->min_discard_issue_time;
@@ -1069,7 +1086,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
} else if (discard_type == DPOLICY_UMOUNT) {
dpolicy->io_aware = false;
/* we need to issue all to keep CP_TRIMMED_FLAG */
- dpolicy->granularity = 1;
+ dpolicy->granularity = MIN_DISCARD_GRANULARITY;
dpolicy->timeout = true;
}
}
@@ -1127,13 +1144,12 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
if (time_to_inject(sbi, FAULT_DISCARD)) {
f2fs_show_injection_info(sbi, FAULT_DISCARD);
err = -EIO;
- goto submit;
- }
- err = __blkdev_issue_discard(bdev,
+ } else {
+ err = __blkdev_issue_discard(bdev,
SECTOR_FROM_BLOCK(start),
SECTOR_FROM_BLOCK(len),
GFP_NOFS, &bio);
-submit:
+ }
if (err) {
spin_lock_irqsave(&dc->lock, flags);
if (dc->state == D_PARTIAL)
@@ -1171,7 +1187,7 @@ submit:
atomic_inc(&dcc->issued_discard);
- f2fs_update_iostat(sbi, FS_DISCARD, 1);
+ f2fs_update_iostat(sbi, NULL, FS_DISCARD, len * F2FS_BLKSIZE);
lstart += len;
start += len;
@@ -1343,13 +1359,13 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi,
}
}
-static int __queue_discard_cmd(struct f2fs_sb_info *sbi,
+static void __queue_discard_cmd(struct f2fs_sb_info *sbi,
struct block_device *bdev, block_t blkstart, block_t blklen)
{
block_t lblkstart = blkstart;
if (!f2fs_bdev_support_discard(bdev))
- return 0;
+ return;
trace_f2fs_queue_discard(bdev, blkstart, blklen);
@@ -1361,7 +1377,6 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi,
mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock);
__update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen);
mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock);
- return 0;
}
static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi,
@@ -1449,7 +1464,7 @@ retry:
if (i + 1 < dpolicy->granularity)
break;
- if (i < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered)
+ if (i + 1 < dcc->max_ordered_discard && dpolicy->ordered)
return __issue_discard_cmd_orderly(sbi, dpolicy);
pend_list = &dcc->pend_list[i];
@@ -1646,6 +1661,9 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi)
struct discard_policy dpolicy;
bool dropped;
+ if (!atomic_read(&dcc->discard_cmd_cnt))
+ return false;
+
__init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT,
dcc->discard_granularity);
__issue_discard_cmd(sbi, &dpolicy);
@@ -1670,6 +1688,11 @@ static int issue_discard_thread(void *data)
set_freezable();
do {
+ wait_event_interruptible_timeout(*q,
+ kthread_should_stop() || freezing(current) ||
+ dcc->discard_wake,
+ msecs_to_jiffies(wait_ms));
+
if (sbi->gc_mode == GC_URGENT_HIGH ||
!f2fs_available_free_memory(sbi, DISCARD_CACHE))
__init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1);
@@ -1677,14 +1700,6 @@ static int issue_discard_thread(void *data)
__init_discard_policy(sbi, &dpolicy, DPOLICY_BG,
dcc->discard_granularity);
- if (!atomic_read(&dcc->discard_cmd_cnt))
- wait_ms = dpolicy.max_interval;
-
- wait_event_interruptible_timeout(*q,
- kthread_should_stop() || freezing(current) ||
- dcc->discard_wake,
- msecs_to_jiffies(wait_ms));
-
if (dcc->discard_wake)
dcc->discard_wake = 0;
@@ -1698,12 +1713,11 @@ static int issue_discard_thread(void *data)
continue;
if (kthread_should_stop())
return 0;
- if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) {
+ if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) ||
+ !atomic_read(&dcc->discard_cmd_cnt)) {
wait_ms = dpolicy.max_interval;
continue;
}
- if (!atomic_read(&dcc->discard_cmd_cnt))
- continue;
sb_start_intwrite(sbi->sb);
@@ -1718,6 +1732,8 @@ static int issue_discard_thread(void *data)
} else {
wait_ms = dpolicy.max_interval;
}
+ if (!atomic_read(&dcc->discard_cmd_cnt))
+ wait_ms = dpolicy.max_interval;
sb_end_intwrite(sbi->sb);
@@ -1761,7 +1777,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
}
/* For conventional zones, use regular discard if supported */
- return __queue_discard_cmd(sbi, bdev, lblkstart, blklen);
+ __queue_discard_cmd(sbi, bdev, lblkstart, blklen);
+ return 0;
}
#endif
@@ -1772,7 +1789,8 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi,
if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev))
return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen);
#endif
- return __queue_discard_cmd(sbi, bdev, blkstart, blklen);
+ __queue_discard_cmd(sbi, bdev, blkstart, blklen);
+ return 0;
}
static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
@@ -2026,8 +2044,10 @@ int f2fs_start_discard_thread(struct f2fs_sb_info *sbi)
dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi,
"f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev));
- if (IS_ERR(dcc->f2fs_issue_discard))
+ if (IS_ERR(dcc->f2fs_issue_discard)) {
err = PTR_ERR(dcc->f2fs_issue_discard);
+ dcc->f2fs_issue_discard = NULL;
+ }
return err;
}
@@ -2047,6 +2067,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
return -ENOMEM;
dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY;
+ dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY;
if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT)
dcc->discard_granularity = sbi->blocks_per_seg;
else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION)
@@ -2067,6 +2088,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME;
dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME;
dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME;
+ dcc->discard_urgent_util = DEF_DISCARD_URGENT_UTIL;
dcc->undiscard_blks = 0;
dcc->next_pos = 0;
dcc->root = RB_ROOT_CACHED;
@@ -2097,8 +2119,7 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi)
* Recovery can cache discard commands, so in error path of
* fill_super(), it needs to give a chance to handle them.
*/
- if (unlikely(atomic_read(&dcc->discard_cmd_cnt)))
- f2fs_issue_discard_timeout(sbi);
+ f2fs_issue_discard_timeout(sbi);
kfree(dcc);
SM_I(sbi)->dcc_info = NULL;
@@ -2535,7 +2556,7 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
sanity_check_seg_type(sbi, seg_type);
if (f2fs_need_rand_seg(sbi))
- return prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec);
+ return get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec);
/* if segs_per_sec is large than 1, we need to keep original policy. */
if (__is_large_section(sbi))
@@ -2589,7 +2610,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
curseg->alloc_type = LFS;
if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK)
curseg->fragment_remained_chunk =
- prandom_u32() % sbi->max_fragment_chunk + 1;
+ get_random_u32_inclusive(1, sbi->max_fragment_chunk);
}
static int __next_free_blkoff(struct f2fs_sb_info *sbi,
@@ -2626,9 +2647,9 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
/* To allocate block chunks in different sizes, use random number */
if (--seg->fragment_remained_chunk <= 0) {
seg->fragment_remained_chunk =
- prandom_u32() % sbi->max_fragment_chunk + 1;
+ get_random_u32_inclusive(1, sbi->max_fragment_chunk);
seg->next_blkoff +=
- prandom_u32() % sbi->max_fragment_hole + 1;
+ get_random_u32_inclusive(1, sbi->max_fragment_hole);
}
}
}
@@ -2643,7 +2664,7 @@ bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno)
* This function always allocates a used segment(from dirty seglist) by SSR
* manner, so it should recover the existing segment information of valid blocks
*/
-static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush)
+static void change_curseg(struct f2fs_sb_info *sbi, int type)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -2651,9 +2672,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush)
struct f2fs_summary_block *sum_node;
struct page *sum_page;
- if (flush)
- write_sum_page(sbi, curseg->sum_blk,
- GET_SUM_BLOCK(sbi, curseg->segno));
+ write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno));
__set_test_and_inuse(sbi, new_segno);
@@ -2692,7 +2711,7 @@ static void get_atssr_segment(struct f2fs_sb_info *sbi, int type,
struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno);
curseg->seg_type = se->type;
- change_curseg(sbi, type, true);
+ change_curseg(sbi, type);
} else {
/* allocate cold segment by default */
curseg->seg_type = CURSEG_COLD_DATA;
@@ -2836,31 +2855,20 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type,
return 0;
}
-/*
- * flush out current segment and replace it with new segment
- * This function should be returned with success, otherwise BUG
- */
-static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
- int type, bool force)
+static bool need_new_seg(struct f2fs_sb_info *sbi, int type)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
- if (force)
- new_curseg(sbi, type, true);
- else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) &&
- curseg->seg_type == CURSEG_WARM_NODE)
- new_curseg(sbi, type, false);
- else if (curseg->alloc_type == LFS &&
- is_next_segment_free(sbi, curseg, type) &&
- likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
- new_curseg(sbi, type, false);
- else if (f2fs_need_SSR(sbi) &&
- get_ssr_segment(sbi, type, SSR, 0))
- change_curseg(sbi, type, true);
- else
- new_curseg(sbi, type, false);
-
- stat_inc_seg_type(sbi, curseg);
+ if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) &&
+ curseg->seg_type == CURSEG_WARM_NODE)
+ return true;
+ if (curseg->alloc_type == LFS &&
+ is_next_segment_free(sbi, curseg, type) &&
+ likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
+ return true;
+ if (!f2fs_need_SSR(sbi) || !get_ssr_segment(sbi, type, SSR, 0))
+ return true;
+ return false;
}
void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
@@ -2878,7 +2886,7 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
goto unlock;
if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0))
- change_curseg(sbi, type, true);
+ change_curseg(sbi, type);
else
new_curseg(sbi, type, true);
@@ -2913,7 +2921,8 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type,
return;
alloc:
old_segno = curseg->segno;
- SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true);
+ new_curseg(sbi, type, true);
+ stat_inc_seg_type(sbi, curseg);
locate_dirty_segment(sbi, old_segno);
}
@@ -2944,10 +2953,6 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
-static const struct segment_allocation default_salloc_ops = {
- .allocate_segment = allocate_segment_by_default,
-};
-
bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
struct cp_control *cpc)
{
@@ -3153,10 +3158,28 @@ static int __get_segment_type_4(struct f2fs_io_info *fio)
}
}
+static int __get_age_segment_type(struct inode *inode, pgoff_t pgofs)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_info ei;
+
+ if (f2fs_lookup_age_extent_cache(inode, pgofs, &ei)) {
+ if (!ei.age)
+ return NO_CHECK_TYPE;
+ if (ei.age <= sbi->hot_data_age_threshold)
+ return CURSEG_HOT_DATA;
+ if (ei.age <= sbi->warm_data_age_threshold)
+ return CURSEG_WARM_DATA;
+ return CURSEG_COLD_DATA;
+ }
+ return NO_CHECK_TYPE;
+}
+
static int __get_segment_type_6(struct f2fs_io_info *fio)
{
if (fio->type == DATA) {
struct inode *inode = fio->page->mapping->host;
+ int type;
if (is_inode_flag_set(inode, FI_ALIGNED_WRITE))
return CURSEG_COLD_DATA_PINNED;
@@ -3171,6 +3194,11 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
}
if (file_is_cold(inode) || f2fs_need_compress_data(inode))
return CURSEG_COLD_DATA;
+
+ type = __get_age_segment_type(inode, fio->page->index);
+ if (type != NO_CHECK_TYPE)
+ return type;
+
if (file_is_hot(inode) ||
is_inode_flag_set(inode, FI_HOT_DATA) ||
f2fs_is_cow_file(inode))
@@ -3267,11 +3295,19 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
update_sit_entry(sbi, old_blkaddr, -1);
if (!__has_curseg_space(sbi, curseg)) {
- if (from_gc)
+ /*
+ * Flush out current segment and replace it with new segment.
+ */
+ if (from_gc) {
get_atssr_segment(sbi, type, se->type,
AT_SSR, se->mtime);
- else
- sit_i->s_ops->allocate_segment(sbi, type, false);
+ } else {
+ if (need_new_seg(sbi, type))
+ new_curseg(sbi, type, false);
+ else
+ change_curseg(sbi, type);
+ stat_inc_seg_type(sbi, curseg);
+ }
}
/*
* segment dirty status should be updated after segment allocation,
@@ -3281,6 +3317,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr));
+ if (IS_DATASEG(type))
+ atomic64_inc(&sbi->allocated_data_blocks);
+
up_write(&sit_i->sentry_lock);
if (page && IS_NODESEG(type)) {
@@ -3388,7 +3427,7 @@ void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
f2fs_submit_page_write(&fio);
stat_inc_meta_count(sbi, page->index);
- f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE);
+ f2fs_update_iostat(sbi, NULL, io_type, F2FS_BLKSIZE);
}
void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio)
@@ -3398,7 +3437,7 @@ void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio)
set_summary(&sum, nid, 0, 0);
do_write_page(&sum, fio);
- f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE);
+ f2fs_update_iostat(fio->sbi, NULL, fio->io_type, F2FS_BLKSIZE);
}
void f2fs_outplace_write_data(struct dnode_of_data *dn,
@@ -3408,11 +3447,13 @@ void f2fs_outplace_write_data(struct dnode_of_data *dn,
struct f2fs_summary sum;
f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR);
+ if (fio->io_type == FS_DATA_IO || fio->io_type == FS_CP_DATA_IO)
+ f2fs_update_age_extent_cache(dn);
set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version);
do_write_page(&sum, fio);
f2fs_update_data_blkaddr(dn, fio->new_blkaddr);
- f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE);
+ f2fs_update_iostat(sbi, dn->inode, fio->io_type, F2FS_BLKSIZE);
}
int f2fs_inplace_write_data(struct f2fs_io_info *fio)
@@ -3432,6 +3473,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
f2fs_warn(sbi, "%s: incorrect segment(%u) type, run fsck to fix.",
__func__, segno);
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE);
goto drop_bio;
}
@@ -3453,7 +3495,8 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
if (!err) {
f2fs_update_device_state(fio->sbi, fio->ino,
fio->new_blkaddr, 1);
- f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE);
+ f2fs_update_iostat(fio->sbi, fio->page->mapping->host,
+ fio->io_type, F2FS_BLKSIZE);
}
return err;
@@ -3530,7 +3573,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
/* change the current segment */
if (segno != curseg->segno) {
curseg->next_segno = segno;
- change_curseg(sbi, type, true);
+ change_curseg(sbi, type);
}
curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
@@ -3558,7 +3601,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
if (recover_curseg) {
if (old_cursegno != curseg->segno) {
curseg->next_segno = old_cursegno;
- change_curseg(sbi, type, true);
+ change_curseg(sbi, type);
}
curseg->next_blkoff = old_blkoff;
curseg->alloc_type = old_alloc_type;
@@ -4255,9 +4298,6 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
return -ENOMEM;
#endif
- /* init SIT information */
- sit_i->s_ops = &default_salloc_ops;
-
sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr);
sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg;
sit_i->written_valid_blocks = 0;
@@ -4379,6 +4419,8 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
if (se->type >= NR_PERSISTENT_LOG) {
f2fs_err(sbi, "Invalid segment type: %u, segno: %u",
se->type, start);
+ f2fs_handle_error(sbi,
+ ERROR_INCONSISTENT_SUM_TYPE);
return -EFSCORRUPTED;
}
@@ -4415,6 +4457,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
f2fs_err(sbi, "Wrong journal entry on segno %u",
start);
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_CORRUPTED_JOURNAL);
break;
}
@@ -4434,6 +4477,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
f2fs_err(sbi, "Invalid segment type: %u, segno: %u",
se->type, start);
err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE);
break;
}
@@ -4465,6 +4509,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
if (sit_valid_blocks[NODE] != valid_node_count(sbi)) {
f2fs_err(sbi, "SIT is corrupted node# %u vs %u",
sit_valid_blocks[NODE], valid_node_count(sbi));
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_NODE_COUNT);
return -EFSCORRUPTED;
}
@@ -4473,6 +4518,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u",
sit_valid_blocks[DATA], sit_valid_blocks[NODE],
valid_user_blocks(sbi));
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_BLOCK_COUNT);
return -EFSCORRUPTED;
}
@@ -4623,6 +4669,7 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi)
f2fs_err(sbi,
"Current segment has invalid alloc_type:%d",
curseg->alloc_type);
+ f2fs_handle_error(sbi, ERROR_INVALID_CURSEG);
return -EFSCORRUPTED;
}
@@ -4640,6 +4687,7 @@ out:
"Current segment's next free block offset is inconsistent with bitmap, logtype:%u, segno:%u, type:%u, next_blkoff:%u, blkofs:%u",
i, curseg->segno, curseg->alloc_type,
curseg->next_blkoff, blkofs);
+ f2fs_handle_error(sbi, ERROR_INVALID_CURSEG);
return -EFSCORRUPTED;
}
}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index d1d63766f2c7..3ad1b7b6fa94 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -222,10 +222,6 @@ struct sec_entry {
unsigned int valid_blocks; /* # of valid blocks in a section */
};
-struct segment_allocation {
- void (*allocate_segment)(struct f2fs_sb_info *, int, bool);
-};
-
#define MAX_SKIP_GC_COUNT 16
struct revoke_entry {
@@ -235,8 +231,6 @@ struct revoke_entry {
};
struct sit_info {
- const struct segment_allocation *s_ops;
-
block_t sit_base_addr; /* start block address of SIT area */
block_t sit_blocks; /* # of blocks used by SIT area */
block_t written_valid_blocks; /* # of valid blocks in main area */
@@ -753,6 +747,7 @@ static inline int check_block_count(struct f2fs_sb_info *sbi,
f2fs_err(sbi, "Mismatch valid blocks %d vs. %d",
GET_SIT_VBLOCKS(raw_sit), valid_blocks);
set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT);
return -EFSCORRUPTED;
}
@@ -767,6 +762,7 @@ static inline int check_block_count(struct f2fs_sb_info *sbi,
f2fs_err(sbi, "Wrong valid blocks %d or segno %u",
GET_SIT_VBLOCKS(raw_sit), segno);
set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT);
return -EFSCORRUPTED;
}
return 0;
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index dd3c3c7a90ec..83d6fb97dcae 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -28,10 +28,13 @@ static unsigned long __count_free_nids(struct f2fs_sb_info *sbi)
return count > 0 ? count : 0;
}
-static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi)
+static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi,
+ enum extent_type type)
{
- return atomic_read(&sbi->total_zombie_tree) +
- atomic_read(&sbi->total_ext_node);
+ struct extent_tree_info *eti = &sbi->extent_tree[type];
+
+ return atomic_read(&eti->total_zombie_tree) +
+ atomic_read(&eti->total_ext_node);
}
unsigned long f2fs_shrink_count(struct shrinker *shrink,
@@ -53,8 +56,11 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink,
}
spin_unlock(&f2fs_list_lock);
- /* count extent cache entries */
- count += __count_extent_cache(sbi);
+ /* count read extent cache entries */
+ count += __count_extent_cache(sbi, EX_READ);
+
+ /* count block age extent cache entries */
+ count += __count_extent_cache(sbi, EX_BLOCK_AGE);
/* count clean nat cache entries */
count += __count_nat_entries(sbi);
@@ -100,7 +106,10 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink,
sbi->shrinker_run_no = run_no;
/* shrink extent cache entries */
- freed += f2fs_shrink_extent_tree(sbi, nr >> 1);
+ freed += f2fs_shrink_age_extent_tree(sbi, nr >> 2);
+
+ /* shrink read extent cache entries */
+ freed += f2fs_shrink_read_extent_tree(sbi, nr >> 2);
/* shrink clean nat cache entries */
if (freed < nr)
@@ -130,7 +139,9 @@ void f2fs_join_shrinker(struct f2fs_sb_info *sbi)
void f2fs_leave_shrinker(struct f2fs_sb_info *sbi)
{
- f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi));
+ f2fs_shrink_read_extent_tree(sbi, __count_extent_cache(sbi, EX_READ));
+ f2fs_shrink_age_extent_tree(sbi,
+ __count_extent_cache(sbi, EX_BLOCK_AGE));
spin_lock(&f2fs_list_lock);
list_del_init(&sbi->s_list);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 2451623c05a7..1f812b9ce985 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -61,6 +61,7 @@ const char *f2fs_fault_name[FAULT_MAX] = {
[FAULT_SLAB_ALLOC] = "slab alloc",
[FAULT_DQUOT_INIT] = "dquot initialize",
[FAULT_LOCK_OP] = "lock_op",
+ [FAULT_BLKADDR] = "invalid blkaddr",
};
void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
@@ -110,6 +111,7 @@ enum {
Opt_noinline_dentry,
Opt_flush_merge,
Opt_noflush_merge,
+ Opt_barrier,
Opt_nobarrier,
Opt_fastboot,
Opt_extent_cache,
@@ -161,6 +163,7 @@ enum {
Opt_nogc_merge,
Opt_discard_unit,
Opt_memory_mode,
+ Opt_age_extent_cache,
Opt_err,
};
@@ -186,6 +189,7 @@ static match_table_t f2fs_tokens = {
{Opt_noinline_dentry, "noinline_dentry"},
{Opt_flush_merge, "flush_merge"},
{Opt_noflush_merge, "noflush_merge"},
+ {Opt_barrier, "barrier"},
{Opt_nobarrier, "nobarrier"},
{Opt_fastboot, "fastboot"},
{Opt_extent_cache, "extent_cache"},
@@ -238,6 +242,7 @@ static match_table_t f2fs_tokens = {
{Opt_nogc_merge, "nogc_merge"},
{Opt_discard_unit, "discard_unit=%s"},
{Opt_memory_mode, "memory=%s"},
+ {Opt_age_extent_cache, "age_extent_cache"},
{Opt_err, NULL},
};
@@ -285,9 +290,7 @@ static int __init f2fs_create_casefold_cache(void)
{
f2fs_cf_name_slab = f2fs_kmem_cache_create("f2fs_casefolded_name",
F2FS_NAME_LEN);
- if (!f2fs_cf_name_slab)
- return -ENOMEM;
- return 0;
+ return f2fs_cf_name_slab ? 0 : -ENOMEM;
}
static void f2fs_destroy_casefold_cache(void)
@@ -301,10 +304,10 @@ static void f2fs_destroy_casefold_cache(void) { }
static inline void limit_reserve_root(struct f2fs_sb_info *sbi)
{
- block_t limit = min((sbi->user_block_count << 1) / 1000,
+ block_t limit = min((sbi->user_block_count >> 3),
sbi->user_block_count - sbi->reserved_blocks);
- /* limit is 0.2% */
+ /* limit is 12.5% */
if (test_opt(sbi, RESERVE_ROOT) &&
F2FS_OPTION(sbi).root_reserved_blocks > limit) {
F2FS_OPTION(sbi).root_reserved_blocks = limit;
@@ -806,14 +809,17 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
case Opt_nobarrier:
set_opt(sbi, NOBARRIER);
break;
+ case Opt_barrier:
+ clear_opt(sbi, NOBARRIER);
+ break;
case Opt_fastboot:
set_opt(sbi, FASTBOOT);
break;
case Opt_extent_cache:
- set_opt(sbi, EXTENT_CACHE);
+ set_opt(sbi, READ_EXTENT_CACHE);
break;
case Opt_noextent_cache:
- clear_opt(sbi, EXTENT_CACHE);
+ clear_opt(sbi, READ_EXTENT_CACHE);
break;
case Opt_noinline_data:
clear_opt(sbi, INLINE_DATA);
@@ -1253,6 +1259,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
}
kfree(name);
break;
+ case Opt_age_extent_cache:
+ set_opt(sbi, AGE_EXTENT_CACHE);
+ break;
default:
f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value",
p);
@@ -1342,6 +1351,16 @@ default_check:
return -EINVAL;
}
+ if (test_opt(sbi, ATGC) && f2fs_lfs_mode(sbi)) {
+ f2fs_err(sbi, "LFS not compatible with ATGC");
+ return -EINVAL;
+ }
+
+ if (f2fs_is_readonly(sbi) && test_opt(sbi, FLUSH_MERGE)) {
+ f2fs_err(sbi, "FLUSH_MERGE not compatible with readonly mode");
+ return -EINVAL;
+ }
+
if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) {
f2fs_err(sbi, "Allow to mount readonly mode only");
return -EROFS;
@@ -1562,8 +1581,7 @@ static void f2fs_put_super(struct super_block *sb)
/* be sure to wait for any on-going discard commands */
dropped = f2fs_issue_discard_timeout(sbi);
- if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) &&
- !sbi->discard_blks && !dropped) {
+ if (f2fs_realtime_discard_enable(sbi) && !sbi->discard_blks && !dropped) {
struct cp_control cpc = {
.reason = CP_UMOUNT | CP_TRIMMED,
};
@@ -1666,9 +1684,8 @@ static int f2fs_freeze(struct super_block *sb)
if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY))
return -EINVAL;
- /* ensure no checkpoint required */
- if (!llist_empty(&F2FS_SB(sb)->cprc_info.issue_list))
- return -EINVAL;
+ /* Let's flush checkpoints and stop the thread. */
+ f2fs_flush_ckpt_thread(F2FS_SB(sb));
/* to avoid deadlock on f2fs_evict_inode->SB_FREEZE_FS */
set_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING);
@@ -1931,16 +1948,22 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",inline_dentry");
else
seq_puts(seq, ",noinline_dentry");
- if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE))
+ if (test_opt(sbi, FLUSH_MERGE))
seq_puts(seq, ",flush_merge");
+ else
+ seq_puts(seq, ",noflush_merge");
if (test_opt(sbi, NOBARRIER))
seq_puts(seq, ",nobarrier");
+ else
+ seq_puts(seq, ",barrier");
if (test_opt(sbi, FASTBOOT))
seq_puts(seq, ",fastboot");
- if (test_opt(sbi, EXTENT_CACHE))
+ if (test_opt(sbi, READ_EXTENT_CACHE))
seq_puts(seq, ",extent_cache");
else
seq_puts(seq, ",noextent_cache");
+ if (test_opt(sbi, AGE_EXTENT_CACHE))
+ seq_puts(seq, ",age_extent_cache");
if (test_opt(sbi, DATA_FLUSH))
seq_puts(seq, ",data_flush");
@@ -2039,7 +2062,11 @@ static void default_options(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE;
F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
- F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT;
+ if (le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_main) <=
+ SMALL_VOLUME_SEGMENTS)
+ F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE;
+ else
+ F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT;
F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX;
F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID);
F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID);
@@ -2055,13 +2082,14 @@ static void default_options(struct f2fs_sb_info *sbi)
set_opt(sbi, INLINE_XATTR);
set_opt(sbi, INLINE_DATA);
set_opt(sbi, INLINE_DENTRY);
- set_opt(sbi, EXTENT_CACHE);
+ set_opt(sbi, READ_EXTENT_CACHE);
set_opt(sbi, NOHEAP);
clear_opt(sbi, DISABLE_CHECKPOINT);
set_opt(sbi, MERGE_CHECKPOINT);
F2FS_OPTION(sbi).unusable_cap = 0;
sbi->sb->s_flags |= SB_LAZYTIME;
- set_opt(sbi, FLUSH_MERGE);
+ if (!f2fs_is_readonly(sbi))
+ set_opt(sbi, FLUSH_MERGE);
if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi))
set_opt(sbi, DISCARD);
if (f2fs_sb_has_blkzoned(sbi)) {
@@ -2181,6 +2209,9 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
f2fs_up_write(&sbi->gc_lock);
f2fs_sync_fs(sbi->sb, 1);
+
+ /* Let's ensure there's no pending checkpoint anymore */
+ f2fs_flush_ckpt_thread(sbi);
}
static int f2fs_remount(struct super_block *sb, int *flags, char *data)
@@ -2193,14 +2224,14 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
bool need_restart_ckpt = false, need_stop_ckpt = false;
bool need_restart_flush = false, need_stop_flush = false;
bool need_restart_discard = false, need_stop_discard = false;
- bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE);
+ bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE);
+ bool no_age_extent_cache = !test_opt(sbi, AGE_EXTENT_CACHE);
bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT);
bool no_io_align = !F2FS_IO_ALIGNED(sbi);
bool no_atgc = !test_opt(sbi, ATGC);
bool no_discard = !test_opt(sbi, DISCARD);
bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE);
bool block_unit_discard = f2fs_block_unit_discard(sbi);
- struct discard_cmd_control *dcc;
#ifdef CONFIG_QUOTA
int i, j;
#endif
@@ -2283,11 +2314,17 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
}
/* disallow enable/disable extent_cache dynamically */
- if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) {
+ if (no_read_extent_cache == !!test_opt(sbi, READ_EXTENT_CACHE)) {
err = -EINVAL;
f2fs_warn(sbi, "switch extent_cache option is not allowed");
goto restore_opts;
}
+ /* disallow enable/disable age extent_cache dynamically */
+ if (no_age_extent_cache == !!test_opt(sbi, AGE_EXTENT_CACHE)) {
+ err = -EINVAL;
+ f2fs_warn(sbi, "switch age_extent_cache option is not allowed");
+ goto restore_opts;
+ }
if (no_io_align == !!F2FS_IO_ALIGNED(sbi)) {
err = -EINVAL;
@@ -2346,6 +2383,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
f2fs_stop_ckpt_thread(sbi);
need_restart_ckpt = true;
} else {
+ /* Flush if the prevous checkpoint, if exists. */
+ f2fs_flush_ckpt_thread(sbi);
+
err = f2fs_start_ckpt_thread(sbi);
if (err) {
f2fs_err(sbi,
@@ -2378,10 +2418,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
goto restore_flush;
need_stop_discard = true;
} else {
- dcc = SM_I(sbi)->dcc_info;
f2fs_stop_discard_thread(sbi);
- if (atomic_read(&dcc->discard_cmd_cnt))
- f2fs_issue_discard_timeout(sbi);
+ f2fs_issue_discard_timeout(sbi);
need_restart_discard = true;
}
}
@@ -2465,7 +2503,6 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data,
size_t toread;
loff_t i_size = i_size_read(inode);
struct page *page;
- char *kaddr;
if (off > i_size)
return 0;
@@ -2498,9 +2535,7 @@ repeat:
return -EIO;
}
- kaddr = kmap_atomic(page);
- memcpy(data, kaddr + offset, tocopy);
- kunmap_atomic(kaddr);
+ memcpy_from_page(data, page, offset, tocopy);
f2fs_put_page(page, 1);
offset = 0;
@@ -2522,7 +2557,6 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type,
size_t towrite = len;
struct page *page;
void *fsdata = NULL;
- char *kaddr;
int err = 0;
int tocopy;
@@ -2541,10 +2575,7 @@ retry:
break;
}
- kaddr = kmap_atomic(page);
- memcpy(kaddr + offset, data, tocopy);
- kunmap_atomic(kaddr);
- flush_dcache_page(page);
+ memcpy_to_page(page, offset, data, tocopy);
a_ops->write_end(NULL, mapping, off, tocopy, tocopy,
page, fsdata);
@@ -3039,23 +3070,24 @@ static void f2fs_get_ino_and_lblk_bits(struct super_block *sb,
*lblk_bits_ret = 8 * sizeof(block_t);
}
-static int f2fs_get_num_devices(struct super_block *sb)
+static struct block_device **f2fs_get_devices(struct super_block *sb,
+ unsigned int *num_devs)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct block_device **devs;
+ int i;
- if (f2fs_is_multi_device(sbi))
- return sbi->s_ndevs;
- return 1;
-}
+ if (!f2fs_is_multi_device(sbi))
+ return NULL;
-static void f2fs_get_devices(struct super_block *sb,
- struct request_queue **devs)
-{
- struct f2fs_sb_info *sbi = F2FS_SB(sb);
- int i;
+ devs = kmalloc_array(sbi->s_ndevs, sizeof(*devs), GFP_KERNEL);
+ if (!devs)
+ return ERR_PTR(-ENOMEM);
for (i = 0; i < sbi->s_ndevs; i++)
- devs[i] = bdev_get_queue(FDEV(i).bdev);
+ devs[i] = FDEV(i).bdev;
+ *num_devs = sbi->s_ndevs;
+ return devs;
}
static const struct fscrypt_operations f2fs_cryptops = {
@@ -3066,7 +3098,6 @@ static const struct fscrypt_operations f2fs_cryptops = {
.empty_dir = f2fs_empty_dir,
.has_stable_inodes = f2fs_has_stable_inodes,
.get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits,
- .get_num_devices = f2fs_get_num_devices,
.get_devices = f2fs_get_devices,
};
#endif
@@ -3613,7 +3644,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->seq_file_ra_mul = MIN_RA_MUL;
sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE;
sbi->max_fragment_hole = DEF_FRAGMENT_SIZE;
- spin_lock_init(&sbi->gc_urgent_high_lock);
+ spin_lock_init(&sbi->gc_remaining_trials_lock);
atomic64_set(&sbi->current_atomic_write, 0);
sbi->dir_level = DEF_DIR_LEVEL;
@@ -3843,6 +3874,68 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
return err;
}
+void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason)
+{
+ struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
+ int err;
+
+ f2fs_down_write(&sbi->sb_lock);
+
+ if (raw_super->s_stop_reason[reason] < ((1 << BITS_PER_BYTE) - 1))
+ raw_super->s_stop_reason[reason]++;
+
+ err = f2fs_commit_super(sbi, false);
+ if (err)
+ f2fs_err(sbi, "f2fs_commit_super fails to record reason:%u err:%d",
+ reason, err);
+ f2fs_up_write(&sbi->sb_lock);
+}
+
+static void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag)
+{
+ spin_lock(&sbi->error_lock);
+ if (!test_bit(flag, (unsigned long *)sbi->errors)) {
+ set_bit(flag, (unsigned long *)sbi->errors);
+ sbi->error_dirty = true;
+ }
+ spin_unlock(&sbi->error_lock);
+}
+
+static bool f2fs_update_errors(struct f2fs_sb_info *sbi)
+{
+ bool need_update = false;
+
+ spin_lock(&sbi->error_lock);
+ if (sbi->error_dirty) {
+ memcpy(F2FS_RAW_SUPER(sbi)->s_errors, sbi->errors,
+ MAX_F2FS_ERRORS);
+ sbi->error_dirty = false;
+ need_update = true;
+ }
+ spin_unlock(&sbi->error_lock);
+
+ return need_update;
+}
+
+void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error)
+{
+ int err;
+
+ f2fs_save_errors(sbi, error);
+
+ f2fs_down_write(&sbi->sb_lock);
+
+ if (!f2fs_update_errors(sbi))
+ goto out_unlock;
+
+ err = f2fs_commit_super(sbi, false);
+ if (err)
+ f2fs_err(sbi, "f2fs_commit_super fails to record errors:%u, err:%d",
+ error, err);
+out_unlock:
+ f2fs_up_write(&sbi->sb_lock);
+}
+
static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
@@ -3991,18 +4084,16 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi)
static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi)
{
- struct f2fs_sm_info *sm_i = SM_I(sbi);
-
/* adjust parameters according to the volume size */
- if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) {
- F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE;
+ if (MAIN_SEGS(sbi) <= SMALL_VOLUME_SEGMENTS) {
if (f2fs_block_unit_discard(sbi))
- sm_i->dcc_info->discard_granularity = 1;
- sm_i->ipu_policy = 1 << F2FS_IPU_FORCE |
+ SM_I(sbi)->dcc_info->discard_granularity =
+ MIN_DISCARD_GRANULARITY;
+ SM_I(sbi)->ipu_policy = 1 << F2FS_IPU_FORCE |
1 << F2FS_IPU_HONOR_OPU_WRITE;
}
- sbi->readdir_ra = 1;
+ sbi->readdir_ra = true;
}
static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
@@ -4030,6 +4121,24 @@ try_onemore:
sbi->sb = sb;
+ /* initialize locks within allocated memory */
+ init_f2fs_rwsem(&sbi->gc_lock);
+ mutex_init(&sbi->writepages);
+ init_f2fs_rwsem(&sbi->cp_global_sem);
+ init_f2fs_rwsem(&sbi->node_write);
+ init_f2fs_rwsem(&sbi->node_change);
+ spin_lock_init(&sbi->stat_lock);
+ init_f2fs_rwsem(&sbi->cp_rwsem);
+ init_f2fs_rwsem(&sbi->quota_sem);
+ init_waitqueue_head(&sbi->cp_wait);
+ spin_lock_init(&sbi->error_lock);
+
+ for (i = 0; i < NR_INODE_TYPE; i++) {
+ INIT_LIST_HEAD(&sbi->inode_list[i]);
+ spin_lock_init(&sbi->inode_lock[i]);
+ }
+ mutex_init(&sbi->flush_lock);
+
/* Load the checksum driver */
sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0);
if (IS_ERR(sbi->s_chksum_driver)) {
@@ -4053,6 +4162,8 @@ try_onemore:
sb->s_fs_info = sbi;
sbi->raw_super = raw_super;
+ memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS);
+
/* precompute checksum seed for metadata */
if (f2fs_sb_has_inode_chksum(sbi))
sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid,
@@ -4109,23 +4220,14 @@ try_onemore:
/* init f2fs-specific super block info */
sbi->valid_super_block = valid_super_block;
- init_f2fs_rwsem(&sbi->gc_lock);
- mutex_init(&sbi->writepages);
- init_f2fs_rwsem(&sbi->cp_global_sem);
- init_f2fs_rwsem(&sbi->node_write);
- init_f2fs_rwsem(&sbi->node_change);
/* disallow all the data/node/meta page writes */
set_sbi_flag(sbi, SBI_POR_DOING);
- spin_lock_init(&sbi->stat_lock);
err = f2fs_init_write_merge_io(sbi);
if (err)
goto free_bio_info;
- init_f2fs_rwsem(&sbi->cp_rwsem);
- init_f2fs_rwsem(&sbi->quota_sem);
- init_waitqueue_head(&sbi->cp_wait);
init_sb_info(sbi);
err = f2fs_init_iostat(sbi);
@@ -4203,12 +4305,6 @@ try_onemore:
limit_reserve_root(sbi);
adjust_unusable_cap_perc(sbi);
- for (i = 0; i < NR_INODE_TYPE; i++) {
- INIT_LIST_HEAD(&sbi->inode_list[i]);
- spin_lock_init(&sbi->inode_lock[i]);
- }
- mutex_init(&sbi->flush_lock);
-
f2fs_init_extent_cache_info(sbi);
f2fs_init_ino_entry_info(sbi);
@@ -4455,9 +4551,9 @@ free_nm:
f2fs_destroy_node_manager(sbi);
free_sm:
f2fs_destroy_segment_manager(sbi);
- f2fs_destroy_post_read_wq(sbi);
stop_ckpt_thread:
f2fs_stop_ckpt_thread(sbi);
+ f2fs_destroy_post_read_wq(sbi);
free_devices:
destroy_device_list(sbi);
kvfree(sbi->ckpt);
@@ -4558,9 +4654,7 @@ static int __init init_inodecache(void)
f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache",
sizeof(struct f2fs_inode_info), 0,
SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL);
- if (!f2fs_inode_cachep)
- return -ENOMEM;
- return 0;
+ return f2fs_inode_cachep ? 0 : -ENOMEM;
}
static void destroy_inodecache(void)
@@ -4625,7 +4719,7 @@ static int __init init_f2fs_fs(void)
goto free_iostat;
err = f2fs_init_bioset();
if (err)
- goto free_bio_enrty_cache;
+ goto free_bio_entry_cache;
err = f2fs_init_compress_mempool();
if (err)
goto free_bioset;
@@ -4642,7 +4736,7 @@ free_compress_mempool:
f2fs_destroy_compress_mempool();
free_bioset:
f2fs_destroy_bioset();
-free_bio_enrty_cache:
+free_bio_entry_cache:
f2fs_destroy_bio_entry_cache();
free_iostat:
f2fs_destroy_iostat_processing();
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index eba5fb1629d7..83a366f3ee80 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -53,9 +53,9 @@ static const char *gc_mode_names[MAX_GC_MODE] = {
struct f2fs_attr {
struct attribute attr;
- ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *);
- ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *,
- const char *, size_t);
+ ssize_t (*show)(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf);
+ ssize_t (*store)(struct f2fs_attr *a, struct f2fs_sb_info *sbi,
+ const char *buf, size_t len);
int struct_type;
int offset;
int id;
@@ -95,28 +95,28 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
static ssize_t dirty_segments_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)(dirty_segments(sbi)));
}
static ssize_t free_segments_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)(free_segments(sbi)));
}
static ssize_t ovp_segments_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)(overprovision_segments(sbi)));
}
static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)(sbi->kbytes_written +
((f2fs_get_sectors_written(sbi) -
sbi->sectors_written_start) >> 1)));
@@ -125,7 +125,13 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
static ssize_t sb_status_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "%lx\n", sbi->s_flag);
+ return sysfs_emit(buf, "%lx\n", sbi->s_flag);
+}
+
+static ssize_t cp_status_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ return sysfs_emit(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags));
}
static ssize_t pending_discard_show(struct f2fs_attr *a,
@@ -133,10 +139,16 @@ static ssize_t pending_discard_show(struct f2fs_attr *a,
{
if (!SM_I(sbi)->dcc_info)
return -EINVAL;
- return sprintf(buf, "%llu\n", (unsigned long long)atomic_read(
+ return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic_read(
&SM_I(sbi)->dcc_info->discard_cmd_cnt));
}
+static ssize_t gc_mode_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ return sysfs_emit(buf, "%s\n", gc_mode_names[sbi->gc_mode]);
+}
+
static ssize_t features_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
@@ -193,7 +205,7 @@ static ssize_t features_show(struct f2fs_attr *a,
static ssize_t current_reserved_blocks_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "%u\n", sbi->current_reserved_blocks);
+ return sysfs_emit(buf, "%u\n", sbi->current_reserved_blocks);
}
static ssize_t unusable_show(struct f2fs_attr *a,
@@ -205,7 +217,7 @@ static ssize_t unusable_show(struct f2fs_attr *a,
unusable = sbi->unusable_block_count;
else
unusable = f2fs_get_unusable_blocks(sbi);
- return sprintf(buf, "%llu\n", (unsigned long long)unusable);
+ return sysfs_emit(buf, "%llu\n", (unsigned long long)unusable);
}
static ssize_t encoding_show(struct f2fs_attr *a,
@@ -220,13 +232,13 @@ static ssize_t encoding_show(struct f2fs_attr *a,
(sb->s_encoding->version >> 8) & 0xff,
sb->s_encoding->version & 0xff);
#endif
- return sprintf(buf, "(none)");
+ return sysfs_emit(buf, "(none)\n");
}
static ssize_t mounted_time_sec_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "%llu", SIT_I(sbi)->mounted_time);
+ return sysfs_emit(buf, "%llu\n", SIT_I(sbi)->mounted_time);
}
#ifdef CONFIG_F2FS_STAT_FS
@@ -235,7 +247,7 @@ static ssize_t moved_blocks_foreground_show(struct f2fs_attr *a,
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
- return sprintf(buf, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)(si->tot_blks -
(si->bg_data_blks + si->bg_node_blks)));
}
@@ -245,7 +257,7 @@ static ssize_t moved_blocks_background_show(struct f2fs_attr *a,
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
- return sprintf(buf, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)(si->bg_data_blks + si->bg_node_blks));
}
@@ -256,7 +268,7 @@ static ssize_t avg_vblocks_show(struct f2fs_attr *a,
si->dirty_count = dirty_segments(sbi);
f2fs_update_sit_info(sbi);
- return sprintf(buf, "%llu\n", (unsigned long long)(si->avg_vblocks));
+ return sysfs_emit(buf, "%llu\n", (unsigned long long)(si->avg_vblocks));
}
#endif
@@ -326,13 +338,8 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
return sysfs_emit(buf, "%u\n", sbi->compr_new_inode);
#endif
- if (!strcmp(a->attr.name, "gc_urgent"))
- return sysfs_emit(buf, "%s\n",
- gc_mode_names[sbi->gc_mode]);
-
if (!strcmp(a->attr.name, "gc_segment_mode"))
- return sysfs_emit(buf, "%s\n",
- gc_mode_names[sbi->gc_segment_mode]);
+ return sysfs_emit(buf, "%u\n", sbi->gc_segment_mode);
if (!strcmp(a->attr.name, "gc_reclaimed_segments")) {
return sysfs_emit(buf, "%u\n",
@@ -356,7 +363,7 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
ui = (unsigned int *)(ptr + a->offset);
- return sprintf(buf, "%u\n", *ui);
+ return sysfs_emit(buf, "%u\n", *ui);
}
static ssize_t __sbi_store(struct f2fs_attr *a,
@@ -477,14 +484,27 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "max_ordered_discard")) {
+ if (t == 0 || t > MAX_PLIST_NUM)
+ return -EINVAL;
+ if (!f2fs_block_unit_discard(sbi))
+ return -EINVAL;
+ *ui = t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "discard_urgent_util")) {
+ if (t > 100)
+ return -EINVAL;
+ *ui = t;
+ return count;
+ }
+
if (!strcmp(a->attr.name, "migration_granularity")) {
if (t == 0 || t > sbi->segs_per_sec)
return -EINVAL;
}
- if (!strcmp(a->attr.name, "trim_sections"))
- return -EINVAL;
-
if (!strcmp(a->attr.name, "gc_urgent")) {
if (t == 0) {
sbi->gc_mode = GC_NORMAL;
@@ -525,11 +545,10 @@ out:
return count;
}
- if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) {
- spin_lock(&sbi->gc_urgent_high_lock);
- sbi->gc_urgent_high_limited = t != 0;
- sbi->gc_urgent_high_remaining = t;
- spin_unlock(&sbi->gc_urgent_high_lock);
+ if (!strcmp(a->attr.name, "gc_remaining_trials")) {
+ spin_lock(&sbi->gc_remaining_trials_lock);
+ sbi->gc_remaining_trials = t;
+ spin_unlock(&sbi->gc_remaining_trials_lock);
return count;
}
@@ -644,6 +663,29 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "readdir_ra")) {
+ sbi->readdir_ra = !!t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "hot_data_age_threshold")) {
+ if (t == 0 || t >= sbi->warm_data_age_threshold)
+ return -EINVAL;
+ if (t == *ui)
+ return count;
+ *ui = (unsigned int)t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "warm_data_age_threshold")) {
+ if (t == 0 || t <= sbi->hot_data_age_threshold)
+ return -EINVAL;
+ if (t == *ui)
+ return count;
+ *ui = (unsigned int)t;
+ return count;
+ }
+
*ui = (unsigned int)t;
return count;
@@ -716,7 +758,7 @@ static void f2fs_sb_release(struct kobject *kobj)
static ssize_t f2fs_feature_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "supported\n");
+ return sysfs_emit(buf, "supported\n");
}
#define F2FS_FEATURE_RO_ATTR(_name) \
@@ -729,8 +771,8 @@ static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
if (F2FS_HAS_FEATURE(sbi, a->id))
- return sprintf(buf, "supported\n");
- return sprintf(buf, "unsupported\n");
+ return sysfs_emit(buf, "supported\n");
+ return sysfs_emit(buf, "unsupported\n");
}
#define F2FS_SB_FEATURE_RO_ATTR(_name, _feat) \
@@ -783,9 +825,10 @@ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_req
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_urgent_util, discard_urgent_util);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_ordered_discard, max_ordered_discard);
F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks);
-F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
@@ -820,7 +863,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type);
#endif
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag);
-F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent_high_remaining, gc_urgent_high_remaining);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_remaining_trials, gc_remaining_trials);
F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio);
F2FS_GENERAL_RO_ATTR(dirty_segments);
F2FS_GENERAL_RO_ATTR(free_segments);
@@ -833,6 +876,7 @@ F2FS_GENERAL_RO_ATTR(encoding);
F2FS_GENERAL_RO_ATTR(mounted_time_sec);
F2FS_GENERAL_RO_ATTR(main_blkaddr);
F2FS_GENERAL_RO_ATTR(pending_discard);
+F2FS_GENERAL_RO_ATTR(gc_mode);
#ifdef CONFIG_F2FS_STAT_FS
F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count);
F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count);
@@ -897,6 +941,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, peak_atomic_write, peak_atomic_write);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, committed_atomic_block, committed_atomic_block);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, revoked_atomic_block, revoked_atomic_block);
+/* For block age extent cache */
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, hot_data_age_threshold, hot_data_age_threshold);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, warm_data_age_threshold, warm_data_age_threshold);
+
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
ATTR_LIST(gc_urgent_sleep_time),
@@ -912,9 +960,11 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(min_discard_issue_time),
ATTR_LIST(mid_discard_issue_time),
ATTR_LIST(max_discard_issue_time),
+ ATTR_LIST(discard_urgent_util),
ATTR_LIST(discard_granularity),
+ ATTR_LIST(max_ordered_discard),
ATTR_LIST(pending_discard),
- ATTR_LIST(batched_trim_sections),
+ ATTR_LIST(gc_mode),
ATTR_LIST(ipu_policy),
ATTR_LIST(min_ipu_util),
ATTR_LIST(min_fsync_blocks),
@@ -947,7 +997,7 @@ static struct attribute *f2fs_attrs[] = {
#endif
ATTR_LIST(data_io_flag),
ATTR_LIST(node_io_flag),
- ATTR_LIST(gc_urgent_high_remaining),
+ ATTR_LIST(gc_remaining_trials),
ATTR_LIST(ckpt_thread_ioprio),
ATTR_LIST(dirty_segments),
ATTR_LIST(free_segments),
@@ -990,6 +1040,8 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(peak_atomic_write),
ATTR_LIST(committed_atomic_block),
ATTR_LIST(revoked_atomic_block),
+ ATTR_LIST(hot_data_age_threshold),
+ ATTR_LIST(warm_data_age_threshold),
NULL,
};
ATTRIBUTE_GROUPS(f2fs);
@@ -1030,8 +1082,10 @@ static struct attribute *f2fs_feat_attrs[] = {
ATTRIBUTE_GROUPS(f2fs_feat);
F2FS_GENERAL_RO_ATTR(sb_status);
+F2FS_GENERAL_RO_ATTR(cp_status);
static struct attribute *f2fs_stat_attrs[] = {
ATTR_LIST(sb_status),
+ ATTR_LIST(cp_status),
NULL,
};
ATTRIBUTE_GROUPS(f2fs_stat);
@@ -1236,6 +1290,44 @@ static int __maybe_unused victim_bits_seq_show(struct seq_file *seq,
return 0;
}
+static int __maybe_unused discard_plist_seq_show(struct seq_file *seq,
+ void *offset)
+{
+ struct super_block *sb = seq->private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ int i, count;
+
+ seq_puts(seq, "Discard pend list(Show diacrd_cmd count on each entry, .:not exist):\n");
+ if (!f2fs_realtime_discard_enable(sbi))
+ return 0;
+
+ if (dcc) {
+ mutex_lock(&dcc->cmd_lock);
+ for (i = 0; i < MAX_PLIST_NUM; i++) {
+ struct list_head *pend_list;
+ struct discard_cmd *dc, *tmp;
+
+ if (i % 8 == 0)
+ seq_printf(seq, " %-3d", i);
+ count = 0;
+ pend_list = &dcc->pend_list[i];
+ list_for_each_entry_safe(dc, tmp, pend_list, list)
+ count++;
+ if (count)
+ seq_printf(seq, " %7d", count);
+ else
+ seq_puts(seq, " .");
+ if (i % 8 == 7)
+ seq_putc(seq, '\n');
+ }
+ seq_putc(seq, '\n');
+ mutex_unlock(&dcc->cmd_lock);
+ }
+
+ return 0;
+}
+
int __init f2fs_init_sysfs(void)
{
int ret;
@@ -1306,6 +1398,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
#endif
proc_create_single_data("victim_bits", 0444, sbi->s_proc,
victim_bits_seq_show, sb);
+ proc_create_single_data("discard_plist_info", 0444, sbi->s_proc,
+ discard_plist_seq_show, sb);
}
return 0;
put_feature_list_kobj:
@@ -1329,6 +1423,7 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi)
remove_proc_entry("segment_info", sbi->s_proc);
remove_proc_entry("segment_bits", sbi->s_proc);
remove_proc_entry("victim_bits", sbi->s_proc);
+ remove_proc_entry("discard_plist_info", sbi->s_proc);
remove_proc_entry(sbi->sb->s_id, f2fs_proc_root);
}
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 7b8f2b41c29b..c352fff88a5e 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -47,16 +47,13 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count,
size_t n = min_t(size_t, count,
PAGE_SIZE - offset_in_page(pos));
struct page *page;
- void *addr;
page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT,
NULL);
if (IS_ERR(page))
return PTR_ERR(page);
- addr = kmap_atomic(page);
- memcpy(buf, addr + offset_in_page(pos), n);
- kunmap_atomic(addr);
+ memcpy_from_page(buf, page, offset_in_page(pos), n);
put_page(page);
@@ -85,16 +82,13 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count,
PAGE_SIZE - offset_in_page(pos));
struct page *page;
void *fsdata;
- void *addr;
int res;
res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata);
if (res)
return res;
- addr = kmap_atomic(page);
- memcpy(addr + offset_in_page(pos), buf, n);
- kunmap_atomic(addr);
+ memcpy_to_page(page, offset_in_page(pos), buf, n);
res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata);
if (res < 0)
@@ -246,6 +240,8 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf,
if (pos + size < pos || pos + size > inode->i_sb->s_maxbytes ||
pos < f2fs_verity_metadata_pos(inode) || size > INT_MAX) {
f2fs_warn(F2FS_I_SB(inode), "invalid verity xattr");
+ f2fs_handle_error(F2FS_I_SB(inode),
+ ERROR_CORRUPTED_VERITY_XATTR);
return -EFSCORRUPTED;
}
if (buf_size) {
@@ -262,13 +258,14 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
unsigned long num_ra_pages)
{
- DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
struct page *page;
index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT;
page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
if (!page || !PageUptodate(page)) {
+ DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
+
if (page)
put_page(page);
else if (num_ra_pages > 1)
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index c76c15086e5f..dc2e8637189e 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -367,6 +367,8 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
inode->i_ino);
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
err = -EFSCORRUPTED;
+ f2fs_handle_error(F2FS_I_SB(inode),
+ ERROR_CORRUPTED_XATTR);
goto out;
}
check:
@@ -583,6 +585,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
inode->i_ino);
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
error = -EFSCORRUPTED;
+ f2fs_handle_error(F2FS_I_SB(inode),
+ ERROR_CORRUPTED_XATTR);
goto cleanup;
}
@@ -658,6 +662,8 @@ static int __f2fs_setxattr(struct inode *inode, int index,
inode->i_ino);
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
error = -EFSCORRUPTED;
+ f2fs_handle_error(F2FS_I_SB(inode),
+ ERROR_CORRUPTED_XATTR);
goto exit;
}
@@ -684,6 +690,8 @@ static int __f2fs_setxattr(struct inode *inode, int index,
inode->i_ino, ENTRY_SIZE(last));
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
error = -EFSCORRUPTED;
+ f2fs_handle_error(F2FS_I_SB(inode),
+ ERROR_CORRUPTED_XATTR);
goto exit;
}
last = XATTR_NEXT_ENTRY(last);
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 249825017da7..00235b8a1823 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -705,7 +705,7 @@ static int fat_readdir(struct file *file, struct dir_context *ctx)
}
#define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \
-static int func(struct dir_context *ctx, const char *name, int name_len, \
+static bool func(struct dir_context *ctx, const char *name, int name_len, \
loff_t offset, u64 ino, unsigned int d_type) \
{ \
struct fat_ioctl_filldir_callback *buf = \
@@ -714,7 +714,7 @@ static int func(struct dir_context *ctx, const char *name, int name_len, \
struct dirent_type __user *d2 = d1 + 1; \
\
if (buf->result) \
- return -EINVAL; \
+ return false; \
buf->result++; \
\
if (name != NULL) { \
@@ -750,10 +750,10 @@ static int func(struct dir_context *ctx, const char *name, int name_len, \
put_user(short_len, &d1->d_reclen)) \
goto efault; \
} \
- return 0; \
+ return true; \
efault: \
buf->result = -EFAULT; \
- return -EFAULT; \
+ return false; \
}
FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent)
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 3e4eb3467cb4..8a6b493b5b5f 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -461,8 +461,9 @@ static int fat_allow_set_time(struct user_namespace *mnt_userns,
{
umode_t allow_utime = sbi->options.allow_utime;
- if (!uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) {
- if (in_group_p(i_gid_into_mnt(mnt_userns, inode)))
+ if (!vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode),
+ current_fsuid())) {
+ if (vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)))
allow_utime >>= 3;
if (allow_utime & MAY_WRITE)
return 1;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index a38238d75c08..d99b8549ec8f 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -194,11 +194,6 @@ static int fat_get_block(struct inode *inode, sector_t iblock,
return 0;
}
-static int fat_writepage(struct page *page, struct writeback_control *wbc)
-{
- return block_write_full_page(page, fat_get_block, wbc);
-}
-
static int fat_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -346,12 +341,12 @@ static const struct address_space_operations fat_aops = {
.invalidate_folio = block_invalidate_folio,
.read_folio = fat_read_folio,
.readahead = fat_readahead,
- .writepage = fat_writepage,
.writepages = fat_writepages,
.write_begin = fat_write_begin,
.write_end = fat_write_end,
.direct_IO = fat_direct_IO,
- .bmap = _fat_bmap
+ .bmap = _fat_bmap,
+ .migrate_folio = buffer_migrate_folio,
};
/*
@@ -523,7 +518,7 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
inode->i_uid = sbi->options.fs_uid;
inode->i_gid = sbi->options.fs_gid;
inode_inc_iversion(inode);
- inode->i_generation = prandom_u32();
+ inode->i_generation = get_random_u32();
if ((de->attr & ATTR_DIR) && !IS_FREE(de->name)) {
inode->i_generation &= ~1;
diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c
index af191371c352..3626eb585a98 100644
--- a/fs/fat/nfs.c
+++ b/fs/fat/nfs.c
@@ -17,7 +17,7 @@ struct fat_fid {
#define FAT_FID_SIZE_WITHOUT_PARENT 3
#define FAT_FID_SIZE_WITH_PARENT (sizeof(struct fat_fid)/sizeof(u32))
-/**
+/*
* Look up a directory inode given its starting cluster.
*/
static struct inode *fat_dget(struct super_block *sb, int i_logstart)
@@ -135,7 +135,7 @@ fat_encode_fh_nostale(struct inode *inode, __u32 *fh, int *lenp,
return type;
}
-/**
+/*
* Map a NFS file handle to a corresponding dentry.
* The dentry may or may not be connected to the filesystem root.
*/
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 6630c69c23a2..f2bc27d1975e 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -14,7 +14,7 @@
#include "internal.h"
#include "mount.h"
-static long do_sys_name_to_handle(struct path *path,
+static long do_sys_name_to_handle(const struct path *path,
struct file_handle __user *ufh,
int __user *mnt_id)
{
diff --git a/fs/file.c b/fs/file.c
index 3bcc1ecc314a..c942c89ca4cd 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -980,6 +980,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
*ret_fd = fd;
return file;
}
+EXPORT_SYMBOL(task_lookup_next_fd_rcu);
/*
* Lightweight file lookup - no refcnt increment if fd table isn't shared.
@@ -1002,7 +1003,16 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask)
struct files_struct *files = current->files;
struct file *file;
- if (atomic_read(&files->count) == 1) {
+ /*
+ * If another thread is concurrently calling close_fd() followed
+ * by put_files_struct(), we must not observe the old table
+ * entry combined with the new refcount - otherwise we could
+ * return a file that is concurrently being freed.
+ *
+ * atomic_read_acquire() pairs with atomic_dec_and_test() in
+ * put_files_struct().
+ */
+ if (atomic_read_acquire(&files->count) == 1) {
file = files_lookup_fd_raw(files, fd);
if (!file || unlikely(file->f_mode & mask))
return 0;
diff --git a/fs/file_table.c b/fs/file_table.c
index 99c6796c9f28..dd88701e54a9 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -324,12 +324,7 @@ static void __fput(struct file *file)
}
fops_put(file->f_op);
put_pid(file->f_owner.pid);
- if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
- i_readcount_dec(inode);
- if (mode & FMODE_WRITER) {
- put_write_access(inode);
- __mnt_drop_write(mnt);
- }
+ put_file_access(file);
dput(dentry);
if (unlikely(mode & FMODE_NEED_UNMOUNT))
dissolve_on_fput(mnt);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 08a1993ab7fd..6fba5a52127b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -121,6 +121,7 @@ static bool inode_io_list_move_locked(struct inode *inode,
{
assert_spin_locked(&wb->list_lock);
assert_spin_locked(&inode->i_lock);
+ WARN_ON_ONCE(inode->i_state & I_FREEING);
list_move(&inode->i_io_list, head);
@@ -280,6 +281,7 @@ static void inode_cgwb_move_to_attached(struct inode *inode,
{
assert_spin_locked(&wb->list_lock);
assert_spin_locked(&inode->i_lock);
+ WARN_ON_ONCE(inode->i_state & I_FREEING);
inode->i_state &= ~I_SYNC_QUEUED;
if (wb != &wb->bdi->wb)
@@ -1129,6 +1131,7 @@ static void inode_cgwb_move_to_attached(struct inode *inode,
{
assert_spin_locked(&wb->list_lock);
assert_spin_locked(&inode->i_lock);
+ WARN_ON_ONCE(inode->i_state & I_FREEING);
inode->i_state &= ~I_SYNC_QUEUED;
list_del_init(&inode->i_io_list);
@@ -1294,6 +1297,17 @@ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
{
assert_spin_locked(&inode->i_lock);
+ inode->i_state &= ~I_SYNC_QUEUED;
+ /*
+ * When the inode is being freed just don't bother with dirty list
+ * tracking. Flush worker will ignore this inode anyway and it will
+ * trigger assertions in inode_io_list_move_locked().
+ */
+ if (inode->i_state & I_FREEING) {
+ list_del_init(&inode->i_io_list);
+ wb_io_lists_depopulated(wb);
+ return;
+ }
if (!list_empty(&wb->b_dirty)) {
struct inode *tail;
@@ -1302,7 +1316,6 @@ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
inode->dirtied_when = jiffies;
}
inode_io_list_move_locked(inode, wb, &wb->b_dirty);
- inode->i_state &= ~I_SYNC_QUEUED;
}
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
@@ -1345,8 +1358,6 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
return ret;
}
-#define EXPIRE_DIRTY_ATIME 0x0001
-
/*
* Move expired (dirtied before dirtied_before) dirty inodes from
* @delaying_queue to @dispatch_queue.
@@ -1712,15 +1723,28 @@ static int writeback_single_inode(struct inode *inode,
wb = inode_to_wb_and_lock_list(inode);
spin_lock(&inode->i_lock);
/*
- * If the inode is now fully clean, then it can be safely removed from
- * its writeback list (if any). Otherwise the flusher threads are
- * responsible for the writeback lists.
+ * If the inode is freeing, its i_io_list shoudn't be updated
+ * as it can be finally deleted at this moment.
*/
- if (!(inode->i_state & I_DIRTY_ALL))
- inode_cgwb_move_to_attached(inode, wb);
- else if (!(inode->i_state & I_SYNC_QUEUED) &&
- (inode->i_state & I_DIRTY))
- redirty_tail_locked(inode, wb);
+ if (!(inode->i_state & I_FREEING)) {
+ /*
+ * If the inode is now fully clean, then it can be safely
+ * removed from its writeback list (if any). Otherwise the
+ * flusher threads are responsible for the writeback lists.
+ */
+ if (!(inode->i_state & I_DIRTY_ALL))
+ inode_cgwb_move_to_attached(inode, wb);
+ else if (!(inode->i_state & I_SYNC_QUEUED)) {
+ if ((inode->i_state & I_DIRTY))
+ redirty_tail_locked(inode, wb);
+ else if (inode->i_state & I_DIRTY_TIME) {
+ inode->dirtied_when = jiffies;
+ inode_io_list_move_locked(inode,
+ wb,
+ &wb->b_dirty_time);
+ }
+ }
+ }
spin_unlock(&wb->list_lock);
inode_sync_complete(inode);
@@ -2370,6 +2394,20 @@ void __mark_inode_dirty(struct inode *inode, int flags)
if (flags & I_DIRTY_INODE) {
/*
+ * Inode timestamp update will piggback on this dirtying.
+ * We tell ->dirty_inode callback that timestamps need to
+ * be updated by setting I_DIRTY_TIME in flags.
+ */
+ if (inode->i_state & I_DIRTY_TIME) {
+ spin_lock(&inode->i_lock);
+ if (inode->i_state & I_DIRTY_TIME) {
+ inode->i_state &= ~I_DIRTY_TIME;
+ flags |= I_DIRTY_TIME;
+ }
+ spin_unlock(&inode->i_lock);
+ }
+
+ /*
* Notify the filesystem about the inode being dirtied, so that
* (if needed) it can update on-disk fields and journal the
* inode. This is only needed when the inode itself is being
@@ -2378,7 +2416,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
*/
trace_writeback_dirty_inode_start(inode, flags);
if (sb->s_op->dirty_inode)
- sb->s_op->dirty_inode(inode, flags & I_DIRTY_INODE);
+ sb->s_op->dirty_inode(inode,
+ flags & (I_DIRTY_INODE | I_DIRTY_TIME));
trace_writeback_dirty_inode(inode, flags);
/* I_DIRTY_INODE supersedes I_DIRTY_TIME. */
@@ -2399,21 +2438,15 @@ void __mark_inode_dirty(struct inode *inode, int flags)
*/
smp_mb();
- if (((inode->i_state & flags) == flags) ||
- (dirtytime && (inode->i_state & I_DIRTY_INODE)))
+ if ((inode->i_state & flags) == flags)
return;
spin_lock(&inode->i_lock);
- if (dirtytime && (inode->i_state & I_DIRTY_INODE))
- goto out_unlock_inode;
if ((inode->i_state & flags) != flags) {
const int was_dirty = inode->i_state & I_DIRTY;
inode_attach_wb(inode, NULL);
- /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */
- if (flags & I_DIRTY_INODE)
- inode->i_state &= ~I_DIRTY_TIME;
inode->i_state |= flags;
/*
@@ -2486,7 +2519,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
out_unlock:
if (wb)
spin_unlock(&wb->list_lock);
-out_unlock_inode:
spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(__mark_inode_dirty);
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index ed40ce5742fd..edb3712dcfa5 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -138,15 +138,16 @@ EXPORT_SYMBOL(__fs_parse);
* @fc: The filesystem context to log errors through.
* @param: The parameter.
* @want_bdev: T if want a blockdev
+ * @flags: Pathwalk flags passed to filename_lookup()
* @_path: The result of the lookup
*/
int fs_lookup_param(struct fs_context *fc,
struct fs_parameter *param,
bool want_bdev,
+ unsigned int flags,
struct path *_path)
{
struct filename *f;
- unsigned int flags = 0;
bool put_f;
int ret;
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 451d8a077e12..bce2492186d0 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -605,6 +605,14 @@ again:
set_bit(FSCACHE_COOKIE_DO_PREP_TO_WRITE, &cookie->flags);
queue = true;
}
+ /*
+ * We could race with cookie_lru which may set LRU_DISCARD bit
+ * but has yet to run the cookie state machine. If this happens
+ * and another thread tries to use the cookie, clear LRU_DISCARD
+ * so we don't end up withdrawing the cookie while in use.
+ */
+ if (test_and_clear_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags))
+ fscache_see_cookie(cookie, fscache_cookie_see_lru_discard_clear);
break;
case FSCACHE_COOKIE_STATE_FAILED:
diff --git a/fs/fscache/io.c b/fs/fscache/io.c
index 3af3b08a9bb3..0d2b8dec8f82 100644
--- a/fs/fscache/io.c
+++ b/fs/fscache/io.c
@@ -286,7 +286,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,
* taken into account.
*/
- iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len);
+ iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, start, len);
fscache_write(cres, start, &iter, fscache_wreq_done, wreq);
return;
diff --git a/fs/fscache/volume.c b/fs/fscache/volume.c
index a058e0136bfe..ab8ceddf9efa 100644
--- a/fs/fscache/volume.c
+++ b/fs/fscache/volume.c
@@ -203,7 +203,11 @@ static struct fscache_volume *fscache_alloc_volume(const char *volume_key,
struct fscache_volume *volume;
struct fscache_cache *cache;
size_t klen, hlen;
- char *key;
+ u8 *key;
+
+ klen = strlen(volume_key);
+ if (klen > NAME_MAX)
+ return NULL;
if (!coherency_data)
coherency_len = 0;
@@ -229,7 +233,6 @@ static struct fscache_volume *fscache_alloc_volume(const char *volume_key,
/* Stick the length on the front of the key and pad it out to make
* hashing easier.
*/
- klen = strlen(volume_key);
hlen = round_up(1 + klen + 1, sizeof(__le32));
key = kzalloc(hlen, GFP_KERNEL);
if (!key)
diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c
index 337cb29a8dd5..a4850aee2639 100644
--- a/fs/fuse/acl.c
+++ b/fs/fuse/acl.c
@@ -53,9 +53,10 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu)
return acl;
}
-int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
+ struct inode *inode = d_inode(dentry);
struct fuse_conn *fc = get_fuse_conn(inode);
const char *name;
int ret;
@@ -98,7 +99,7 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
return ret;
}
- if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) &&
+ if (!vfsgid_in_group_p(i_gid_into_vfsgid(&init_user_ns, inode)) &&
!capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID;
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index c7d882a9fe33..a06fbb1a8a5b 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -545,7 +545,6 @@ static int cuse_channel_release(struct inode *inode, struct file *file)
{
struct fuse_dev *fud = file->private_data;
struct cuse_conn *cc = fc_to_cc(fud->fc);
- int rc;
/* remove from the conntbl, no more access from this point on */
mutex_lock(&cuse_lock);
@@ -560,9 +559,7 @@ static int cuse_channel_release(struct inode *inode, struct file *file)
cdev_del(cc->cdev);
}
- rc = fuse_dev_release(inode, file); /* puts the base reference */
-
- return rc;
+ return fuse_dev_release(inode, file);
}
static struct file_operations cuse_channel_fops; /* initialized during init */
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 51897427a534..e8b60ce72c9a 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -764,11 +764,11 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
return ncpy;
}
-static int fuse_check_page(struct page *page)
+static int fuse_check_folio(struct folio *folio)
{
- if (page_mapcount(page) ||
- page->mapping != NULL ||
- (page->flags & PAGE_FLAGS_CHECK_AT_PREP &
+ if (folio_mapped(folio) ||
+ folio->mapping != NULL ||
+ (folio->flags & PAGE_FLAGS_CHECK_AT_PREP &
~(1 << PG_locked |
1 << PG_referenced |
1 << PG_uptodate |
@@ -776,8 +776,9 @@ static int fuse_check_page(struct page *page)
1 << PG_active |
1 << PG_workingset |
1 << PG_reclaim |
- 1 << PG_waiters))) {
- dump_page(page, "fuse: trying to steal weird page");
+ 1 << PG_waiters |
+ LRU_GEN_MASK | LRU_REFS_MASK))) {
+ dump_page(&folio->page, "fuse: trying to steal weird page");
return 1;
}
return 0;
@@ -786,11 +787,11 @@ static int fuse_check_page(struct page *page)
static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
{
int err;
- struct page *oldpage = *pagep;
- struct page *newpage;
+ struct folio *oldfolio = page_folio(*pagep);
+ struct folio *newfolio;
struct pipe_buffer *buf = cs->pipebufs;
- get_page(oldpage);
+ folio_get(oldfolio);
err = unlock_request(cs->req);
if (err)
goto out_put_old;
@@ -813,35 +814,36 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
if (!pipe_buf_try_steal(cs->pipe, buf))
goto out_fallback;
- newpage = buf->page;
+ newfolio = page_folio(buf->page);
- if (!PageUptodate(newpage))
- SetPageUptodate(newpage);
+ if (!folio_test_uptodate(newfolio))
+ folio_mark_uptodate(newfolio);
- ClearPageMappedToDisk(newpage);
+ folio_clear_mappedtodisk(newfolio);
- if (fuse_check_page(newpage) != 0)
+ if (fuse_check_folio(newfolio) != 0)
goto out_fallback_unlock;
/*
* This is a new and locked page, it shouldn't be mapped or
* have any special flags on it
*/
- if (WARN_ON(page_mapped(oldpage)))
+ if (WARN_ON(folio_mapped(oldfolio)))
goto out_fallback_unlock;
- if (WARN_ON(page_has_private(oldpage)))
+ if (WARN_ON(folio_has_private(oldfolio)))
goto out_fallback_unlock;
- if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage)))
+ if (WARN_ON(folio_test_dirty(oldfolio) ||
+ folio_test_writeback(oldfolio)))
goto out_fallback_unlock;
- if (WARN_ON(PageMlocked(oldpage)))
+ if (WARN_ON(folio_test_mlocked(oldfolio)))
goto out_fallback_unlock;
- replace_page_cache_page(oldpage, newpage);
+ replace_page_cache_folio(oldfolio, newfolio);
- get_page(newpage);
+ folio_get(newfolio);
if (!(buf->flags & PIPE_BUF_FLAG_LRU))
- lru_cache_add(newpage);
+ folio_add_lru(newfolio);
/*
* Release while we have extra ref on stolen page. Otherwise
@@ -854,28 +856,28 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
if (test_bit(FR_ABORTED, &cs->req->flags))
err = -ENOENT;
else
- *pagep = newpage;
+ *pagep = &newfolio->page;
spin_unlock(&cs->req->waitq.lock);
if (err) {
- unlock_page(newpage);
- put_page(newpage);
+ folio_unlock(newfolio);
+ folio_put(newfolio);
goto out_put_old;
}
- unlock_page(oldpage);
+ folio_unlock(oldfolio);
/* Drop ref for ap->pages[] array */
- put_page(oldpage);
+ folio_put(oldfolio);
cs->len = 0;
err = 0;
out_put_old:
/* Drop ref obtained in this function */
- put_page(oldpage);
+ folio_put(oldfolio);
return err;
out_fallback_unlock:
- unlock_page(newpage);
+ folio_unlock(newfolio);
out_fallback:
cs->pg = buf->page;
cs->offset = buf->offset;
@@ -1497,7 +1499,7 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
buf[outarg.namelen] = 0;
down_read(&fc->killsb);
- err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name);
+ err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name, outarg.flags);
up_read(&fc->killsb);
kfree(buf);
return err;
@@ -1545,7 +1547,7 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
buf[outarg.namelen] = 0;
down_read(&fc->killsb);
- err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name);
+ err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name, 0);
up_read(&fc->killsb);
kfree(buf);
return err;
@@ -2266,8 +2268,7 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
* Check against file->f_op because CUSE
* uses the same ioctl handler.
*/
- if (old->f_op == file->f_op &&
- old->f_cred->user_ns == file->f_cred->user_ns)
+ if (old->f_op == file->f_op)
fud = fuse_get_dev(old);
if (fud) {
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index b585b04e815e..cd1a071b625a 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -214,7 +214,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
if (inode && fuse_is_bad(inode))
goto invalid;
else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) ||
- (flags & (LOOKUP_EXCL | LOOKUP_REVAL))) {
+ (flags & (LOOKUP_EXCL | LOOKUP_REVAL | LOOKUP_RENAME_TARGET))) {
struct fuse_entry_out outarg;
FUSE_ARGS(args);
struct fuse_forget_link *forget;
@@ -529,7 +529,7 @@ out_err:
*/
static int fuse_create_open(struct inode *dir, struct dentry *entry,
struct file *file, unsigned int flags,
- umode_t mode)
+ umode_t mode, u32 opcode)
{
int err;
struct inode *inode;
@@ -573,7 +573,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
}
- args.opcode = FUSE_CREATE;
+ args.opcode = opcode;
args.nodeid = get_node_id(dir);
args.in_numargs = 2;
args.in_args[0].size = sizeof(inarg);
@@ -676,7 +676,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
if (fc->no_create)
goto mknod;
- err = fuse_create_open(dir, entry, file, flags, mode);
+ err = fuse_create_open(dir, entry, file, flags, mode, FUSE_CREATE);
if (err == -ENOSYS) {
fc->no_create = 1;
goto mknod;
@@ -802,6 +802,23 @@ static int fuse_create(struct user_namespace *mnt_userns, struct inode *dir,
return fuse_mknod(&init_user_ns, dir, entry, mode, 0);
}
+static int fuse_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+ struct file *file, umode_t mode)
+{
+ struct fuse_conn *fc = get_fuse_conn(dir);
+ int err;
+
+ if (fc->no_tmpfile)
+ return -EOPNOTSUPP;
+
+ err = fuse_create_open(dir, file->f_path.dentry, file, file->f_flags, mode, FUSE_TMPFILE);
+ if (err == -ENOSYS) {
+ fc->no_tmpfile = 1;
+ err = -EOPNOTSUPP;
+ }
+ return err;
+}
+
static int fuse_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *entry, umode_t mode)
{
@@ -1153,7 +1170,7 @@ int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask)
}
int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
- u64 child_nodeid, struct qstr *name)
+ u64 child_nodeid, struct qstr *name, u32 flags)
{
int err = -ENOTDIR;
struct inode *parent;
@@ -1180,7 +1197,9 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
goto unlock;
fuse_dir_changed(parent);
- fuse_invalidate_entry(entry);
+ if (!(flags & FUSE_EXPIRE_ONLY))
+ d_invalidate(entry);
+ fuse_invalidate_entry_cache(entry);
if (child_nodeid != 0 && d_really_is_positive(entry)) {
inode_lock(d_inode(entry));
@@ -1218,6 +1237,18 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
return err;
}
+static inline bool fuse_permissible_uidgid(struct fuse_conn *fc)
+{
+ const struct cred *cred = current_cred();
+
+ return (uid_eq(cred->euid, fc->user_id) &&
+ uid_eq(cred->suid, fc->user_id) &&
+ uid_eq(cred->uid, fc->user_id) &&
+ gid_eq(cred->egid, fc->group_id) &&
+ gid_eq(cred->sgid, fc->group_id) &&
+ gid_eq(cred->gid, fc->group_id));
+}
+
/*
* Calling into a user-controlled filesystem gives the filesystem
* daemon ptrace-like capabilities over the current process. This
@@ -1231,26 +1262,19 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
* for which the owner of the mount has ptrace privilege. This
* excludes processes started by other users, suid or sgid processes.
*/
-int fuse_allow_current_process(struct fuse_conn *fc)
+bool fuse_allow_current_process(struct fuse_conn *fc)
{
- const struct cred *cred;
-
- if (allow_sys_admin_access && capable(CAP_SYS_ADMIN))
- return 1;
+ bool allow;
if (fc->allow_other)
- return current_in_userns(fc->user_ns);
+ allow = current_in_userns(fc->user_ns);
+ else
+ allow = fuse_permissible_uidgid(fc);
- cred = current_cred();
- if (uid_eq(cred->euid, fc->user_id) &&
- uid_eq(cred->suid, fc->user_id) &&
- uid_eq(cred->uid, fc->user_id) &&
- gid_eq(cred->egid, fc->group_id) &&
- gid_eq(cred->sgid, fc->group_id) &&
- gid_eq(cred->gid, fc->group_id))
- return 1;
+ if (!allow && allow_sys_admin_access && capable(CAP_SYS_ADMIN))
+ allow = true;
- return 0;
+ return allow;
}
static int fuse_access(struct inode *inode, int mask)
@@ -1913,11 +1937,12 @@ static const struct inode_operations fuse_dir_inode_operations = {
.setattr = fuse_setattr,
.create = fuse_create,
.atomic_open = fuse_atomic_open,
+ .tmpfile = fuse_tmpfile,
.mknod = fuse_mknod,
.permission = fuse_permission,
.getattr = fuse_getattr,
.listxattr = fuse_listxattr,
- .get_acl = fuse_get_acl,
+ .get_inode_acl = fuse_get_acl,
.set_acl = fuse_set_acl,
.fileattr_get = fuse_fileattr_get,
.fileattr_set = fuse_fileattr_set,
@@ -1939,7 +1964,7 @@ static const struct inode_operations fuse_common_inode_operations = {
.permission = fuse_permission,
.getattr = fuse_getattr,
.listxattr = fuse_listxattr,
- .get_acl = fuse_get_acl,
+ .get_inode_acl = fuse_get_acl,
.set_acl = fuse_set_acl,
.fileattr_get = fuse_fileattr_get,
.fileattr_set = fuse_fileattr_set,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 1a3afd469e3a..875314ee6f59 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1313,7 +1313,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
return err;
if (fc->handle_killpriv_v2 &&
- should_remove_suid(file_dentry(file))) {
+ setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) {
goto writethrough;
}
@@ -1563,14 +1563,47 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
return res;
}
+static bool fuse_direct_write_extending_i_size(struct kiocb *iocb,
+ struct iov_iter *iter)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode);
+}
+
static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
+ struct file *file = iocb->ki_filp;
+ struct fuse_file *ff = file->private_data;
struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
ssize_t res;
+ bool exclusive_lock =
+ !(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) ||
+ iocb->ki_flags & IOCB_APPEND ||
+ fuse_direct_write_extending_i_size(iocb, from);
+
+ /*
+ * Take exclusive lock if
+ * - Parallel direct writes are disabled - a user space decision
+ * - Parallel direct writes are enabled and i_size is being extended.
+ * This might not be needed at all, but needs further investigation.
+ */
+ if (exclusive_lock)
+ inode_lock(inode);
+ else {
+ inode_lock_shared(inode);
+
+ /* A race with truncate might have come up as the decision for
+ * the lock type was done without holding the lock, check again.
+ */
+ if (fuse_direct_write_extending_i_size(iocb, from)) {
+ inode_unlock_shared(inode);
+ inode_lock(inode);
+ exclusive_lock = true;
+ }
+ }
- /* Don't allow parallel writes to the same file */
- inode_lock(inode);
res = generic_write_checks(iocb, from);
if (res > 0) {
if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
@@ -1581,7 +1614,10 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
fuse_write_update_attr(inode, iocb->ki_pos, res);
}
}
- inode_unlock(inode);
+ if (exclusive_lock)
+ inode_unlock(inode);
+ else
+ inode_unlock_shared(inode);
return res;
}
@@ -2931,6 +2967,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (iov_iter_rw(iter) == WRITE) {
fuse_write_update_attr(inode, pos, ret);
+ /* For extending writes we already hold exclusive lock */
if (ret < 0 && offset + count > i_size)
fuse_do_truncate(file);
}
@@ -2963,11 +3000,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
.mode = mode
};
int err;
- bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
- (mode & (FALLOC_FL_PUNCH_HOLE |
- FALLOC_FL_ZERO_RANGE));
-
- bool block_faults = FUSE_IS_DAX(inode) && lock_inode;
+ bool block_faults = FUSE_IS_DAX(inode) &&
+ (!(mode & FALLOC_FL_KEEP_SIZE) ||
+ (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)));
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
FALLOC_FL_ZERO_RANGE))
@@ -2976,22 +3011,20 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
if (fm->fc->no_fallocate)
return -EOPNOTSUPP;
- if (lock_inode) {
- inode_lock(inode);
- if (block_faults) {
- filemap_invalidate_lock(inode->i_mapping);
- err = fuse_dax_break_layouts(inode, 0, 0);
- if (err)
- goto out;
- }
+ inode_lock(inode);
+ if (block_faults) {
+ filemap_invalidate_lock(inode->i_mapping);
+ err = fuse_dax_break_layouts(inode, 0, 0);
+ if (err)
+ goto out;
+ }
- if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) {
- loff_t endbyte = offset + length - 1;
+ if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) {
+ loff_t endbyte = offset + length - 1;
- err = fuse_writeback_range(inode, offset, endbyte);
- if (err)
- goto out;
- }
+ err = fuse_writeback_range(inode, offset, endbyte);
+ if (err)
+ goto out;
}
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
@@ -3001,6 +3034,10 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
goto out;
}
+ err = file_modified(file);
+ if (err)
+ goto out;
+
if (!(mode & FALLOC_FL_KEEP_SIZE))
set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
@@ -3035,8 +3072,7 @@ out:
if (block_faults)
filemap_invalidate_unlock(inode->i_mapping);
- if (lock_inode)
- inode_unlock(inode);
+ inode_unlock(inode);
fuse_flush_time_update(inode);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 488b460e046f..c673faefdcb9 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -784,6 +784,9 @@ struct fuse_conn {
/* Does the filesystem support per inode DAX? */
unsigned int inode_dax:1;
+ /* Is tmpfile not implemented by fs? */
+ unsigned int no_tmpfile:1;
+
/** The number of requests waiting for completion */
atomic_t num_waiting;
@@ -1176,7 +1179,7 @@ bool fuse_invalid_attr(struct fuse_attr *attr);
/**
* Is current process allowed to perform filesystem operation?
*/
-int fuse_allow_current_process(struct fuse_conn *fc);
+bool fuse_allow_current_process(struct fuse_conn *fc);
u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id);
@@ -1217,7 +1220,7 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
* then the dentry is unhashed (d_delete()).
*/
int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
- u64 child_nodeid, struct qstr *name);
+ u64 child_nodeid, struct qstr *name, u32 flags);
int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
bool isdir);
@@ -1266,7 +1269,7 @@ extern const struct xattr_handler *fuse_no_acl_xattr_handlers[];
struct posix_acl;
struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu);
-int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
/* readdir.c */
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index 61d8afcb10a3..fcce94ace2c2 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -255,7 +255,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
ap.args.in_pages = true;
err = -EFAULT;
- iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size);
+ iov_iter_init(&ii, ITER_SOURCE, in_iov, in_iovs, in_size);
for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
if (c != PAGE_SIZE && iov_iter_count(&ii))
@@ -324,7 +324,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
goto out;
err = -EFAULT;
- iov_iter_init(&ii, READ, out_iov, out_iovs, transferred);
+ iov_iter_init(&ii, ITER_DEST, out_iov, out_iovs, transferred);
for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
if (c != PAGE_SIZE && iov_iter_count(&ii))
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index b4e565711045..dc603479b30e 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -77,8 +77,10 @@ static void fuse_add_dirent_to_cache(struct file *file,
goto unlock;
addr = kmap_local_page(page);
- if (!offset)
+ if (!offset) {
clear_page(addr);
+ SetPageUptodate(page);
+ }
memcpy(addr + offset, dirent, reclen);
kunmap_local(addr);
fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen;
@@ -516,6 +518,12 @@ retry_locked:
page = find_get_page_flags(file->f_mapping, index,
FGP_ACCESSED | FGP_LOCK);
+ /* Page gone missing, then re-added to cache, but not initialized? */
+ if (page && !PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ page = NULL;
+ }
spin_lock(&fi->rdc.lock);
if (!page) {
/*
@@ -539,9 +547,9 @@ retry_locked:
* Contents of the page are now protected against changing by holding
* the page lock.
*/
- addr = kmap(page);
+ addr = kmap_local_page(page);
res = fuse_parse_cache(ff, addr, size, ctx);
- kunmap(page);
+ kunmap_local(addr);
unlock_page(page);
put_page(page);
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 734d1f05d823..3dcde4912413 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -109,9 +109,10 @@ out:
return error;
}
-int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int gfs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
+ struct inode *inode = d_inode(dentry);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder gh;
bool need_unlock = false;
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index cd180ca7c959..b8de8c148f5c 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -13,7 +13,7 @@
extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu);
extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-extern int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+extern int gfs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 05bee80ac7de..e782b4f1d104 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -427,8 +427,6 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
return error;
kaddr = kmap_atomic(page);
- if (dsize > gfs2_max_stuffed_size(ip))
- dsize = gfs2_max_stuffed_size(ip);
memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
kunmap_atomic(kaddr);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 3bdb2c668a71..e7537fd305dd 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -61,9 +61,6 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
void *kaddr = kmap(page);
u64 dsize = i_size_read(inode);
- if (dsize > gfs2_max_stuffed_size(ip))
- dsize = gfs2_max_stuffed_size(ip);
-
memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
kunmap(page);
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 756d05779200..cf40895233f5 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -66,7 +66,7 @@ struct get_name_filldir {
char *name;
};
-static int get_name_filldir(struct dir_context *ctx, const char *name,
+static bool get_name_filldir(struct dir_context *ctx, const char *name,
int length, loff_t offset, u64 inum,
unsigned int type)
{
@@ -74,12 +74,12 @@ static int get_name_filldir(struct dir_context *ctx, const char *name,
container_of(ctx, struct get_name_filldir, ctx);
if (inum != gnfd->inum.no_addr)
- return 0;
+ return true;
memcpy(gnfd->name, name, length);
gnfd->name[length] = 0;
- return 1;
+ return false;
}
static int gfs2_get_name(struct dentry *parent, char *name,
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 892006fbbb09..eea5be4fbf0e 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1443,6 +1443,21 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl);
}
+static void __flock_holder_uninit(struct file *file, struct gfs2_holder *fl_gh)
+{
+ struct gfs2_glock *gl = gfs2_glock_hold(fl_gh->gh_gl);
+
+ /*
+ * Make sure gfs2_glock_put() won't sleep under the file->f_lock
+ * spinlock.
+ */
+
+ spin_lock(&file->f_lock);
+ gfs2_holder_uninit(fl_gh);
+ spin_unlock(&file->f_lock);
+ gfs2_glock_put(gl);
+}
+
static int do_flock(struct file *file, int cmd, struct file_lock *fl)
{
struct gfs2_file *fp = file->private_data;
@@ -1455,7 +1470,9 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
int sleeptime;
state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
- flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY_1CB) | GL_EXACT;
+ flags = GL_EXACT | GL_NOPID;
+ if (!IS_SETLKW(cmd))
+ flags |= LM_FLAG_TRY_1CB;
mutex_lock(&fp->f_fl_mutex);
@@ -1474,18 +1491,21 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
&gfs2_flock_glops, CREATE, &gl);
if (error)
goto out;
+ spin_lock(&file->f_lock);
gfs2_holder_init(gl, state, flags, fl_gh);
+ spin_unlock(&file->f_lock);
gfs2_glock_put(gl);
}
for (sleeptime = 1; sleeptime <= 4; sleeptime <<= 1) {
error = gfs2_glock_nq(fl_gh);
if (error != GLR_TRYFAILED)
break;
- fl_gh->gh_flags = LM_FLAG_TRY | GL_EXACT;
+ fl_gh->gh_flags &= ~LM_FLAG_TRY_1CB;
+ fl_gh->gh_flags |= LM_FLAG_TRY;
msleep(sleeptime);
}
if (error) {
- gfs2_holder_uninit(fl_gh);
+ __flock_holder_uninit(file, fl_gh);
if (error == GLR_TRYFAILED)
error = -EAGAIN;
} else {
@@ -1507,7 +1527,7 @@ static void do_unflock(struct file *file, struct file_lock *fl)
locks_lock_file_wait(file, fl);
if (gfs2_holder_initialized(fl_gh)) {
gfs2_glock_dq(fl_gh);
- gfs2_holder_uninit(fl_gh);
+ __flock_holder_uninit(file, fl_gh);
}
mutex_unlock(&fp->f_fl_mutex);
}
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 41b6c89e4bf7..524f3c96b9a4 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -33,6 +33,9 @@
#include <linux/list_sort.h>
#include <linux/lockref.h>
#include <linux/rhashtable.h>
+#include <linux/pid_namespace.h>
+#include <linux/fdtable.h>
+#include <linux/file.h>
#include "gfs2.h"
#include "incore.h"
@@ -59,6 +62,8 @@ typedef void (*glock_examiner) (struct gfs2_glock * gl);
static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
static void __gfs2_glock_dq(struct gfs2_holder *gh);
+static void handle_callback(struct gfs2_glock *gl, unsigned int state,
+ unsigned long delay, bool remote);
static struct dentry *gfs2_root;
static struct workqueue_struct *glock_workqueue;
@@ -181,10 +186,11 @@ void gfs2_glock_free(struct gfs2_glock *gl)
*
*/
-void gfs2_glock_hold(struct gfs2_glock *gl)
+struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl)
{
GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref));
lockref_get(&gl->gl_lockref);
+ return gl;
}
/**
@@ -200,12 +206,6 @@ static int demote_ok(const struct gfs2_glock *gl)
if (gl->gl_state == LM_ST_UNLOCKED)
return 0;
- /*
- * Note that demote_ok is used for the lru process of disposing of
- * glocks. For this purpose, we don't care if the glock's holders
- * have the HIF_MAY_DEMOTE flag set or not. If someone is using
- * them, don't demote.
- */
if (!list_empty(&gl->gl_holders))
return 0;
if (glops->go_demote_ok)
@@ -388,7 +388,7 @@ static void do_error(struct gfs2_glock *gl, const int ret)
struct gfs2_holder *gh, *tmp;
list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
- if (!test_bit(HIF_WAIT, &gh->gh_iflags))
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
continue;
if (ret & LM_OUT_ERROR)
gh->gh_error = -EIO;
@@ -403,45 +403,6 @@ static void do_error(struct gfs2_glock *gl, const int ret)
}
/**
- * demote_incompat_holders - demote incompatible demoteable holders
- * @gl: the glock we want to promote
- * @current_gh: the newly promoted holder
- *
- * We're passing the newly promoted holder in @current_gh, but actually, any of
- * the strong holders would do.
- */
-static void demote_incompat_holders(struct gfs2_glock *gl,
- struct gfs2_holder *current_gh)
-{
- struct gfs2_holder *gh, *tmp;
-
- /*
- * Demote incompatible holders before we make ourselves eligible.
- * (This holder may or may not allow auto-demoting, but we don't want
- * to demote the new holder before it's even granted.)
- */
- list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
- /*
- * Since holders are at the front of the list, we stop when we
- * find the first non-holder.
- */
- if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
- return;
- if (gh == current_gh)
- continue;
- if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags) &&
- !may_grant(gl, current_gh, gh)) {
- /*
- * We should not recurse into do_promote because
- * __gfs2_glock_dq only calls handle_callback,
- * gfs2_glock_add_to_lru and __gfs2_glock_queue_work.
- */
- __gfs2_glock_dq(gh);
- }
- }
-}
-
-/**
* find_first_holder - find the first "holder" gh
* @gl: the glock
*/
@@ -459,26 +420,6 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
return NULL;
}
-/**
- * find_first_strong_holder - find the first non-demoteable holder
- * @gl: the glock
- *
- * Find the first holder that doesn't have the HIF_MAY_DEMOTE flag set.
- */
-static inline struct gfs2_holder *
-find_first_strong_holder(struct gfs2_glock *gl)
-{
- struct gfs2_holder *gh;
-
- list_for_each_entry(gh, &gl->gl_holders, gh_list) {
- if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
- return NULL;
- if (!test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
- return gh;
- }
- return NULL;
-}
-
/*
* gfs2_instantiate - Call the glops instantiate function
* @gh: The glock holder
@@ -535,9 +476,8 @@ done:
static int do_promote(struct gfs2_glock *gl)
{
struct gfs2_holder *gh, *current_gh;
- bool incompat_holders_demoted = false;
- current_gh = find_first_strong_holder(gl);
+ current_gh = find_first_holder(gl);
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
if (test_bit(HIF_HOLDER, &gh->gh_iflags))
continue;
@@ -556,11 +496,8 @@ static int do_promote(struct gfs2_glock *gl)
set_bit(HIF_HOLDER, &gh->gh_iflags);
trace_gfs2_promote(gh);
gfs2_holder_wake(gh);
- if (!incompat_holders_demoted) {
+ if (!current_gh)
current_gh = gh;
- demote_incompat_holders(gl, current_gh);
- incompat_holders_demoted = true;
- }
}
return 0;
}
@@ -730,7 +667,8 @@ static bool is_system_glock(struct gfs2_glock *gl)
*
*/
-static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
+static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh,
+ unsigned int target)
__releases(&gl->gl_lockref.lock)
__acquires(&gl->gl_lockref.lock)
{
@@ -741,7 +679,8 @@ __acquires(&gl->gl_lockref.lock)
if (target != LM_ST_UNLOCKED && glock_blocked_by_withdraw(gl) &&
gh && !(gh->gh_flags & LM_FLAG_NOEXP))
- return;
+ goto skip_inval;
+
lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
LM_FLAG_PRIORITY);
GLOCK_BUG_ON(gl, gl->gl_state == target);
@@ -826,6 +765,20 @@ skip_inval:
(target != LM_ST_UNLOCKED ||
test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags))) {
if (!is_system_glock(gl)) {
+ handle_callback(gl, LM_ST_UNLOCKED, 0, false); /* sets demote */
+ /*
+ * Ordinarily, we would call dlm and its callback would call
+ * finish_xmote, which would call state_change() to the new state.
+ * Since we withdrew, we won't call dlm, so call state_change
+ * manually, but to the UNLOCKED state we desire.
+ */
+ state_change(gl, LM_ST_UNLOCKED);
+ /*
+ * We skip telling dlm to do the locking, so we won't get a
+ * reply that would otherwise clear GLF_LOCK. So we clear it here.
+ */
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD);
goto out;
} else {
@@ -906,6 +859,48 @@ out_unlock:
return;
}
+/**
+ * glock_set_object - set the gl_object field of a glock
+ * @gl: the glock
+ * @object: the object
+ */
+void glock_set_object(struct gfs2_glock *gl, void *object)
+{
+ void *prev_object;
+
+ spin_lock(&gl->gl_lockref.lock);
+ prev_object = gl->gl_object;
+ gl->gl_object = object;
+ spin_unlock(&gl->gl_lockref.lock);
+ if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == NULL)) {
+ pr_warn("glock=%u/%llx\n",
+ gl->gl_name.ln_type,
+ (unsigned long long)gl->gl_name.ln_number);
+ gfs2_dump_glock(NULL, gl, true);
+ }
+}
+
+/**
+ * glock_clear_object - clear the gl_object field of a glock
+ * @gl: the glock
+ */
+void glock_clear_object(struct gfs2_glock *gl, void *object)
+{
+ void *prev_object;
+
+ spin_lock(&gl->gl_lockref.lock);
+ prev_object = gl->gl_object;
+ gl->gl_object = NULL;
+ spin_unlock(&gl->gl_lockref.lock);
+ if (gfs2_assert_warn(gl->gl_name.ln_sbd,
+ prev_object == object || prev_object == NULL)) {
+ pr_warn("glock=%u/%llx\n",
+ gl->gl_name.ln_type,
+ (unsigned long long)gl->gl_name.ln_number);
+ gfs2_dump_glock(NULL, gl, true);
+ }
+}
+
void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation)
{
struct gfs2_inode_lvb *ri = (void *)gl->gl_lksb.sb_lvbptr;
@@ -959,8 +954,6 @@ static bool gfs2_try_evict(struct gfs2_glock *gl)
ip = NULL;
spin_unlock(&gl->gl_lockref.lock);
if (ip) {
- struct gfs2_glock *inode_gl = NULL;
-
gl->gl_no_formal_ino = ip->i_no_formal_ino;
set_bit(GIF_DEFERRED_DELETE, &ip->i_flags);
d_prune_aliases(&ip->i_inode);
@@ -970,14 +963,14 @@ static bool gfs2_try_evict(struct gfs2_glock *gl)
spin_lock(&gl->gl_lockref.lock);
ip = gl->gl_object;
if (ip) {
- inode_gl = ip->i_gl;
- lockref_get(&inode_gl->gl_lockref);
clear_bit(GIF_DEFERRED_DELETE, &ip->i_flags);
+ if (!igrab(&ip->i_inode))
+ ip = NULL;
}
spin_unlock(&gl->gl_lockref.lock);
- if (inode_gl) {
- gfs2_glock_poke(inode_gl);
- gfs2_glock_put(inode_gl);
+ if (ip) {
+ gfs2_glock_poke(ip->i_gl);
+ iput(&ip->i_inode);
}
evicted = !ip;
}
@@ -1023,7 +1016,11 @@ static void delete_work_func(struct work_struct *work)
inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino,
GFS2_BLKST_UNLINKED);
- if (!IS_ERR_OR_NULL(inode)) {
+ if (IS_ERR(inode)) {
+ if (PTR_ERR(inode) == -EAGAIN &&
+ (gfs2_queue_delete_work(gl, 5 * HZ)))
+ return;
+ } else {
d_prune_aliases(inode);
iput(inode);
}
@@ -1233,13 +1230,12 @@ void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags,
struct gfs2_holder *gh, unsigned long ip)
{
INIT_LIST_HEAD(&gh->gh_list);
- gh->gh_gl = gl;
+ gh->gh_gl = gfs2_glock_hold(gl);
gh->gh_ip = ip;
gh->gh_owner_pid = get_pid(task_pid(current));
gh->gh_state = state;
gh->gh_flags = flags;
gh->gh_iflags = 0;
- gfs2_glock_hold(gl);
}
/**
@@ -1436,6 +1432,15 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
va_end(args);
}
+static inline bool pid_is_meaningful(const struct gfs2_holder *gh)
+{
+ if (!(gh->gh_flags & GL_NOPID))
+ return true;
+ if (gh->gh_state == LM_ST_UNLOCKED)
+ return true;
+ return false;
+}
+
/**
* add_to_queue - Add a holder to the wait queue (but look for recursion)
* @gh: the holder structure to add
@@ -1464,7 +1469,7 @@ __acquires(&gl->gl_lockref.lock)
if (test_bit(GLF_LOCK, &gl->gl_flags)) {
struct gfs2_holder *current_gh;
- current_gh = find_first_strong_holder(gl);
+ current_gh = find_first_holder(gl);
try_futile = !may_grant(gl, current_gh, gh);
}
if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
@@ -1472,10 +1477,15 @@ __acquires(&gl->gl_lockref.lock)
}
list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
- if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
- (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK) &&
- !test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags)))
- goto trap_recursive;
+ if (likely(gh2->gh_owner_pid != gh->gh_owner_pid))
+ continue;
+ if (gh->gh_gl->gl_ops->go_type == LM_TYPE_FLOCK)
+ continue;
+ if (!pid_is_meaningful(gh2))
+ continue;
+ goto trap_recursive;
+ }
+ list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
if (try_futile &&
!(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
fail:
@@ -1580,69 +1590,28 @@ static inline bool needs_demote(struct gfs2_glock *gl)
static void __gfs2_glock_dq(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
unsigned delay = 0;
int fast_path = 0;
/*
- * This while loop is similar to function demote_incompat_holders:
- * If the glock is due to be demoted (which may be from another node
- * or even if this holder is GL_NOCACHE), the weak holders are
- * demoted as well, allowing the glock to be demoted.
+ * This holder should not be cached, so mark it for demote.
+ * Note: this should be done before the check for needs_demote
+ * below.
*/
- while (gh) {
- /*
- * If we're in the process of file system withdraw, we cannot
- * just dequeue any glocks until our journal is recovered, lest
- * we introduce file system corruption. We need two exceptions
- * to this rule: We need to allow unlocking of nondisk glocks
- * and the glock for our own journal that needs recovery.
- */
- if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
- glock_blocked_by_withdraw(gl) &&
- gh->gh_gl != sdp->sd_jinode_gl) {
- sdp->sd_glock_dqs_held++;
- spin_unlock(&gl->gl_lockref.lock);
- might_sleep();
- wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
- TASK_UNINTERRUPTIBLE);
- spin_lock(&gl->gl_lockref.lock);
- }
-
- /*
- * This holder should not be cached, so mark it for demote.
- * Note: this should be done before the check for needs_demote
- * below.
- */
- if (gh->gh_flags & GL_NOCACHE)
- handle_callback(gl, LM_ST_UNLOCKED, 0, false);
-
- list_del_init(&gh->gh_list);
- clear_bit(HIF_HOLDER, &gh->gh_iflags);
- trace_gfs2_glock_queue(gh, 0);
+ if (gh->gh_flags & GL_NOCACHE)
+ handle_callback(gl, LM_ST_UNLOCKED, 0, false);
- /*
- * If there hasn't been a demote request we are done.
- * (Let the remaining holders, if any, keep holding it.)
- */
- if (!needs_demote(gl)) {
- if (list_empty(&gl->gl_holders))
- fast_path = 1;
- break;
- }
- /*
- * If we have another strong holder (we cannot auto-demote)
- * we are done. It keeps holding it until it is done.
- */
- if (find_first_strong_holder(gl))
- break;
+ list_del_init(&gh->gh_list);
+ clear_bit(HIF_HOLDER, &gh->gh_iflags);
+ trace_gfs2_glock_queue(gh, 0);
- /*
- * If we have a weak holder at the head of the list, it
- * (and all others like it) must be auto-demoted. If there
- * are no more weak holders, we exit the while loop.
- */
- gh = find_first_holder(gl);
+ /*
+ * If there hasn't been a demote request we are done.
+ * (Let the remaining holders, if any, keep holding it.)
+ */
+ if (!needs_demote(gl)) {
+ if (list_empty(&gl->gl_holders))
+ fast_path = 1;
}
if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
@@ -1666,8 +1635,17 @@ static void __gfs2_glock_dq(struct gfs2_holder *gh)
void gfs2_glock_dq(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
spin_lock(&gl->gl_lockref.lock);
+ if (!gfs2_holder_queued(gh)) {
+ /*
+ * May have already been dequeued because the locking request
+ * was GL_ASYNC and it has failed in the meantime.
+ */
+ goto out;
+ }
+
if (list_is_first(&gh->gh_list, &gl->gl_holders) &&
!test_bit(HIF_HOLDER, &gh->gh_iflags)) {
spin_unlock(&gl->gl_lockref.lock);
@@ -1676,7 +1654,26 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
spin_lock(&gl->gl_lockref.lock);
}
+ /*
+ * If we're in the process of file system withdraw, we cannot just
+ * dequeue any glocks until our journal is recovered, lest we introduce
+ * file system corruption. We need two exceptions to this rule: We need
+ * to allow unlocking of nondisk glocks and the glock for our own
+ * journal that needs recovery.
+ */
+ if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
+ glock_blocked_by_withdraw(gl) &&
+ gh->gh_gl != sdp->sd_jinode_gl) {
+ sdp->sd_glock_dqs_held++;
+ spin_unlock(&gl->gl_lockref.lock);
+ might_sleep();
+ wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
+ TASK_UNINTERRUPTIBLE);
+ spin_lock(&gl->gl_lockref.lock);
+ }
+
__gfs2_glock_dq(gh);
+out:
spin_unlock(&gl->gl_lockref.lock);
}
@@ -1849,33 +1846,6 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
delay = gl->gl_hold_time;
}
- /*
- * Note 1: We cannot call demote_incompat_holders from handle_callback
- * or gfs2_set_demote due to recursion problems like: gfs2_glock_dq ->
- * handle_callback -> demote_incompat_holders -> gfs2_glock_dq
- * Plus, we only want to demote the holders if the request comes from
- * a remote cluster node because local holder conflicts are resolved
- * elsewhere.
- *
- * Note 2: if a remote node wants this glock in EX mode, lock_dlm will
- * request that we set our state to UNLOCKED. Here we mock up a holder
- * to make it look like someone wants the lock EX locally. Any SH
- * and DF requests should be able to share the lock without demoting.
- *
- * Note 3: We only want to demote the demoteable holders when there
- * are no more strong holders. The demoteable holders might as well
- * keep the glock until the last strong holder is done with it.
- */
- if (!find_first_strong_holder(gl)) {
- struct gfs2_holder mock_gh = {
- .gh_gl = gl,
- .gh_state = (state == LM_ST_UNLOCKED) ?
- LM_ST_EXCLUSIVE : state,
- .gh_iflags = BIT(HIF_HOLDER)
- };
-
- demote_incompat_holders(gl, &mock_gh);
- }
handle_callback(gl, state, delay, true);
__gfs2_glock_queue_work(gl, delay);
spin_unlock(&gl->gl_lockref.lock);
@@ -2194,6 +2164,20 @@ static void dump_glock_func(struct gfs2_glock *gl)
dump_glock(NULL, gl, true);
}
+static void withdraw_dq(struct gfs2_glock *gl)
+{
+ spin_lock(&gl->gl_lockref.lock);
+ if (!__lockref_is_dead(&gl->gl_lockref) &&
+ glock_blocked_by_withdraw(gl))
+ do_error(gl, LM_OUT_ERROR); /* remove pending waiters */
+ spin_unlock(&gl->gl_lockref.lock);
+}
+
+void gfs2_gl_dq_holders(struct gfs2_sbd *sdp)
+{
+ glock_hash_walk(withdraw_dq, sdp);
+}
+
/**
* gfs2_gl_hash_clear - Empty out the glock hash table
* @sdp: the filesystem
@@ -2253,8 +2237,6 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags)
*p++ = 'H';
if (test_bit(HIF_WAIT, &iflags))
*p++ = 'W';
- if (test_bit(HIF_MAY_DEMOTE, &iflags))
- *p++ = 'D';
if (flags & GL_SKIP)
*p++ = 's';
*p = 0;
@@ -2272,19 +2254,24 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags)
static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh,
const char *fs_id_buf)
{
- struct task_struct *gh_owner = NULL;
+ const char *comm = "(none)";
+ pid_t owner_pid = 0;
char flags_buf[32];
rcu_read_lock();
- if (gh->gh_owner_pid)
+ if (pid_is_meaningful(gh)) {
+ struct task_struct *gh_owner;
+
+ comm = "(ended)";
+ owner_pid = pid_nr(gh->gh_owner_pid);
gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
+ if (gh_owner)
+ comm = gh_owner->comm;
+ }
gfs2_print_dbg(seq, "%s H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
fs_id_buf, state2str(gh->gh_state),
hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
- gh->gh_error,
- gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
- gh_owner ? gh_owner->comm : "(ended)",
- (void *)gh->gh_ip);
+ gh->gh_error, (long)owner_pid, comm, (void *)gh->gh_ip);
rcu_read_unlock();
}
@@ -2699,6 +2686,172 @@ static const struct file_operations gfs2_glstats_fops = {
.release = gfs2_glocks_release,
};
+struct gfs2_glockfd_iter {
+ struct super_block *sb;
+ unsigned int tgid;
+ struct task_struct *task;
+ unsigned int fd;
+ struct file *file;
+};
+
+static struct task_struct *gfs2_glockfd_next_task(struct gfs2_glockfd_iter *i)
+{
+ struct pid_namespace *ns = task_active_pid_ns(current);
+ struct pid *pid;
+
+ if (i->task)
+ put_task_struct(i->task);
+
+ rcu_read_lock();
+retry:
+ i->task = NULL;
+ pid = find_ge_pid(i->tgid, ns);
+ if (pid) {
+ i->tgid = pid_nr_ns(pid, ns);
+ i->task = pid_task(pid, PIDTYPE_TGID);
+ if (!i->task) {
+ i->tgid++;
+ goto retry;
+ }
+ get_task_struct(i->task);
+ }
+ rcu_read_unlock();
+ return i->task;
+}
+
+static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i)
+{
+ if (i->file) {
+ fput(i->file);
+ i->file = NULL;
+ }
+
+ rcu_read_lock();
+ for(;; i->fd++) {
+ struct inode *inode;
+
+ i->file = task_lookup_next_fd_rcu(i->task, &i->fd);
+ if (!i->file) {
+ i->fd = 0;
+ break;
+ }
+ inode = file_inode(i->file);
+ if (inode->i_sb != i->sb)
+ continue;
+ if (get_file_rcu(i->file))
+ break;
+ }
+ rcu_read_unlock();
+ return i->file;
+}
+
+static void *gfs2_glockfd_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct gfs2_glockfd_iter *i = seq->private;
+
+ if (*pos)
+ return NULL;
+ while (gfs2_glockfd_next_task(i)) {
+ if (gfs2_glockfd_next_file(i))
+ return i;
+ i->tgid++;
+ }
+ return NULL;
+}
+
+static void *gfs2_glockfd_seq_next(struct seq_file *seq, void *iter_ptr,
+ loff_t *pos)
+{
+ struct gfs2_glockfd_iter *i = seq->private;
+
+ (*pos)++;
+ i->fd++;
+ do {
+ if (gfs2_glockfd_next_file(i))
+ return i;
+ i->tgid++;
+ } while (gfs2_glockfd_next_task(i));
+ return NULL;
+}
+
+static void gfs2_glockfd_seq_stop(struct seq_file *seq, void *iter_ptr)
+{
+ struct gfs2_glockfd_iter *i = seq->private;
+
+ if (i->file)
+ fput(i->file);
+ if (i->task)
+ put_task_struct(i->task);
+}
+
+static void gfs2_glockfd_seq_show_flock(struct seq_file *seq,
+ struct gfs2_glockfd_iter *i)
+{
+ struct gfs2_file *fp = i->file->private_data;
+ struct gfs2_holder *fl_gh = &fp->f_fl_gh;
+ struct lm_lockname gl_name = { .ln_type = LM_TYPE_RESERVED };
+
+ if (!READ_ONCE(fl_gh->gh_gl))
+ return;
+
+ spin_lock(&i->file->f_lock);
+ if (gfs2_holder_initialized(fl_gh))
+ gl_name = fl_gh->gh_gl->gl_name;
+ spin_unlock(&i->file->f_lock);
+
+ if (gl_name.ln_type != LM_TYPE_RESERVED) {
+ seq_printf(seq, "%d %u %u/%llx\n",
+ i->tgid, i->fd, gl_name.ln_type,
+ (unsigned long long)gl_name.ln_number);
+ }
+}
+
+static int gfs2_glockfd_seq_show(struct seq_file *seq, void *iter_ptr)
+{
+ struct gfs2_glockfd_iter *i = seq->private;
+ struct inode *inode = file_inode(i->file);
+ struct gfs2_glock *gl;
+
+ inode_lock_shared(inode);
+ gl = GFS2_I(inode)->i_iopen_gh.gh_gl;
+ if (gl) {
+ seq_printf(seq, "%d %u %u/%llx\n",
+ i->tgid, i->fd, gl->gl_name.ln_type,
+ (unsigned long long)gl->gl_name.ln_number);
+ }
+ gfs2_glockfd_seq_show_flock(seq, i);
+ inode_unlock_shared(inode);
+ return 0;
+}
+
+static const struct seq_operations gfs2_glockfd_seq_ops = {
+ .start = gfs2_glockfd_seq_start,
+ .next = gfs2_glockfd_seq_next,
+ .stop = gfs2_glockfd_seq_stop,
+ .show = gfs2_glockfd_seq_show,
+};
+
+static int gfs2_glockfd_open(struct inode *inode, struct file *file)
+{
+ struct gfs2_glockfd_iter *i;
+ struct gfs2_sbd *sdp = inode->i_private;
+
+ i = __seq_open_private(file, &gfs2_glockfd_seq_ops,
+ sizeof(struct gfs2_glockfd_iter));
+ if (!i)
+ return -ENOMEM;
+ i->sb = sdp->sd_vfs;
+ return 0;
+}
+
+static const struct file_operations gfs2_glockfd_fops = {
+ .owner = THIS_MODULE,
+ .open = gfs2_glockfd_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
DEFINE_SEQ_ATTRIBUTE(gfs2_sbstats);
void gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
@@ -2708,6 +2861,9 @@ void gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
debugfs_create_file("glocks", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp,
&gfs2_glocks_fops);
+ debugfs_create_file("glockfd", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp,
+ &gfs2_glockfd_fops);
+
debugfs_create_file("glstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp,
&gfs2_glstats_fops);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 5aed8b500cf5..f37ac087e2c1 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -91,6 +91,7 @@ enum {
#define GL_ASYNC 0x0040
#define GL_EXACT 0x0080
#define GL_SKIP 0x0100
+#define GL_NOPID 0x0200
#define GL_NOCACHE 0x0400
/*
@@ -155,8 +156,6 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
break;
- if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
- continue;
if (gh->gh_owner_pid == pid)
goto out;
}
@@ -195,7 +194,7 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
const struct gfs2_glock_operations *glops,
int create, struct gfs2_glock **glp);
-extern void gfs2_glock_hold(struct gfs2_glock *gl);
+extern struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl);
extern void gfs2_glock_put(struct gfs2_glock *gl);
extern void gfs2_glock_queue_put(struct gfs2_glock *gl);
@@ -274,6 +273,7 @@ extern void gfs2_cancel_delete_work(struct gfs2_glock *gl);
extern bool gfs2_delete_work_queued(const struct gfs2_glock *gl);
extern void gfs2_flush_delete_work(struct gfs2_sbd *sdp);
extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
+extern void gfs2_gl_dq_holders(struct gfs2_sbd *sdp);
extern void gfs2_glock_thaw(struct gfs2_sbd *sdp);
extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl);
extern void gfs2_glock_free(struct gfs2_glock *gl);
@@ -286,6 +286,9 @@ extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
extern void gfs2_register_debugfs(void);
extern void gfs2_unregister_debugfs(void);
+extern void glock_set_object(struct gfs2_glock *gl, void *object);
+extern void glock_clear_object(struct gfs2_glock *gl, void *object);
+
extern const struct lm_lockops gfs2_dlm_ops;
static inline void gfs2_holder_mark_uninitialized(struct gfs2_holder *gh)
@@ -303,64 +306,6 @@ static inline bool gfs2_holder_queued(struct gfs2_holder *gh)
return !list_empty(&gh->gh_list);
}
-/**
- * glock_set_object - set the gl_object field of a glock
- * @gl: the glock
- * @object: the object
- */
-static inline void glock_set_object(struct gfs2_glock *gl, void *object)
-{
- spin_lock(&gl->gl_lockref.lock);
- if (gfs2_assert_warn(gl->gl_name.ln_sbd, gl->gl_object == NULL))
- gfs2_dump_glock(NULL, gl, true);
- gl->gl_object = object;
- spin_unlock(&gl->gl_lockref.lock);
-}
-
-/**
- * glock_clear_object - clear the gl_object field of a glock
- * @gl: the glock
- * @object: the object
- *
- * I'd love to similarly add this:
- * else if (gfs2_assert_warn(gl->gl_sbd, gl->gl_object == object))
- * gfs2_dump_glock(NULL, gl, true);
- * Unfortunately, that's not possible because as soon as gfs2_delete_inode
- * frees the block in the rgrp, another process can reassign it for an I_NEW
- * inode in gfs2_create_inode because that calls new_inode, not gfs2_iget.
- * That means gfs2_delete_inode may subsequently try to call this function
- * for a glock that's already pointing to a brand new inode. If we clear the
- * new inode's gl_object, we'll introduce metadata corruption. Function
- * gfs2_delete_inode calls clear_inode which calls gfs2_clear_inode which also
- * tries to clear gl_object, so it's more than just gfs2_delete_inode.
- *
- */
-static inline void glock_clear_object(struct gfs2_glock *gl, void *object)
-{
- spin_lock(&gl->gl_lockref.lock);
- if (gl->gl_object == object)
- gl->gl_object = NULL;
- spin_unlock(&gl->gl_lockref.lock);
-}
-
-static inline void gfs2_holder_allow_demote(struct gfs2_holder *gh)
-{
- struct gfs2_glock *gl = gh->gh_gl;
-
- spin_lock(&gl->gl_lockref.lock);
- set_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
- spin_unlock(&gl->gl_lockref.lock);
-}
-
-static inline void gfs2_holder_disallow_demote(struct gfs2_holder *gh)
-{
- struct gfs2_glock *gl = gh->gh_gl;
-
- spin_lock(&gl->gl_lockref.lock);
- clear_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
- spin_unlock(&gl->gl_lockref.lock);
-}
-
extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation);
extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 49210a2e7ce7..d78b61ecc1cd 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -397,38 +397,39 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
struct timespec64 atime;
u16 height, depth;
umode_t mode = be32_to_cpu(str->di_mode);
- bool is_new = ip->i_inode.i_state & I_NEW;
+ struct inode *inode = &ip->i_inode;
+ bool is_new = inode->i_state & I_NEW;
if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
goto corrupt;
- if (unlikely(!is_new && inode_wrong_type(&ip->i_inode, mode)))
+ if (unlikely(!is_new && inode_wrong_type(inode, mode)))
goto corrupt;
ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino);
- ip->i_inode.i_mode = mode;
+ inode->i_mode = mode;
if (is_new) {
- ip->i_inode.i_rdev = 0;
+ inode->i_rdev = 0;
switch (mode & S_IFMT) {
case S_IFBLK:
case S_IFCHR:
- ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major),
- be32_to_cpu(str->di_minor));
+ inode->i_rdev = MKDEV(be32_to_cpu(str->di_major),
+ be32_to_cpu(str->di_minor));
break;
}
}
- i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid));
- i_gid_write(&ip->i_inode, be32_to_cpu(str->di_gid));
- set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink));
- i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
- gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
+ i_uid_write(inode, be32_to_cpu(str->di_uid));
+ i_gid_write(inode, be32_to_cpu(str->di_gid));
+ set_nlink(inode, be32_to_cpu(str->di_nlink));
+ i_size_write(inode, be64_to_cpu(str->di_size));
+ gfs2_set_inode_blocks(inode, be64_to_cpu(str->di_blocks));
atime.tv_sec = be64_to_cpu(str->di_atime);
atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
- if (timespec64_compare(&ip->i_inode.i_atime, &atime) < 0)
- ip->i_inode.i_atime = atime;
- ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
- ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
- ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
- ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
+ if (timespec64_compare(&inode->i_atime, &atime) < 0)
+ inode->i_atime = atime;
+ inode->i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
+ inode->i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
+ inode->i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
+ inode->i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
ip->i_goal = be64_to_cpu(str->di_goal_meta);
ip->i_generation = be64_to_cpu(str->di_generation);
@@ -436,7 +437,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
ip->i_diskflags = be32_to_cpu(str->di_flags);
ip->i_eattr = be64_to_cpu(str->di_eattr);
/* i_diskflags and i_eattr must be set before gfs2_set_inode_flags() */
- gfs2_set_inode_flags(&ip->i_inode);
+ gfs2_set_inode_flags(inode);
height = be16_to_cpu(str->di_height);
if (unlikely(height > GFS2_MAX_META_HEIGHT))
goto corrupt;
@@ -448,8 +449,11 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
ip->i_depth = (u8)depth;
ip->i_entries = be32_to_cpu(str->di_entries);
- if (S_ISREG(ip->i_inode.i_mode))
- gfs2_set_aops(&ip->i_inode);
+ if (gfs2_is_stuffed(ip) && inode->i_size > gfs2_max_stuffed_size(ip))
+ goto corrupt;
+
+ if (S_ISREG(inode->i_mode))
+ gfs2_set_aops(inode);
return 0;
corrupt:
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index d09d9892cd05..c26765080f28 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -252,7 +252,6 @@ struct gfs2_lkstats {
enum {
/* States */
- HIF_MAY_DEMOTE = 1,
HIF_HOLDER = 6, /* Set for gh that "holds" the glock */
HIF_WAIT = 10,
};
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index c8ec876f33ea..614db3055c02 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -130,6 +130,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
if (inode->i_state & I_NEW) {
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_glock *io_gl;
+ int extra_flags = 0;
error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE,
&ip->i_gl);
@@ -141,9 +142,17 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
if (unlikely(error))
goto fail;
- if (blktype != GFS2_BLKST_UNLINKED)
+ /*
+ * The only caller that sets @blktype to GFS2_BLKST_UNLINKED is
+ * delete_work_func(). Make sure not to cancel the delete work
+ * from within itself here.
+ */
+ if (blktype == GFS2_BLKST_UNLINKED)
+ extra_flags |= LM_FLAG_TRY;
+ else
gfs2_cancel_delete_work(io_gl);
- error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT,
+ error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED,
+ GL_EXACT | GL_NOPID | extra_flags,
&ip->i_iopen_gh);
gfs2_glock_put(io_gl);
if (unlikely(error))
@@ -210,6 +219,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
return inode;
fail:
+ if (error == GLR_TRYFAILED)
+ error = -EAGAIN;
if (gfs2_holder_initialized(&ip->i_iopen_gh))
gfs2_glock_dq_uninit(&ip->i_iopen_gh);
if (gfs2_holder_initialized(&i_gh))
@@ -397,12 +408,17 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks)
goto out_ipreserv;
error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1, &ip->i_generation);
+ if (error)
+ goto out_trans_end;
+
ip->i_no_formal_ino = ip->i_generation;
ip->i_inode.i_ino = ip->i_no_addr;
ip->i_goal = ip->i_no_addr;
+ if (*dblocks > 1)
+ ip->i_eattr = ip->i_no_addr + 1;
+out_trans_end:
gfs2_trans_end(sdp);
-
out_ipreserv:
gfs2_inplace_release(ip);
out_quota:
@@ -580,6 +596,12 @@ static int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
* @size: The initial size of the inode (ignored for directories)
* @excl: Force fail if inode exists
*
+ * FIXME: Change to allocate the disk blocks and write them out in the same
+ * transaction. That way, we can no longer end up in a situation in which an
+ * inode is allocated, the node crashes, and the block looks like a valid
+ * inode. (With atomic creates in place, we will also no longer need to zero
+ * the link count and dirty the inode here on failure.)
+ *
* Returns: 0 on success, or error code
*/
@@ -590,12 +612,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
{
const struct qstr *name = &dentry->d_name;
struct posix_acl *default_acl, *acl;
- struct gfs2_holder ghs[2];
+ struct gfs2_holder d_gh, gh;
struct inode *inode = NULL;
struct gfs2_inode *dip = GFS2_I(dir), *ip;
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct gfs2_glock *io_gl;
- int error, free_vfs_inode = 1;
+ int error;
u32 aflags = 0;
unsigned blocks = 1;
struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, };
@@ -611,10 +633,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (error)
goto fail;
- error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+ error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, &d_gh);
if (error)
goto fail;
- gfs2_holder_mark_uninitialized(ghs + 1);
+ gfs2_holder_mark_uninitialized(&gh);
error = create_ok(dip, name, mode);
if (error)
@@ -636,7 +658,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
else
error = finish_no_open(file, NULL);
}
- gfs2_glock_dq_uninit(ghs);
+ gfs2_glock_dq_uninit(&d_gh);
goto fail;
} else if (error != -ENOENT) {
goto fail_gunlock;
@@ -650,12 +672,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
error = -ENOMEM;
if (!inode)
goto fail_gunlock;
+ ip = GFS2_I(inode);
error = posix_acl_create(dir, &mode, &default_acl, &acl);
if (error)
goto fail_gunlock;
- ip = GFS2_I(inode);
error = gfs2_qa_get(ip);
if (error)
goto fail_free_acls;
@@ -717,14 +739,19 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
goto fail_free_inode;
gfs2_cancel_delete_work(io_gl);
+retry:
error = insert_inode_locked4(inode, ip->i_no_addr, iget_test, &ip->i_no_addr);
- BUG_ON(error);
+ if (error == -EBUSY)
+ goto retry;
+ if (error)
+ goto fail_gunlock2;
- error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
+ error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT | GL_NOPID,
+ &ip->i_iopen_gh);
if (error)
goto fail_gunlock2;
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
if (error)
goto fail_gunlock3;
@@ -732,10 +759,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (error)
goto fail_gunlock3;
- if (blocks > 1) {
- ip->i_eattr = ip->i_no_addr + 1;
+ if (blocks > 1)
gfs2_init_xattr(ip);
- }
init_dinode(dip, ip, symname);
gfs2_trans_end(sdp);
@@ -743,9 +768,6 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
glock_set_object(io_gl, ip);
gfs2_set_iop(inode);
- free_vfs_inode = 0; /* After this point, the inode is no longer
- considered free. Any failures need to undo
- the gfs2 structures. */
if (default_acl) {
error = __gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
if (error)
@@ -778,9 +800,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
file->f_mode |= FMODE_CREATED;
error = finish_open(file, dentry, gfs2_open_common);
}
- gfs2_glock_dq_uninit(ghs);
+ gfs2_glock_dq_uninit(&d_gh);
gfs2_qa_put(ip);
- gfs2_glock_dq_uninit(ghs + 1);
+ gfs2_glock_dq_uninit(&gh);
gfs2_glock_put(io_gl);
gfs2_qa_put(dip);
unlock_new_inode(inode);
@@ -794,10 +816,6 @@ fail_gunlock3:
fail_gunlock2:
gfs2_glock_put(io_gl);
fail_free_inode:
- if (ip->i_gl) {
- if (free_vfs_inode) /* else evict will do the put for us */
- gfs2_glock_put(ip->i_gl);
- }
gfs2_rs_deltree(&ip->i_res);
gfs2_qa_put(ip);
fail_free_acls:
@@ -805,20 +823,19 @@ fail_free_acls:
posix_acl_release(acl);
fail_gunlock:
gfs2_dir_no_add(&da);
- gfs2_glock_dq_uninit(ghs);
+ gfs2_glock_dq_uninit(&d_gh);
if (!IS_ERR_OR_NULL(inode)) {
+ set_bit(GIF_ALLOC_FAILED, &ip->i_flags);
clear_nlink(inode);
- if (!free_vfs_inode)
+ if (ip->i_no_addr)
mark_inode_dirty(inode);
- set_bit(free_vfs_inode ? GIF_FREE_VFS_INODE : GIF_ALLOC_FAILED,
- &GFS2_I(inode)->i_flags);
if (inode->i_state & I_NEW)
iget_failed(inode);
else
iput(inode);
}
- if (gfs2_holder_initialized(ghs + 1))
- gfs2_glock_dq_uninit(ghs + 1);
+ if (gfs2_holder_initialized(&gh))
+ gfs2_glock_dq_uninit(&gh);
fail:
gfs2_qa_put(dip);
return error;
@@ -1990,7 +2007,7 @@ static int gfs2_setattr(struct user_namespace *mnt_userns,
else {
error = gfs2_setattr_simple(inode, attr);
if (!error && attr->ia_valid & ATTR_MODE)
- error = posix_acl_chmod(&init_user_ns, inode,
+ error = posix_acl_chmod(&init_user_ns, dentry,
inode->i_mode);
}
@@ -2142,7 +2159,7 @@ static const struct inode_operations gfs2_file_iops = {
.getattr = gfs2_getattr,
.listxattr = gfs2_listxattr,
.fiemap = gfs2_fiemap,
- .get_acl = gfs2_get_acl,
+ .get_inode_acl = gfs2_get_acl,
.set_acl = gfs2_set_acl,
.update_time = gfs2_update_time,
.fileattr_get = gfs2_fileattr_get,
@@ -2164,7 +2181,7 @@ static const struct inode_operations gfs2_dir_iops = {
.getattr = gfs2_getattr,
.listxattr = gfs2_listxattr,
.fiemap = gfs2_fiemap,
- .get_acl = gfs2_get_acl,
+ .get_inode_acl = gfs2_get_acl,
.set_acl = gfs2_set_acl,
.update_time = gfs2_update_time,
.atomic_open = gfs2_atomic_open,
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 6ce369b096d4..71911bf9ab34 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -1302,7 +1302,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table)
memcpy(cluster, table, strlen(table) - strlen(fsname));
fsname++;
- flags = DLM_LSFL_FS | DLM_LSFL_NEWEXCL;
+ flags = DLM_LSFL_NEWEXCL;
/*
* create/join lockspace
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 14ae9de76277..afcb32854f14 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -151,14 +151,6 @@ static int __init init_gfs2_fs(void)
if (error)
goto fail_shrinker;
- error = register_filesystem(&gfs2_fs_type);
- if (error)
- goto fail_fs1;
-
- error = register_filesystem(&gfs2meta_fs_type);
- if (error)
- goto fail_fs2;
-
error = -ENOMEM;
gfs_recovery_wq = alloc_workqueue("gfs_recovery",
WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
@@ -180,11 +172,23 @@ static int __init init_gfs2_fs(void)
goto fail_mempool;
gfs2_register_debugfs();
+ error = register_filesystem(&gfs2_fs_type);
+ if (error)
+ goto fail_fs1;
+
+ error = register_filesystem(&gfs2meta_fs_type);
+ if (error)
+ goto fail_fs2;
+
pr_info("GFS2 installed\n");
return 0;
+fail_fs2:
+ unregister_filesystem(&gfs2_fs_type);
+fail_fs1:
+ mempool_destroy(gfs2_page_pool);
fail_mempool:
destroy_workqueue(gfs2_freeze_wq);
fail_wq3:
@@ -192,10 +196,6 @@ fail_wq3:
fail_wq2:
destroy_workqueue(gfs_recovery_wq);
fail_wq1:
- unregister_filesystem(&gfs2meta_fs_type);
-fail_fs2:
- unregister_filesystem(&gfs2_fs_type);
-fail_fs1:
unregister_shrinker(&gfs2_qd_shrinker);
fail_shrinker:
kmem_cache_destroy(gfs2_trans_cachep);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 7e70e0ba5a6c..3c41b864ee5b 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -442,6 +442,12 @@ void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
struct buffer_head *bh;
int ty;
+ if (!ip->i_gl) {
+ /* This can only happen during incomplete inode creation. */
+ BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags));
+ return;
+ }
+
gfs2_ail1_wipe(sdp, bstart, blen);
while (blen) {
ty = REMOVE_META;
@@ -525,8 +531,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
if (buffer_uptodate(first_bh))
goto out;
- if (!buffer_locked(first_bh))
- ll_rw_block(REQ_OP_READ | REQ_META | REQ_PRIO, 1, &first_bh);
+ bh_read_nowait(first_bh, REQ_META | REQ_PRIO);
dblock++;
extlen--;
@@ -534,9 +539,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
while (extlen) {
bh = gfs2_getbuf(gl, dblock, CREATE);
- if (!buffer_uptodate(bh) && !buffer_locked(bh))
- ll_rw_block(REQ_OP_READ | REQ_RAHEAD | REQ_META |
- REQ_PRIO, 1, &bh);
+ bh_readahead(bh, REQ_RAHEAD | REQ_META | REQ_PRIO);
brelse(bh);
dblock++;
extlen--;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 549879929c84..c0cf1d2d0ef5 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -178,7 +178,10 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent)
pr_warn("Invalid block size\n");
return -EINVAL;
}
-
+ if (sb->sb_bsize_shift != ffs(sb->sb_bsize) - 1) {
+ pr_warn("Invalid block size shift\n");
+ return -EINVAL;
+ }
return 0;
}
@@ -381,8 +384,10 @@ static int init_names(struct gfs2_sbd *sdp, int silent)
if (!table[0])
table = sdp->sd_vfs->s_id;
- strlcpy(sdp->sd_proto_name, proto, GFS2_FSNAME_LEN);
- strlcpy(sdp->sd_table_name, table, GFS2_FSNAME_LEN);
+ BUILD_BUG_ON(GFS2_LOCKNAME_LEN > GFS2_FSNAME_LEN);
+
+ strscpy(sdp->sd_proto_name, proto, GFS2_LOCKNAME_LEN);
+ strscpy(sdp->sd_table_name, table, GFS2_LOCKNAME_LEN);
table = sdp->sd_table_name;
while ((table = strchr(table, '/')))
@@ -401,7 +406,8 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
error = gfs2_glock_nq_num(sdp,
GFS2_MOUNT_LOCK, &gfs2_nondisk_glops,
- LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
+ LM_ST_EXCLUSIVE,
+ LM_FLAG_NOEXP | GL_NOCACHE | GL_NOPID,
mount_gh);
if (error) {
fs_err(sdp, "can't acquire mount glock: %d\n", error);
@@ -411,7 +417,7 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
error = gfs2_glock_nq_num(sdp,
GFS2_LIVE_LOCK, &gfs2_nondisk_glops,
LM_ST_SHARED,
- LM_FLAG_NOEXP | GL_EXACT,
+ LM_FLAG_NOEXP | GL_EXACT | GL_NOPID,
&sdp->sd_live_gh);
if (error) {
fs_err(sdp, "can't acquire live glock: %d\n", error);
@@ -687,7 +693,7 @@ static int init_statfs(struct gfs2_sbd *sdp)
iput(pn);
pn = NULL;
ip = GFS2_I(sdp->sd_sc_inode);
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_NOPID,
&sdp->sd_sc_gh);
if (error) {
fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
@@ -776,7 +782,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid,
&gfs2_journal_glops,
LM_ST_EXCLUSIVE,
- LM_FLAG_NOEXP | GL_NOCACHE,
+ LM_FLAG_NOEXP | GL_NOCACHE | GL_NOPID,
&sdp->sd_journal_gh);
if (error) {
fs_err(sdp, "can't acquire journal glock: %d\n", error);
@@ -786,7 +792,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
ip = GFS2_I(sdp->sd_jdesc->jd_inode);
sdp->sd_jinode_gl = ip->i_gl;
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
- LM_FLAG_NOEXP | GL_EXACT | GL_NOCACHE,
+ LM_FLAG_NOEXP | GL_EXACT |
+ GL_NOCACHE | GL_NOPID,
&sdp->sd_jinode_gh);
if (error) {
fs_err(sdp, "can't acquire journal inode glock: %d\n",
@@ -957,7 +964,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
pn = NULL;
ip = GFS2_I(sdp->sd_qc_inode);
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_NOPID,
&sdp->sd_qc_gh);
if (error) {
fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
@@ -1439,13 +1446,13 @@ static int gfs2_parse_param(struct fs_context *fc, struct fs_parameter *param)
switch (o) {
case Opt_lockproto:
- strlcpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN);
+ strscpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN);
break;
case Opt_locktable:
- strlcpy(args->ar_locktable, param->string, GFS2_LOCKNAME_LEN);
+ strscpy(args->ar_locktable, param->string, GFS2_LOCKNAME_LEN);
break;
case Opt_hostdata:
- strlcpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN);
+ strscpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN);
break;
case Opt_spectator:
args->ar_spectator = 1;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index f201eaf59d0d..1ed17226d9ed 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -745,12 +745,8 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index,
}
if (PageUptodate(page))
set_buffer_uptodate(bh);
- if (!buffer_uptodate(bh)) {
- ll_rw_block(REQ_OP_READ | REQ_META | REQ_PRIO, 1, &bh);
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh))
- goto unlock_out;
- }
+ if (bh_read(bh, REQ_META | REQ_PRIO) < 0)
+ goto unlock_out;
if (gfs2_is_jdata(ip))
gfs2_trans_add_data(ip->i_gl, bh);
else
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index b5b0f285b27f..999cc146d708 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -346,7 +346,8 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp)
}
error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE,
- LM_FLAG_NOEXP, &sdp->sd_freeze_gh);
+ LM_FLAG_NOEXP | GL_NOPID,
+ &sdp->sd_freeze_gh);
if (error)
goto out;
@@ -378,6 +379,7 @@ out:
void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
{
+ const struct inode *inode = &ip->i_inode;
struct gfs2_dinode *str = buf;
str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
@@ -385,15 +387,15 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI);
str->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
- str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
- str->di_uid = cpu_to_be32(i_uid_read(&ip->i_inode));
- str->di_gid = cpu_to_be32(i_gid_read(&ip->i_inode));
- str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
- str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
- str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
- str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
- str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
- str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec);
+ str->di_mode = cpu_to_be32(inode->i_mode);
+ str->di_uid = cpu_to_be32(i_uid_read(inode));
+ str->di_gid = cpu_to_be32(i_gid_read(inode));
+ str->di_nlink = cpu_to_be32(inode->i_nlink);
+ str->di_size = cpu_to_be64(i_size_read(inode));
+ str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(inode));
+ str->di_atime = cpu_to_be64(inode->i_atime.tv_sec);
+ str->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec);
+ str->di_ctime = cpu_to_be64(inode->i_ctime.tv_sec);
str->di_goal_meta = cpu_to_be64(ip->i_goal);
str->di_goal_data = cpu_to_be64(ip->i_goal);
@@ -401,16 +403,16 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
str->di_flags = cpu_to_be32(ip->i_diskflags);
str->di_height = cpu_to_be16(ip->i_height);
- str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
+ str->di_payload_format = cpu_to_be32(S_ISDIR(inode->i_mode) &&
!(ip->i_diskflags & GFS2_DIF_EXHASH) ?
GFS2_FORMAT_DE : 0);
str->di_depth = cpu_to_be16(ip->i_depth);
str->di_entries = cpu_to_be32(ip->i_entries);
str->di_eattr = cpu_to_be64(ip->i_eattr);
- str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
- str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec);
- str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec);
+ str->di_atime_nsec = cpu_to_be32(inode->i_atime.tv_nsec);
+ str->di_mtime_nsec = cpu_to_be32(inode->i_mtime.tv_nsec);
+ str->di_ctime_nsec = cpu_to_be32(inode->i_ctime.tv_nsec);
}
/**
@@ -474,6 +476,12 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)
int need_endtrans = 0;
int ret;
+ if (unlikely(!ip->i_gl)) {
+ /* This can only happen during incomplete inode creation. */
+ BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags));
+ return;
+ }
+
if (unlikely(gfs2_withdrawn(sdp)))
return;
if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
@@ -926,8 +934,7 @@ static int gfs2_drop_inode(struct inode *inode)
{
struct gfs2_inode *ip = GFS2_I(inode);
- if (!test_bit(GIF_FREE_VFS_INODE, &ip->i_flags) &&
- inode->i_nlink &&
+ if (inode->i_nlink &&
gfs2_holder_initialized(&ip->i_iopen_gh)) {
struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
if (test_bit(GLF_DEMOTE, &gl->gl_flags))
@@ -1075,7 +1082,13 @@ static void gfs2_final_release_pages(struct gfs2_inode *ip)
struct inode *inode = &ip->i_inode;
struct gfs2_glock *gl = ip->i_gl;
- truncate_inode_pages(gfs2_glock2aspace(ip->i_gl), 0);
+ if (unlikely(!gl)) {
+ /* This can only happen during incomplete inode creation. */
+ BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags));
+ return;
+ }
+
+ truncate_inode_pages(gfs2_glock2aspace(gl), 0);
truncate_inode_pages(&inode->i_data, 0);
if (atomic_read(&gl->gl_revokes) == 0) {
@@ -1217,10 +1230,8 @@ static enum dinode_demise evict_should_delete(struct inode *inode,
struct gfs2_sbd *sdp = sb->s_fs_info;
int ret;
- if (test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) {
- BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));
+ if (unlikely(test_bit(GIF_ALLOC_FAILED, &ip->i_flags)))
goto should_delete;
- }
if (test_bit(GIF_DEFERRED_DELETE, &ip->i_flags))
return SHOULD_DEFER_EVICTION;
@@ -1293,13 +1304,22 @@ static int evict_unlinked_inode(struct inode *inode)
goto out;
}
- /* We're about to clear the bitmap for the dinode, but as soon as we
- do, gfs2_create_inode can create another inode at the same block
- location and try to set gl_object again. We clear gl_object here so
- that subsequent inode creates don't see an old gl_object. */
- glock_clear_object(ip->i_gl, ip);
+ if (ip->i_gl)
+ gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino);
+
+ /*
+ * As soon as we clear the bitmap for the dinode, gfs2_create_inode()
+ * can get called to recreate it, or even gfs2_inode_lookup() if the
+ * inode was recreated on another node in the meantime.
+ *
+ * However, inserting the new inode into the inode hash table will not
+ * succeed until the old inode is removed, and that only happens after
+ * ->evict_inode() returns. The new inode is attached to its inode and
+ * iopen glocks after inserting it into the inode hash table, so at
+ * that point we can be sure that both glocks are unused.
+ */
+
ret = gfs2_dinode_dealloc(ip);
- gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino);
out:
return ret;
}
@@ -1366,12 +1386,7 @@ static void gfs2_evict_inode(struct inode *inode)
struct gfs2_holder gh;
int ret;
- if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) {
- clear_inode(inode);
- return;
- }
-
- if (inode->i_nlink || sb_rdonly(sb))
+ if (inode->i_nlink || sb_rdonly(sb) || !ip->i_no_addr)
goto out;
gfs2_holder_mark_uninitialized(&gh);
@@ -1404,12 +1419,9 @@ out:
struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
glock_clear_object(gl, ip);
- if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
- ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq(&ip->i_iopen_gh);
- }
gfs2_glock_hold(gl);
- gfs2_holder_uninit(&ip->i_iopen_gh);
+ ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
+ gfs2_glock_dq_uninit(&ip->i_iopen_gh);
gfs2_glock_put_eventually(gl);
}
if (ip->i_gl) {
@@ -1428,6 +1440,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
ip = alloc_inode_sb(sb, gfs2_inode_cachep, GFP_KERNEL);
if (!ip)
return NULL;
+ ip->i_no_addr = 0;
ip->i_flags = 0;
ip->i_gl = NULL;
gfs2_holder_mark_uninitialized(&ip->i_iopen_gh);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 8241029a2a5d..7a6aeffcdf5c 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -164,6 +164,11 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp)
}
if (!ret)
gfs2_make_fs_ro(sdp);
+ /*
+ * Dequeue any pending non-system glock holders that can no
+ * longer be granted because the file system is withdrawn.
+ */
+ gfs2_gl_dq_holders(sdp);
gfs2_freeze_unlock(&freeze_gh);
}
@@ -204,6 +209,7 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp)
* exception code in glock_dq.
*/
iput(inode);
+ sdp->sd_jdesc->jd_inode = NULL;
/*
* Wait until the journal inode's glock is freed. This allows try locks
* on other nodes to be successful, otherwise we remain the owner of
@@ -226,7 +232,8 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp)
*/
fs_warn(sdp, "Requesting recovery of jid %d.\n",
sdp->sd_lockstruct.ls_jid);
- gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | LM_FLAG_NOEXP,
+ gfs2_holder_reinit(LM_ST_EXCLUSIVE,
+ LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | GL_NOPID,
&sdp->sd_live_gh);
msleep(GL_GLOCK_MAX_HOLD);
/*
@@ -251,7 +258,8 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp)
fs_warn(sdp, "Unable to recover our journal jid %d.\n",
sdp->sd_lockstruct.ls_jid);
gfs2_glock_dq_wait(&sdp->sd_live_gh);
- gfs2_holder_reinit(LM_ST_SHARED, LM_FLAG_NOEXP | GL_EXACT,
+ gfs2_holder_reinit(LM_ST_SHARED,
+ LM_FLAG_NOEXP | GL_EXACT | GL_NOPID,
&sdp->sd_live_gh);
gfs2_glock_nq(&sdp->sd_live_gh);
}
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index f6a66050380e..518c0677e12a 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1412,11 +1412,13 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
ip->i_eattr = 0;
gfs2_add_inode_blocks(&ip->i_inode, -1);
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (!error) {
- gfs2_trans_add_meta(ip->i_gl, dibh);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
+ if (likely(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) {
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (!error) {
+ gfs2_trans_add_meta(ip->i_gl, dibh);
+ gfs2_dinode_out(ip, dibh->b_data);
+ brelse(dibh);
+ }
}
gfs2_trans_end(sdp);
@@ -1445,14 +1447,16 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
if (error)
return error;
- error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
- if (error)
- goto out_quota;
-
- if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
- error = ea_dealloc_indirect(ip);
+ if (likely(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) {
+ error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
if (error)
goto out_quota;
+
+ if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
+ error = ea_dealloc_indirect(ip);
+ if (error)
+ goto out_quota;
+ }
}
error = ea_dealloc_block(ip);
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index c83fd0e8404d..2015e42e752a 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -21,7 +21,6 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
int pagenum;
int bytes_read;
int bytes_to_read;
- void *vaddr;
off += node->page_offset;
pagenum = off >> PAGE_SHIFT;
@@ -33,9 +32,7 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
page = node->page[pagenum];
bytes_to_read = min_t(int, len - bytes_read, PAGE_SIZE - off);
- vaddr = kmap_atomic(page);
- memcpy(buf + bytes_read, vaddr + off, bytes_to_read);
- kunmap_atomic(vaddr);
+ memcpy_from_page(buf + bytes_read, page, off, bytes_to_read);
pagenum++;
off = 0; /* page offset only applies to the first page */
@@ -80,8 +77,7 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len)
off += node->page_offset;
page = node->page[0];
- memcpy(kmap(page) + off, buf, len);
- kunmap(page);
+ memcpy_to_page(page, off, buf, len);
set_page_dirty(page);
}
@@ -105,8 +101,7 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len)
off += node->page_offset;
page = node->page[0];
- memset(kmap(page) + off, 0, len);
- kunmap(page);
+ memzero_page(page, off, len);
set_page_dirty(page);
}
@@ -123,9 +118,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
src_page = src_node->page[0];
dst_page = dst_node->page[0];
- memcpy(kmap(dst_page) + dst, kmap(src_page) + src, len);
- kunmap(src_page);
- kunmap(dst_page);
+ memcpy_page(dst_page, dst, src_page, src, len);
set_page_dirty(dst_page);
}
@@ -140,9 +133,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
src += node->page_offset;
dst += node->page_offset;
page = node->page[0];
- ptr = kmap(page);
+ ptr = kmap_local_page(page);
memmove(ptr + dst, ptr + src, len);
- kunmap(page);
+ kunmap_local(ptr);
set_page_dirty(page);
}
@@ -346,13 +339,14 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num)
if (!test_bit(HFS_BNODE_NEW, &node->flags))
return node;
- desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + node->page_offset);
+ desc = (struct hfs_bnode_desc *)(kmap_local_page(node->page[0]) +
+ node->page_offset);
node->prev = be32_to_cpu(desc->prev);
node->next = be32_to_cpu(desc->next);
node->num_recs = be16_to_cpu(desc->num_recs);
node->type = desc->type;
node->height = desc->height;
- kunmap(node->page[0]);
+ kunmap_local(desc);
switch (node->type) {
case HFS_NODE_HEADER:
@@ -436,14 +430,12 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num)
}
pagep = node->page;
- memset(kmap(*pagep) + node->page_offset, 0,
- min((int)PAGE_SIZE, (int)tree->node_size));
+ memzero_page(*pagep, node->page_offset,
+ min((int)PAGE_SIZE, (int)tree->node_size));
set_page_dirty(*pagep);
- kunmap(*pagep);
for (i = 1; i < tree->pages_per_bnode; i++) {
- memset(kmap(*++pagep), 0, PAGE_SIZE);
+ memzero_page(*++pagep, 0, PAGE_SIZE);
set_page_dirty(*pagep);
- kunmap(*pagep);
}
clear_bit(HFS_BNODE_NEW, &node->flags);
wake_up(&node->lock_wq);
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 19017d296173..2fa4b1f8cc7f 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -80,7 +80,8 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
goto free_inode;
/* Load the header */
- head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
+ head = (struct hfs_btree_header_rec *)(kmap_local_page(page) +
+ sizeof(struct hfs_bnode_desc));
tree->root = be32_to_cpu(head->root);
tree->leaf_count = be32_to_cpu(head->leaf_count);
tree->leaf_head = be32_to_cpu(head->leaf_head);
@@ -119,11 +120,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
tree->node_size_shift = ffs(size) - 1;
tree->pages_per_bnode = (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- kunmap(page);
+ kunmap_local(head);
put_page(page);
return tree;
fail_page:
+ kunmap_local(head);
put_page(page);
free_inode:
tree->inode->i_mapping->a_ops = &hfs_aops;
@@ -169,7 +171,8 @@ void hfs_btree_write(struct hfs_btree *tree)
return;
/* Load the header */
page = node->page[0];
- head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
+ head = (struct hfs_btree_header_rec *)(kmap_local_page(page) +
+ sizeof(struct hfs_bnode_desc));
head->root = cpu_to_be32(tree->root);
head->leaf_count = cpu_to_be32(tree->leaf_count);
@@ -180,7 +183,7 @@ void hfs_btree_write(struct hfs_btree *tree)
head->attributes = cpu_to_be32(tree->attributes);
head->depth = cpu_to_be16(tree->depth);
- kunmap(page);
+ kunmap_local(head);
set_page_dirty(page);
hfs_bnode_put(node);
}
@@ -268,7 +271,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
off += node->page_offset;
pagep = node->page + (off >> PAGE_SHIFT);
- data = kmap(*pagep);
+ data = kmap_local_page(*pagep);
off &= ~PAGE_MASK;
idx = 0;
@@ -281,7 +284,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
idx += i;
data[off] |= m;
set_page_dirty(*pagep);
- kunmap(*pagep);
+ kunmap_local(data);
tree->free_nodes--;
mark_inode_dirty(tree->inode);
hfs_bnode_put(node);
@@ -290,14 +293,14 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
}
}
if (++off >= PAGE_SIZE) {
- kunmap(*pagep);
- data = kmap(*++pagep);
+ kunmap_local(data);
+ data = kmap_local_page(*++pagep);
off = 0;
}
idx += 8;
len--;
}
- kunmap(*pagep);
+ kunmap_local(data);
nidx = node->next;
if (!nidx) {
printk(KERN_DEBUG "create new bmap node...\n");
@@ -313,7 +316,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
off = off16;
off += node->page_offset;
pagep = node->page + (off >> PAGE_SHIFT);
- data = kmap(*pagep);
+ data = kmap_local_page(*pagep);
off &= ~PAGE_MASK;
}
}
@@ -360,20 +363,20 @@ void hfs_bmap_free(struct hfs_bnode *node)
}
off += node->page_offset + nidx / 8;
page = node->page[off >> PAGE_SHIFT];
- data = kmap(page);
+ data = kmap_local_page(page);
off &= ~PAGE_MASK;
m = 1 << (~nidx & 7);
byte = data[off];
if (!(byte & m)) {
pr_crit("trying to free free bnode %u(%d)\n",
node->this, node->type);
- kunmap(page);
+ kunmap_local(data);
hfs_bnode_put(node);
return;
}
data[off] = byte & ~m;
set_page_dirty(page);
- kunmap(page);
+ kunmap_local(data);
hfs_bnode_put(node);
tree->free_nodes++;
mark_inode_dirty(tree->inode);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index c4526f16355d..9c329a365e75 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -173,12 +173,12 @@ const struct address_space_operations hfs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = hfs_read_folio,
- .writepage = hfs_writepage,
.write_begin = hfs_write_begin,
.write_end = generic_write_end,
.bmap = hfs_bmap,
.direct_IO = hfs_direct_IO,
.writepages = hfs_writepages,
+ .migrate_folio = buffer_migrate_folio,
};
/*
@@ -458,6 +458,8 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
/* panic? */
return -EIO;
+ if (HFS_I(main_inode)->cat_key.CName.len > HFS_NAMELEN)
+ return -EIO;
fd.search_key->cat = HFS_I(main_inode)->cat_key;
if (hfs_brec_find(&fd))
/* panic? */
diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c
index 39f5e343bf4d..fdb0edb8a607 100644
--- a/fs/hfs/trans.c
+++ b/fs/hfs/trans.c
@@ -109,7 +109,7 @@ void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, const struct qstr
if (nls_io) {
wchar_t ch;
- while (srclen > 0) {
+ while (srclen > 0 && dstlen > 0) {
size = nls_io->char2uni(src, srclen, &ch);
if (size < 0) {
ch = '?';
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index cebce0cfe340..bd8dcea85588 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -39,7 +39,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size,
start = size;
goto out;
}
- pptr = kmap(page);
+ pptr = kmap_local_page(page);
curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32;
i = offset % 32;
offset &= ~(PAGE_CACHE_BITS - 1);
@@ -74,7 +74,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size,
}
curr++;
}
- kunmap(page);
+ kunmap_local(pptr);
offset += PAGE_CACHE_BITS;
if (offset >= size)
break;
@@ -84,7 +84,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size,
start = size;
goto out;
}
- curr = pptr = kmap(page);
+ curr = pptr = kmap_local_page(page);
if ((size ^ offset) / PAGE_CACHE_BITS)
end = pptr + PAGE_CACHE_BITS / 32;
else
@@ -127,7 +127,7 @@ found:
len -= 32;
}
set_page_dirty(page);
- kunmap(page);
+ kunmap_local(pptr);
offset += PAGE_CACHE_BITS;
page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS,
NULL);
@@ -135,7 +135,7 @@ found:
start = size;
goto out;
}
- pptr = kmap(page);
+ pptr = kmap_local_page(page);
curr = pptr;
end = pptr + PAGE_CACHE_BITS / 32;
}
@@ -151,7 +151,7 @@ last:
done:
*curr = cpu_to_be32(n);
set_page_dirty(page);
- kunmap(page);
+ kunmap_local(pptr);
*max = offset + (curr - pptr) * 32 + i - start;
sbi->free_blocks -= *max;
hfsplus_mark_mdb_dirty(sb);
@@ -185,7 +185,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
page = read_mapping_page(mapping, pnr, NULL);
if (IS_ERR(page))
goto kaboom;
- pptr = kmap(page);
+ pptr = kmap_local_page(page);
curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32;
end = pptr + PAGE_CACHE_BITS / 32;
len = count;
@@ -215,11 +215,11 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
if (!count)
break;
set_page_dirty(page);
- kunmap(page);
+ kunmap_local(pptr);
page = read_mapping_page(mapping, ++pnr, NULL);
if (IS_ERR(page))
goto kaboom;
- pptr = kmap(page);
+ pptr = kmap_local_page(page);
curr = pptr;
end = pptr + PAGE_CACHE_BITS / 32;
}
@@ -231,7 +231,7 @@ done:
}
out:
set_page_dirty(page);
- kunmap(page);
+ kunmap_local(pptr);
sbi->free_blocks += len;
hfsplus_mark_mdb_dirty(sb);
mutex_unlock(&sbi->alloc_mutex);
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index a5ab00e54220..87974d5e6791 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -29,14 +29,12 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
off &= ~PAGE_MASK;
l = min_t(int, len, PAGE_SIZE - off);
- memcpy(buf, kmap(*pagep) + off, l);
- kunmap(*pagep);
+ memcpy_from_page(buf, *pagep, off, l);
while ((len -= l) != 0) {
buf += l;
l = min_t(int, len, PAGE_SIZE);
- memcpy(buf, kmap(*++pagep), l);
- kunmap(*pagep);
+ memcpy_from_page(buf, *++pagep, 0, l);
}
}
@@ -82,16 +80,14 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len)
off &= ~PAGE_MASK;
l = min_t(int, len, PAGE_SIZE - off);
- memcpy(kmap(*pagep) + off, buf, l);
+ memcpy_to_page(*pagep, off, buf, l);
set_page_dirty(*pagep);
- kunmap(*pagep);
while ((len -= l) != 0) {
buf += l;
l = min_t(int, len, PAGE_SIZE);
- memcpy(kmap(*++pagep), buf, l);
+ memcpy_to_page(*++pagep, 0, buf, l);
set_page_dirty(*pagep);
- kunmap(*pagep);
}
}
@@ -112,15 +108,13 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len)
off &= ~PAGE_MASK;
l = min_t(int, len, PAGE_SIZE - off);
- memset(kmap(*pagep) + off, 0, l);
+ memzero_page(*pagep, off, l);
set_page_dirty(*pagep);
- kunmap(*pagep);
while ((len -= l) != 0) {
l = min_t(int, len, PAGE_SIZE);
- memset(kmap(*++pagep), 0, l);
+ memzero_page(*++pagep, 0, l);
set_page_dirty(*pagep);
- kunmap(*pagep);
}
}
@@ -142,24 +136,20 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
if (src == dst) {
l = min_t(int, len, PAGE_SIZE - src);
- memcpy(kmap(*dst_page) + src, kmap(*src_page) + src, l);
- kunmap(*src_page);
+ memcpy_page(*dst_page, src, *src_page, src, l);
set_page_dirty(*dst_page);
- kunmap(*dst_page);
while ((len -= l) != 0) {
l = min_t(int, len, PAGE_SIZE);
- memcpy(kmap(*++dst_page), kmap(*++src_page), l);
- kunmap(*src_page);
+ memcpy_page(*++dst_page, 0, *++src_page, 0, l);
set_page_dirty(*dst_page);
- kunmap(*dst_page);
}
} else {
void *src_ptr, *dst_ptr;
do {
- src_ptr = kmap(*src_page) + src;
- dst_ptr = kmap(*dst_page) + dst;
+ dst_ptr = kmap_local_page(*dst_page) + dst;
+ src_ptr = kmap_local_page(*src_page) + src;
if (PAGE_SIZE - src < PAGE_SIZE - dst) {
l = PAGE_SIZE - src;
src = 0;
@@ -171,9 +161,9 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
}
l = min(len, l);
memcpy(dst_ptr, src_ptr, l);
- kunmap(*src_page);
+ kunmap_local(src_ptr);
set_page_dirty(*dst_page);
- kunmap(*dst_page);
+ kunmap_local(dst_ptr);
if (!dst)
dst_page++;
else
@@ -185,6 +175,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
{
struct page **src_page, **dst_page;
+ void *src_ptr, *dst_ptr;
int l;
hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len);
@@ -202,27 +193,28 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
if (src == dst) {
while (src < len) {
- memmove(kmap(*dst_page), kmap(*src_page), src);
- kunmap(*src_page);
+ dst_ptr = kmap_local_page(*dst_page);
+ src_ptr = kmap_local_page(*src_page);
+ memmove(dst_ptr, src_ptr, src);
+ kunmap_local(src_ptr);
set_page_dirty(*dst_page);
- kunmap(*dst_page);
+ kunmap_local(dst_ptr);
len -= src;
src = PAGE_SIZE;
src_page--;
dst_page--;
}
src -= len;
- memmove(kmap(*dst_page) + src,
- kmap(*src_page) + src, len);
- kunmap(*src_page);
+ dst_ptr = kmap_local_page(*dst_page);
+ src_ptr = kmap_local_page(*src_page);
+ memmove(dst_ptr + src, src_ptr + src, len);
+ kunmap_local(src_ptr);
set_page_dirty(*dst_page);
- kunmap(*dst_page);
+ kunmap_local(dst_ptr);
} else {
- void *src_ptr, *dst_ptr;
-
do {
- src_ptr = kmap(*src_page) + src;
- dst_ptr = kmap(*dst_page) + dst;
+ dst_ptr = kmap_local_page(*dst_page) + dst;
+ src_ptr = kmap_local_page(*src_page) + src;
if (src < dst) {
l = src;
src = PAGE_SIZE;
@@ -234,9 +226,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
}
l = min(len, l);
memmove(dst_ptr - l, src_ptr - l, l);
- kunmap(*src_page);
+ kunmap_local(src_ptr);
set_page_dirty(*dst_page);
- kunmap(*dst_page);
+ kunmap_local(dst_ptr);
if (dst == PAGE_SIZE)
dst_page--;
else
@@ -251,26 +243,27 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
if (src == dst) {
l = min_t(int, len, PAGE_SIZE - src);
- memmove(kmap(*dst_page) + src,
- kmap(*src_page) + src, l);
- kunmap(*src_page);
+
+ dst_ptr = kmap_local_page(*dst_page) + src;
+ src_ptr = kmap_local_page(*src_page) + src;
+ memmove(dst_ptr, src_ptr, l);
+ kunmap_local(src_ptr);
set_page_dirty(*dst_page);
- kunmap(*dst_page);
+ kunmap_local(dst_ptr);
while ((len -= l) != 0) {
l = min_t(int, len, PAGE_SIZE);
- memmove(kmap(*++dst_page),
- kmap(*++src_page), l);
- kunmap(*src_page);
+ dst_ptr = kmap_local_page(*++dst_page);
+ src_ptr = kmap_local_page(*++src_page);
+ memmove(dst_ptr, src_ptr, l);
+ kunmap_local(src_ptr);
set_page_dirty(*dst_page);
- kunmap(*dst_page);
+ kunmap_local(dst_ptr);
}
} else {
- void *src_ptr, *dst_ptr;
-
do {
- src_ptr = kmap(*src_page) + src;
- dst_ptr = kmap(*dst_page) + dst;
+ dst_ptr = kmap_local_page(*dst_page) + dst;
+ src_ptr = kmap_local_page(*src_page) + src;
if (PAGE_SIZE - src <
PAGE_SIZE - dst) {
l = PAGE_SIZE - src;
@@ -283,9 +276,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
}
l = min(len, l);
memmove(dst_ptr, src_ptr, l);
- kunmap(*src_page);
+ kunmap_local(src_ptr);
set_page_dirty(*dst_page);
- kunmap(*dst_page);
+ kunmap_local(dst_ptr);
if (!dst)
dst_page++;
else
@@ -498,14 +491,14 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num)
if (!test_bit(HFS_BNODE_NEW, &node->flags))
return node;
- desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) +
- node->page_offset);
+ desc = (struct hfs_bnode_desc *)(kmap_local_page(node->page[0]) +
+ node->page_offset);
node->prev = be32_to_cpu(desc->prev);
node->next = be32_to_cpu(desc->next);
node->num_recs = be16_to_cpu(desc->num_recs);
node->type = desc->type;
node->height = desc->height;
- kunmap(node->page[0]);
+ kunmap_local(desc);
switch (node->type) {
case HFS_NODE_HEADER:
@@ -589,14 +582,12 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num)
}
pagep = node->page;
- memset(kmap(*pagep) + node->page_offset, 0,
- min_t(int, PAGE_SIZE, tree->node_size));
+ memzero_page(*pagep, node->page_offset,
+ min_t(int, PAGE_SIZE, tree->node_size));
set_page_dirty(*pagep);
- kunmap(*pagep);
for (i = 1; i < tree->pages_per_bnode; i++) {
- memset(kmap(*++pagep), 0, PAGE_SIZE);
+ memzero_page(*++pagep, 0, PAGE_SIZE);
set_page_dirty(*pagep);
- kunmap(*pagep);
}
clear_bit(HFS_BNODE_NEW, &node->flags);
wake_up(&node->lock_wq);
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 66774f4cb4fd..9e1732a2b92a 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -163,7 +163,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
goto free_inode;
/* Load the header */
- head = (struct hfs_btree_header_rec *)(kmap(page) +
+ head = (struct hfs_btree_header_rec *)(kmap_local_page(page) +
sizeof(struct hfs_bnode_desc));
tree->root = be32_to_cpu(head->root);
tree->leaf_count = be32_to_cpu(head->leaf_count);
@@ -240,11 +240,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
(tree->node_size + PAGE_SIZE - 1) >>
PAGE_SHIFT;
- kunmap(page);
+ kunmap_local(head);
put_page(page);
return tree;
fail_page:
+ kunmap_local(head);
put_page(page);
free_inode:
tree->inode->i_mapping->a_ops = &hfsplus_aops;
@@ -291,7 +292,7 @@ int hfs_btree_write(struct hfs_btree *tree)
return -EIO;
/* Load the header */
page = node->page[0];
- head = (struct hfs_btree_header_rec *)(kmap(page) +
+ head = (struct hfs_btree_header_rec *)(kmap_local_page(page) +
sizeof(struct hfs_bnode_desc));
head->root = cpu_to_be32(tree->root);
@@ -303,7 +304,7 @@ int hfs_btree_write(struct hfs_btree *tree)
head->attributes = cpu_to_be32(tree->attributes);
head->depth = cpu_to_be16(tree->depth);
- kunmap(page);
+ kunmap_local(head);
set_page_dirty(page);
hfs_bnode_put(node);
return 0;
@@ -394,7 +395,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
off += node->page_offset;
pagep = node->page + (off >> PAGE_SHIFT);
- data = kmap(*pagep);
+ data = kmap_local_page(*pagep);
off &= ~PAGE_MASK;
idx = 0;
@@ -407,7 +408,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
idx += i;
data[off] |= m;
set_page_dirty(*pagep);
- kunmap(*pagep);
+ kunmap_local(data);
tree->free_nodes--;
mark_inode_dirty(tree->inode);
hfs_bnode_put(node);
@@ -417,14 +418,14 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
}
}
if (++off >= PAGE_SIZE) {
- kunmap(*pagep);
- data = kmap(*++pagep);
+ kunmap_local(data);
+ data = kmap_local_page(*++pagep);
off = 0;
}
idx += 8;
len--;
}
- kunmap(*pagep);
+ kunmap_local(data);
nidx = node->next;
if (!nidx) {
hfs_dbg(BNODE_MOD, "create new bmap node\n");
@@ -440,7 +441,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
off = off16;
off += node->page_offset;
pagep = node->page + (off >> PAGE_SHIFT);
- data = kmap(*pagep);
+ data = kmap_local_page(*pagep);
off &= ~PAGE_MASK;
}
}
@@ -490,7 +491,7 @@ void hfs_bmap_free(struct hfs_bnode *node)
}
off += node->page_offset + nidx / 8;
page = node->page[off >> PAGE_SHIFT];
- data = kmap(page);
+ data = kmap_local_page(page);
off &= ~PAGE_MASK;
m = 1 << (~nidx & 7);
byte = data[off];
@@ -498,13 +499,13 @@ void hfs_bmap_free(struct hfs_bnode *node)
pr_crit("trying to free free bnode "
"%u(%d)\n",
node->this, node->type);
- kunmap(page);
+ kunmap_local(data);
hfs_bnode_put(node);
return;
}
data[off] = byte & ~m;
set_page_dirty(page);
- kunmap(page);
+ kunmap_local(data);
hfs_bnode_put(node);
tree->free_nodes++;
mark_inode_dirty(tree->inode);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index a5db2e3b2980..6aa919e59483 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -198,6 +198,8 @@ struct hfsplus_sb_info {
#define HFSPLUS_SB_HFSX 3
#define HFSPLUS_SB_CASEFOLD 4
#define HFSPLUS_SB_NOBARRIER 5
+#define HFSPLUS_SB_UID 6
+#define HFSPLUS_SB_GID 7
static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
{
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index aeab83ed1c9c..840577a0c1e7 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -170,12 +170,12 @@ const struct address_space_operations hfsplus_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = hfsplus_read_folio,
- .writepage = hfsplus_writepage,
.write_begin = hfsplus_write_begin,
.write_end = generic_write_end,
.bmap = hfsplus_bmap,
.direct_IO = hfsplus_direct_IO,
.writepages = hfsplus_writepages,
+ .migrate_folio = buffer_migrate_folio,
};
const struct dentry_operations hfsplus_dentry_operations = {
@@ -192,11 +192,11 @@ static void hfsplus_get_perms(struct inode *inode,
mode = be16_to_cpu(perms->mode);
i_uid_write(inode, be32_to_cpu(perms->owner));
- if (!i_uid_read(inode) && !mode)
+ if ((test_bit(HFSPLUS_SB_UID, &sbi->flags)) || (!i_uid_read(inode) && !mode))
inode->i_uid = sbi->uid;
i_gid_write(inode, be32_to_cpu(perms->group));
- if (!i_gid_read(inode) && !mode)
+ if ((test_bit(HFSPLUS_SB_GID, &sbi->flags)) || (!i_gid_read(inode) && !mode))
inode->i_gid = sbi->gid;
if (dir) {
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 047e05c57560..c94a58762ad6 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -140,6 +140,8 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
if (!uid_valid(sbi->uid)) {
pr_err("invalid uid specified\n");
return 0;
+ } else {
+ set_bit(HFSPLUS_SB_UID, &sbi->flags);
}
break;
case opt_gid:
@@ -151,6 +153,8 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
if (!gid_valid(sbi->gid)) {
pr_err("invalid gid specified\n");
return 0;
+ } else {
+ set_bit(HFSPLUS_SB_GID, &sbi->flags);
}
break;
case opt_part:
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 07881b76d42f..277468783fee 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -103,7 +103,7 @@ static char *__dentry_name(struct dentry *dentry, char *name)
*/
BUG_ON(p + strlen(p) + 1 != name + PATH_MAX);
- strlcpy(name, root, PATH_MAX);
+ strscpy(name, root, PATH_MAX);
if (len > p - name) {
__putname(name);
return NULL;
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index f7547a62c81f..88952d4a631e 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -163,11 +163,6 @@ static int hpfs_read_folio(struct file *file, struct folio *folio)
return mpage_read_folio(folio, hpfs_get_block);
}
-static int hpfs_writepage(struct page *page, struct writeback_control *wbc)
-{
- return block_write_full_page(page, hpfs_get_block, wbc);
-}
-
static void hpfs_readahead(struct readahead_control *rac)
{
mpage_readahead(rac, hpfs_get_block);
@@ -248,12 +243,12 @@ const struct address_space_operations hpfs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = hpfs_read_folio,
- .writepage = hpfs_writepage,
.readahead = hpfs_readahead,
.writepages = hpfs_writepages,
.write_begin = hpfs_write_begin,
.write_end = hpfs_write_end,
- .bmap = _hpfs_bmap
+ .bmap = _hpfs_bmap,
+ .migrate_folio = buffer_migrate_folio,
};
const struct file_operations hpfs_file_ops =
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index f7a5b5124d8a..790d2727141a 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -328,6 +328,12 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
} else {
unlock_page(page);
+ if (PageHWPoison(page)) {
+ put_page(page);
+ retval = -EIO;
+ break;
+ }
+
/*
* We have the page, copy it to user space buffer.
*/
@@ -364,11 +370,153 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
return -EINVAL;
}
-static void remove_huge_page(struct page *page)
+static void hugetlb_delete_from_page_cache(struct folio *folio)
+{
+ folio_clear_dirty(folio);
+ folio_clear_uptodate(folio);
+ filemap_remove_folio(folio);
+}
+
+/*
+ * Called with i_mmap_rwsem held for inode based vma maps. This makes
+ * sure vma (and vm_mm) will not go away. We also hold the hugetlb fault
+ * mutex for the page in the mapping. So, we can not race with page being
+ * faulted into the vma.
+ */
+static bool hugetlb_vma_maps_page(struct vm_area_struct *vma,
+ unsigned long addr, struct page *page)
+{
+ pte_t *ptep, pte;
+
+ ptep = huge_pte_offset(vma->vm_mm, addr,
+ huge_page_size(hstate_vma(vma)));
+
+ if (!ptep)
+ return false;
+
+ pte = huge_ptep_get(ptep);
+ if (huge_pte_none(pte) || !pte_present(pte))
+ return false;
+
+ if (pte_page(pte) == page)
+ return true;
+
+ return false;
+}
+
+/*
+ * Can vma_offset_start/vma_offset_end overflow on 32-bit arches?
+ * No, because the interval tree returns us only those vmas
+ * which overlap the truncated area starting at pgoff,
+ * and no vma on a 32-bit arch can span beyond the 4GB.
+ */
+static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start)
+{
+ if (vma->vm_pgoff < start)
+ return (start - vma->vm_pgoff) << PAGE_SHIFT;
+ else
+ return 0;
+}
+
+static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end)
+{
+ unsigned long t_end;
+
+ if (!end)
+ return vma->vm_end;
+
+ t_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + vma->vm_start;
+ if (t_end > vma->vm_end)
+ t_end = vma->vm_end;
+ return t_end;
+}
+
+/*
+ * Called with hugetlb fault mutex held. Therefore, no more mappings to
+ * this folio can be created while executing the routine.
+ */
+static void hugetlb_unmap_file_folio(struct hstate *h,
+ struct address_space *mapping,
+ struct folio *folio, pgoff_t index)
{
- ClearPageDirty(page);
- ClearPageUptodate(page);
- delete_from_page_cache(page);
+ struct rb_root_cached *root = &mapping->i_mmap;
+ struct hugetlb_vma_lock *vma_lock;
+ struct page *page = &folio->page;
+ struct vm_area_struct *vma;
+ unsigned long v_start;
+ unsigned long v_end;
+ pgoff_t start, end;
+
+ start = index * pages_per_huge_page(h);
+ end = (index + 1) * pages_per_huge_page(h);
+
+ i_mmap_lock_write(mapping);
+retry:
+ vma_lock = NULL;
+ vma_interval_tree_foreach(vma, root, start, end - 1) {
+ v_start = vma_offset_start(vma, start);
+ v_end = vma_offset_end(vma, end);
+
+ if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
+ continue;
+
+ if (!hugetlb_vma_trylock_write(vma)) {
+ vma_lock = vma->vm_private_data;
+ /*
+ * If we can not get vma lock, we need to drop
+ * immap_sema and take locks in order. First,
+ * take a ref on the vma_lock structure so that
+ * we can be guaranteed it will not go away when
+ * dropping immap_sema.
+ */
+ kref_get(&vma_lock->refs);
+ break;
+ }
+
+ unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
+ NULL, ZAP_FLAG_DROP_MARKER);
+ hugetlb_vma_unlock_write(vma);
+ }
+
+ i_mmap_unlock_write(mapping);
+
+ if (vma_lock) {
+ /*
+ * Wait on vma_lock. We know it is still valid as we have
+ * a reference. We must 'open code' vma locking as we do
+ * not know if vma_lock is still attached to vma.
+ */
+ down_write(&vma_lock->rw_sema);
+ i_mmap_lock_write(mapping);
+
+ vma = vma_lock->vma;
+ if (!vma) {
+ /*
+ * If lock is no longer attached to vma, then just
+ * unlock, drop our reference and retry looking for
+ * other vmas.
+ */
+ up_write(&vma_lock->rw_sema);
+ kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
+ goto retry;
+ }
+
+ /*
+ * vma_lock is still attached to vma. Check to see if vma
+ * still maps page and if so, unmap.
+ */
+ v_start = vma_offset_start(vma, start);
+ v_end = vma_offset_end(vma, end);
+ if (hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
+ unmap_hugepage_range(vma, vma->vm_start + v_start,
+ v_end, NULL,
+ ZAP_FLAG_DROP_MARKER);
+
+ kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
+ hugetlb_vma_unlock_write(vma);
+
+ goto retry;
+ }
}
static void
@@ -383,32 +531,66 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
* an inclusive "last".
*/
vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) {
- unsigned long v_offset;
+ unsigned long v_start;
unsigned long v_end;
+ if (!hugetlb_vma_trylock_write(vma))
+ continue;
+
+ v_start = vma_offset_start(vma, start);
+ v_end = vma_offset_end(vma, end);
+
+ unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
+ NULL, zap_flags);
+
/*
- * Can the expression below overflow on 32-bit arches?
- * No, because the interval tree returns us only those vmas
- * which overlap the truncated area starting at pgoff,
- * and no vma on a 32-bit arch can span beyond the 4GB.
+ * Note that vma lock only exists for shared/non-private
+ * vmas. Therefore, lock is not held when calling
+ * unmap_hugepage_range for private vmas.
*/
- if (vma->vm_pgoff < start)
- v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
- else
- v_offset = 0;
-
- if (!end)
- v_end = vma->vm_end;
- else {
- v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
- + vma->vm_start;
- if (v_end > vma->vm_end)
- v_end = vma->vm_end;
- }
+ hugetlb_vma_unlock_write(vma);
+ }
+}
- unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
- NULL, zap_flags);
+/*
+ * Called with hugetlb fault mutex held.
+ * Returns true if page was actually removed, false otherwise.
+ */
+static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
+ struct address_space *mapping,
+ struct folio *folio, pgoff_t index,
+ bool truncate_op)
+{
+ bool ret = false;
+
+ /*
+ * If folio is mapped, it was faulted in after being
+ * unmapped in caller. Unmap (again) while holding
+ * the fault mutex. The mutex will prevent faults
+ * until we finish removing the folio.
+ */
+ if (unlikely(folio_mapped(folio)))
+ hugetlb_unmap_file_folio(h, mapping, folio, index);
+
+ folio_lock(folio);
+ /*
+ * We must remove the folio from page cache before removing
+ * the region/ reserve map (hugetlb_unreserve_pages). In
+ * rare out of memory conditions, removal of the region/reserve
+ * map could fail. Correspondingly, the subpool and global
+ * reserve usage count can need to be adjusted.
+ */
+ VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio);
+ hugetlb_delete_from_page_cache(folio);
+ ret = true;
+ if (!truncate_op) {
+ if (unlikely(hugetlb_unreserve_pages(inode, index,
+ index + 1, 1)))
+ hugetlb_fix_reserve_counts(inode);
}
+
+ folio_unlock(folio);
+ return ret;
}
/*
@@ -418,10 +600,10 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
* truncation is indicated by end of range being LLONG_MAX
* In this case, we first scan the range and release found pages.
* After releasing pages, hugetlb_unreserve_pages cleans up region/reserve
- * maps and global counts. Page faults can not race with truncation
- * in this routine. hugetlb_no_page() holds i_mmap_rwsem and prevents
- * page faults in the truncated range by checking i_size. i_size is
- * modified while holding i_mmap_rwsem.
+ * maps and global counts. Page faults can race with truncation.
+ * During faults, hugetlb_no_page() checks i_size before page allocation,
+ * and again after obtaining page table lock. It will 'back out'
+ * allocations in the truncated range.
* hole punch is indicated if end is not LLONG_MAX
* In the hole punch case we scan the range and release found pages.
* Only when releasing a page is the associated region/reserve map
@@ -451,61 +633,17 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
u32 hash = 0;
index = folio->index;
- if (!truncate_op) {
- /*
- * Only need to hold the fault mutex in the
- * hole punch case. This prevents races with
- * page faults. Races are not possible in the
- * case of truncation.
- */
- hash = hugetlb_fault_mutex_hash(mapping, index);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
- }
+ hash = hugetlb_fault_mutex_hash(mapping, index);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
/*
- * If folio is mapped, it was faulted in after being
- * unmapped in caller. Unmap (again) now after taking
- * the fault mutex. The mutex will prevent faults
- * until we finish removing the folio.
- *
- * This race can only happen in the hole punch case.
- * Getting here in a truncate operation is a bug.
+ * Remove folio that was part of folio_batch.
*/
- if (unlikely(folio_mapped(folio))) {
- BUG_ON(truncate_op);
-
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_lock_write(mapping);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
- hugetlb_vmdelete_list(&mapping->i_mmap,
- index * pages_per_huge_page(h),
- (index + 1) * pages_per_huge_page(h),
- ZAP_FLAG_DROP_MARKER);
- i_mmap_unlock_write(mapping);
- }
+ if (remove_inode_single_folio(h, inode, mapping, folio,
+ index, truncate_op))
+ freed++;
- folio_lock(folio);
- /*
- * We must free the huge page and remove from page
- * cache (remove_huge_page) BEFORE removing the
- * region/reserve map (hugetlb_unreserve_pages). In
- * rare out of memory conditions, removal of the
- * region/reserve map could fail. Correspondingly,
- * the subpool and global reserve usage count can need
- * to be adjusted.
- */
- VM_BUG_ON(HPageRestoreReserve(&folio->page));
- remove_huge_page(&folio->page);
- freed++;
- if (!truncate_op) {
- if (unlikely(hugetlb_unreserve_pages(inode,
- index, index + 1, 1)))
- hugetlb_fix_reserve_counts(inode);
- }
-
- folio_unlock(folio);
- if (!truncate_op)
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
folio_batch_release(&fbatch);
cond_resched();
@@ -543,8 +681,8 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
BUG_ON(offset & ~huge_page_mask(h));
pgoff = offset >> PAGE_SHIFT;
- i_mmap_lock_write(mapping);
i_size_write(inode, offset);
+ i_mmap_lock_write(mapping);
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
ZAP_FLAG_DROP_MARKER);
@@ -703,11 +841,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
/* addr is the offset within the file (zero based) */
addr = index * hpage_size;
- /*
- * fault mutex taken here, protects against fault path
- * and hole punch. inode_lock previously taken protects
- * against truncation.
- */
+ /* mutex taken here, fault path and hole punch */
hash = hugetlb_fault_mutex_hash(mapping, index);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -737,7 +871,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
}
clear_huge_page(page, addr, pages_per_huge_page(h));
__SetPageUptodate(page);
- error = huge_add_to_page_cache(page, mapping, index);
+ error = hugetlb_add_to_page_cache(page, mapping, index);
if (unlikely(error)) {
restore_reserve_on_error(h, &pseudo_vma, addr, page);
put_page(page);
@@ -749,7 +883,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
SetHPageMigratable(page);
/*
- * unlock_page because locked by huge_add_to_page_cache()
+ * unlock_page because locked by hugetlb_add_to_page_cache()
* put_page() due to reference from alloc_huge_page()
*/
unlock_page(page);
@@ -885,33 +1019,18 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
/*
* File creation. Allocate an inode, and we're done..
*/
-static int do_hugetlbfs_mknod(struct inode *dir,
- struct dentry *dentry,
- umode_t mode,
- dev_t dev,
- bool tmpfile)
+static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t dev)
{
struct inode *inode;
- int error = -ENOSPC;
inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
- if (inode) {
- dir->i_ctime = dir->i_mtime = current_time(dir);
- if (tmpfile) {
- d_tmpfile(dentry, inode);
- } else {
- d_instantiate(dentry, inode);
- dget(dentry);/* Extra count - pin the dentry in core */
- }
- error = 0;
- }
- return error;
-}
-
-static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode, dev_t dev)
-{
- return do_hugetlbfs_mknod(dir, dentry, mode, dev, false);
+ if (!inode)
+ return -ENOSPC;
+ dir->i_ctime = dir->i_mtime = current_time(dir);
+ d_instantiate(dentry, inode);
+ dget(dentry);/* Extra count - pin the dentry in core */
+ return 0;
}
static int hugetlbfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
@@ -932,10 +1051,17 @@ static int hugetlbfs_create(struct user_namespace *mnt_userns,
}
static int hugetlbfs_tmpfile(struct user_namespace *mnt_userns,
- struct inode *dir, struct dentry *dentry,
+ struct inode *dir, struct file *file,
umode_t mode)
{
- return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true);
+ struct inode *inode;
+
+ inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0);
+ if (!inode)
+ return -ENOSPC;
+ dir->i_ctime = dir->i_mtime = current_time(dir);
+ d_tmpfile(file, inode);
+ return finish_open_simple(file, 0);
}
static int hugetlbfs_symlink(struct user_namespace *mnt_userns,
@@ -971,10 +1097,10 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping,
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
- if (hugetlb_page_subpool(&src->page)) {
- hugetlb_set_page_subpool(&dst->page,
- hugetlb_page_subpool(&src->page));
- hugetlb_set_page_subpool(&src->page, NULL);
+ if (hugetlb_folio_subpool(src)) {
+ hugetlb_set_folio_subpool(dst,
+ hugetlb_folio_subpool(src));
+ hugetlb_set_folio_subpool(src, NULL);
}
if (mode != MIGRATE_SYNC_NO_COPY)
@@ -991,13 +1117,6 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping,
static int hugetlbfs_error_remove_page(struct address_space *mapping,
struct page *page)
{
- struct inode *inode = mapping->host;
- pgoff_t index = page->index;
-
- remove_huge_page(page);
- if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1)))
- hugetlb_fix_reserve_counts(inode);
-
return 0;
}
@@ -1160,7 +1279,7 @@ static const struct address_space_operations hugetlbfs_aops = {
static void init_once(void *foo)
{
- struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
+ struct hugetlbfs_inode_info *ei = foo;
inode_init_once(&ei->vfs_inode);
}
@@ -1258,7 +1377,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
case Opt_size:
/* memparse() will accept a K/M/G without a digit */
- if (!isdigit(param->string[0]))
+ if (!param->string || !isdigit(param->string[0]))
goto bad_val;
ctx->max_size_opt = memparse(param->string, &rest);
ctx->max_val_type = SIZE_STD;
@@ -1268,7 +1387,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
case Opt_nr_inodes:
/* memparse() will accept a K/M/G without a digit */
- if (!isdigit(param->string[0]))
+ if (!param->string || !isdigit(param->string[0]))
goto bad_val;
ctx->nr_inodes = memparse(param->string, &rest);
return 0;
@@ -1284,7 +1403,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
case Opt_min_size:
/* memparse() will accept a K/M/G without a digit */
- if (!isdigit(param->string[0]))
+ if (!param->string || !isdigit(param->string[0]))
goto bad_val;
ctx->min_size_opt = memparse(param->string, &rest);
ctx->min_val_type = SIZE_STD;
diff --git a/fs/inode.c b/fs/inode.c
index ba1de23c13c1..f453eb58fd03 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -192,8 +192,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_wb_frn_history = 0;
#endif
- if (security_inode_alloc(inode))
- goto out;
spin_lock_init(&inode->i_lock);
lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
@@ -228,11 +226,12 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_fsnotify_mask = 0;
#endif
inode->i_flctx = NULL;
+
+ if (unlikely(security_inode_alloc(inode)))
+ return -ENOMEM;
this_cpu_inc(nr_inodes);
return 0;
-out:
- return -ENOMEM;
}
EXPORT_SYMBOL(inode_init_always);
@@ -1950,40 +1949,12 @@ skip_update:
EXPORT_SYMBOL(touch_atime);
/*
- * The logic we want is
- *
- * if suid or (sgid and xgrp)
- * remove privs
- */
-int should_remove_suid(struct dentry *dentry)
-{
- umode_t mode = d_inode(dentry)->i_mode;
- int kill = 0;
-
- /* suid always must be killed */
- if (unlikely(mode & S_ISUID))
- kill = ATTR_KILL_SUID;
-
- /*
- * sgid without any exec bits is just a mandatory locking mark; leave
- * it alone. If some exec bits are set, it's a real sgid; kill it.
- */
- if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
- kill |= ATTR_KILL_SGID;
-
- if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
- return kill;
-
- return 0;
-}
-EXPORT_SYMBOL(should_remove_suid);
-
-/*
* Return mask of changes for notify_change() that need to be done as a
* response to write or truncate. Return 0 if nothing has to be changed.
* Negative value on error (change should be denied).
*/
-int dentry_needs_remove_privs(struct dentry *dentry)
+int dentry_needs_remove_privs(struct user_namespace *mnt_userns,
+ struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
int mask = 0;
@@ -1992,7 +1963,7 @@ int dentry_needs_remove_privs(struct dentry *dentry)
if (IS_NOSEC(inode))
return 0;
- mask = should_remove_suid(dentry);
+ mask = setattr_should_drop_suidgid(mnt_userns, inode);
ret = security_inode_need_killpriv(dentry);
if (ret < 0)
return ret;
@@ -2024,7 +1995,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags)
if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
return 0;
- kill = dentry_needs_remove_privs(dentry);
+ kill = dentry_needs_remove_privs(file_mnt_user_ns(file), dentry);
if (kill < 0)
return kill;
@@ -2072,9 +2043,6 @@ static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
sync_it |= S_VERSION;
- if (!sync_it)
- return 0;
-
return sync_it;
}
@@ -2355,15 +2323,15 @@ EXPORT_SYMBOL(inode_init_owner);
bool inode_owner_or_capable(struct user_namespace *mnt_userns,
const struct inode *inode)
{
- kuid_t i_uid;
+ vfsuid_t vfsuid;
struct user_namespace *ns;
- i_uid = i_uid_into_mnt(mnt_userns, inode);
- if (uid_eq(current_fsuid(), i_uid))
+ vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
return true;
ns = current_user_ns();
- if (kuid_has_mapping(ns, i_uid) && ns_capable(ns, CAP_FOWNER))
+ if (vfsuid_has_mapping(ns, vfsuid) && ns_capable(ns, CAP_FOWNER))
return true;
return false;
}
@@ -2489,6 +2457,28 @@ struct timespec64 current_time(struct inode *inode)
EXPORT_SYMBOL(current_time);
/**
+ * in_group_or_capable - check whether caller is CAP_FSETID privileged
+ * @mnt_userns: user namespace of the mount @inode was found from
+ * @inode: inode to check
+ * @vfsgid: the new/current vfsgid of @inode
+ *
+ * Check wether @vfsgid is in the caller's group list or if the caller is
+ * privileged with CAP_FSETID over @inode. This can be used to determine
+ * whether the setgid bit can be kept or must be dropped.
+ *
+ * Return: true if the caller is sufficiently privileged, false if not.
+ */
+bool in_group_or_capable(struct user_namespace *mnt_userns,
+ const struct inode *inode, vfsgid_t vfsgid)
+{
+ if (vfsgid_in_group_p(vfsgid))
+ return true;
+ if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
+ return true;
+ return false;
+}
+
+/**
* mode_strip_sgid - handle the sgid bit for non-directories
* @mnt_userns: User namespace of the mount the inode was created from
* @dir: parent directory inode
@@ -2509,11 +2499,9 @@ umode_t mode_strip_sgid(struct user_namespace *mnt_userns,
return mode;
if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID))
return mode;
- if (in_group_p(i_gid_into_mnt(mnt_userns, dir)))
- return mode;
- if (capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID))
+ if (in_group_or_capable(mnt_userns, dir,
+ i_gid_into_vfsgid(mnt_userns, dir)))
return mode;
-
return mode & ~S_ISGID;
}
EXPORT_SYMBOL(mode_strip_sgid);
diff --git a/fs/internal.h b/fs/internal.h
index 87e96b9024ce..a803cc3cf716 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -16,6 +16,7 @@ struct shrink_control;
struct fs_context;
struct user_namespace;
struct pipe_inode_info;
+struct iov_iter;
/*
* block/bdev.c
@@ -62,7 +63,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
const char *, unsigned int, struct path *);
int do_rmdir(int dfd, struct filename *name);
int do_unlinkat(int dfd, struct filename *name);
-int may_linkat(struct user_namespace *mnt_userns, struct path *link);
+int may_linkat(struct user_namespace *mnt_userns, const struct path *link);
int do_renameat2(int olddfd, struct filename *oldname, int newdfd,
struct filename *newname, unsigned int flags);
int do_mkdirat(int dfd, struct filename *name, umode_t mode);
@@ -101,6 +102,16 @@ extern void chroot_fs_refs(const struct path *, const struct path *);
extern struct file *alloc_empty_file(int, const struct cred *);
extern struct file *alloc_empty_file_noaccount(int, const struct cred *);
+static inline void put_file_access(struct file *file)
+{
+ if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
+ i_readcount_dec(file->f_inode);
+ } else if (file->f_mode & FMODE_WRITER) {
+ put_write_access(file->f_inode);
+ __mnt_drop_write(file->f_path.mnt);
+ }
+}
+
/*
* super.c
*/
@@ -139,7 +150,9 @@ extern int vfs_open(const struct path *, struct file *);
* inode.c
*/
extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
-extern int dentry_needs_remove_privs(struct dentry *dentry);
+int dentry_needs_remove_privs(struct user_namespace *, struct dentry *dentry);
+bool in_group_or_capable(struct user_namespace *mnt_userns,
+ const struct inode *inode, vfsgid_t vfsgid);
/*
* fs-writeback.c
@@ -214,10 +227,39 @@ struct xattr_ctx {
};
-ssize_t do_getxattr(struct user_namespace *mnt_userns,
+ssize_t do_getxattr(struct mnt_idmap *idmap,
struct dentry *d,
struct xattr_ctx *ctx);
int setxattr_copy(const char __user *name, struct xattr_ctx *ctx);
-int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct xattr_ctx *ctx);
+int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode);
+
+#ifdef CONFIG_FS_POSIX_ACL
+int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
+ const char *acl_name, const void *kvalue, size_t size);
+ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry,
+ const char *acl_name, void *kvalue, size_t size);
+#else
+static inline int do_set_acl(struct mnt_idmap *idmap,
+ struct dentry *dentry, const char *acl_name,
+ const void *kvalue, size_t size)
+{
+ return -EOPNOTSUPP;
+}
+static inline ssize_t do_get_acl(struct mnt_idmap *idmap,
+ struct dentry *dentry, const char *acl_name,
+ void *kvalue, size_t size)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos);
+
+/*
+ * fs/attr.c
+ */
+int setattr_should_drop_sgid(struct user_namespace *mnt_userns,
+ const struct inode *inode);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index ca5c62901541..356193e44cf0 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -584,7 +584,7 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter,
return iomap_read_inline_data(iter, folio);
}
-static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
+static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
size_t len, struct folio **foliop)
{
const struct iomap_page_ops *page_ops = iter->iomap.page_ops;
@@ -618,6 +618,27 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM;
goto out_no_page;
}
+
+ /*
+ * Now we have a locked folio, before we do anything with it we need to
+ * check that the iomap we have cached is not stale. The inode extent
+ * mapping can change due to concurrent IO in flight (e.g.
+ * IOMAP_UNWRITTEN state can change and memory reclaim could have
+ * reclaimed a previously partially written page at this index after IO
+ * completion before this write reaches this file offset) and hence we
+ * could do the wrong thing here (zero a page range incorrectly or fail
+ * to zero) and corrupt data.
+ */
+ if (page_ops && page_ops->iomap_valid) {
+ bool iomap_valid = page_ops->iomap_valid(iter->inode,
+ &iter->iomap);
+ if (!iomap_valid) {
+ iter->iomap.flags |= IOMAP_F_STALE;
+ status = 0;
+ goto out_unlock;
+ }
+ }
+
if (pos + len > folio_pos(folio) + folio_size(folio))
len = folio_pos(folio) + folio_size(folio) - pos;
@@ -773,6 +794,8 @@ again:
status = iomap_write_begin(iter, pos, bytes, &folio);
if (unlikely(status))
break;
+ if (iter->iomap.flags & IOMAP_F_STALE)
+ break;
page = folio_file_page(folio, pos >> PAGE_SHIFT);
if (mapping_writably_mapped(mapping))
@@ -832,6 +855,231 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
+/*
+ * Scan the data range passed to us for dirty page cache folios. If we find a
+ * dirty folio, punch out the preceeding range and update the offset from which
+ * the next punch will start from.
+ *
+ * We can punch out storage reservations under clean pages because they either
+ * contain data that has been written back - in which case the delalloc punch
+ * over that range is a no-op - or they have been read faults in which case they
+ * contain zeroes and we can remove the delalloc backing range and any new
+ * writes to those pages will do the normal hole filling operation...
+ *
+ * This makes the logic simple: we only need to keep the delalloc extents only
+ * over the dirty ranges of the page cache.
+ *
+ * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
+ * simplify range iterations.
+ */
+static int iomap_write_delalloc_scan(struct inode *inode,
+ loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
+ int (*punch)(struct inode *inode, loff_t offset, loff_t length))
+{
+ while (start_byte < end_byte) {
+ struct folio *folio;
+
+ /* grab locked page */
+ folio = filemap_lock_folio(inode->i_mapping,
+ start_byte >> PAGE_SHIFT);
+ if (!folio) {
+ start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) +
+ PAGE_SIZE;
+ continue;
+ }
+
+ /* if dirty, punch up to offset */
+ if (folio_test_dirty(folio)) {
+ if (start_byte > *punch_start_byte) {
+ int error;
+
+ error = punch(inode, *punch_start_byte,
+ start_byte - *punch_start_byte);
+ if (error) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return error;
+ }
+ }
+
+ /*
+ * Make sure the next punch start is correctly bound to
+ * the end of this data range, not the end of the folio.
+ */
+ *punch_start_byte = min_t(loff_t, end_byte,
+ folio_next_index(folio) << PAGE_SHIFT);
+ }
+
+ /* move offset to start of next folio in range */
+ start_byte = folio_next_index(folio) << PAGE_SHIFT;
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+ return 0;
+}
+
+/*
+ * Punch out all the delalloc blocks in the range given except for those that
+ * have dirty data still pending in the page cache - those are going to be
+ * written and so must still retain the delalloc backing for writeback.
+ *
+ * As we are scanning the page cache for data, we don't need to reimplement the
+ * wheel - mapping_seek_hole_data() does exactly what we need to identify the
+ * start and end of data ranges correctly even for sub-folio block sizes. This
+ * byte range based iteration is especially convenient because it means we
+ * don't have to care about variable size folios, nor where the start or end of
+ * the data range lies within a folio, if they lie within the same folio or even
+ * if there are multiple discontiguous data ranges within the folio.
+ *
+ * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so
+ * can return data ranges that exist in the cache beyond EOF. e.g. a page fault
+ * spanning EOF will initialise the post-EOF data to zeroes and mark it up to
+ * date. A write page fault can then mark it dirty. If we then fail a write()
+ * beyond EOF into that up to date cached range, we allocate a delalloc block
+ * beyond EOF and then have to punch it out. Because the range is up to date,
+ * mapping_seek_hole_data() will return it, and we will skip the punch because
+ * the folio is dirty. THis is incorrect - we always need to punch out delalloc
+ * beyond EOF in this case as writeback will never write back and covert that
+ * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF,
+ * resulting in always punching out the range from the EOF to the end of the
+ * range the iomap spans.
+ *
+ * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because it
+ * matches the intervals returned by mapping_seek_hole_data(). i.e. SEEK_DATA
+ * returns the start of a data range (start_byte), and SEEK_HOLE(start_byte)
+ * returns the end of the data range (data_end). Using closed intervals would
+ * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose
+ * the code to subtle off-by-one bugs....
+ */
+static int iomap_write_delalloc_release(struct inode *inode,
+ loff_t start_byte, loff_t end_byte,
+ int (*punch)(struct inode *inode, loff_t pos, loff_t length))
+{
+ loff_t punch_start_byte = start_byte;
+ loff_t scan_end_byte = min(i_size_read(inode), end_byte);
+ int error = 0;
+
+ /*
+ * Lock the mapping to avoid races with page faults re-instantiating
+ * folios and dirtying them via ->page_mkwrite whilst we walk the
+ * cache and perform delalloc extent removal. Failing to do this can
+ * leave dirty pages with no space reservation in the cache.
+ */
+ filemap_invalidate_lock(inode->i_mapping);
+ while (start_byte < scan_end_byte) {
+ loff_t data_end;
+
+ start_byte = mapping_seek_hole_data(inode->i_mapping,
+ start_byte, scan_end_byte, SEEK_DATA);
+ /*
+ * If there is no more data to scan, all that is left is to
+ * punch out the remaining range.
+ */
+ if (start_byte == -ENXIO || start_byte == scan_end_byte)
+ break;
+ if (start_byte < 0) {
+ error = start_byte;
+ goto out_unlock;
+ }
+ WARN_ON_ONCE(start_byte < punch_start_byte);
+ WARN_ON_ONCE(start_byte > scan_end_byte);
+
+ /*
+ * We find the end of this contiguous cached data range by
+ * seeking from start_byte to the beginning of the next hole.
+ */
+ data_end = mapping_seek_hole_data(inode->i_mapping, start_byte,
+ scan_end_byte, SEEK_HOLE);
+ if (data_end < 0) {
+ error = data_end;
+ goto out_unlock;
+ }
+ WARN_ON_ONCE(data_end <= start_byte);
+ WARN_ON_ONCE(data_end > scan_end_byte);
+
+ error = iomap_write_delalloc_scan(inode, &punch_start_byte,
+ start_byte, data_end, punch);
+ if (error)
+ goto out_unlock;
+
+ /* The next data search starts at the end of this one. */
+ start_byte = data_end;
+ }
+
+ if (punch_start_byte < end_byte)
+ error = punch(inode, punch_start_byte,
+ end_byte - punch_start_byte);
+out_unlock:
+ filemap_invalidate_unlock(inode->i_mapping);
+ return error;
+}
+
+/*
+ * When a short write occurs, the filesystem may need to remove reserved space
+ * that was allocated in ->iomap_begin from it's ->iomap_end method. For
+ * filesystems that use delayed allocation, we need to punch out delalloc
+ * extents from the range that are not dirty in the page cache. As the write can
+ * race with page faults, there can be dirty pages over the delalloc extent
+ * outside the range of a short write but still within the delalloc extent
+ * allocated for this iomap.
+ *
+ * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
+ * simplify range iterations.
+ *
+ * The punch() callback *must* only punch delalloc extents in the range passed
+ * to it. It must skip over all other types of extents in the range and leave
+ * them completely unchanged. It must do this punch atomically with respect to
+ * other extent modifications.
+ *
+ * The punch() callback may be called with a folio locked to prevent writeback
+ * extent allocation racing at the edge of the range we are currently punching.
+ * The locked folio may or may not cover the range being punched, so it is not
+ * safe for the punch() callback to lock folios itself.
+ *
+ * Lock order is:
+ *
+ * inode->i_rwsem (shared or exclusive)
+ * inode->i_mapping->invalidate_lock (exclusive)
+ * folio_lock()
+ * ->punch
+ * internal filesystem allocation lock
+ */
+int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
+ struct iomap *iomap, loff_t pos, loff_t length,
+ ssize_t written,
+ int (*punch)(struct inode *inode, loff_t pos, loff_t length))
+{
+ loff_t start_byte;
+ loff_t end_byte;
+ int blocksize = i_blocksize(inode);
+
+ if (iomap->type != IOMAP_DELALLOC)
+ return 0;
+
+ /* If we didn't reserve the blocks, we're not allowed to punch them. */
+ if (!(iomap->flags & IOMAP_F_NEW))
+ return 0;
+
+ /*
+ * start_byte refers to the first unused block after a short write. If
+ * nothing was written, round offset down to point at the first block in
+ * the range.
+ */
+ if (unlikely(!written))
+ start_byte = round_down(pos, blocksize);
+ else
+ start_byte = round_up(pos + written, blocksize);
+ end_byte = round_up(pos + length, blocksize);
+
+ /* Nothing to do if we've written the entire delalloc extent */
+ if (start_byte >= end_byte)
+ return 0;
+
+ return iomap_write_delalloc_release(inode, start_byte, end_byte,
+ punch);
+}
+EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc);
+
static loff_t iomap_unshare_iter(struct iomap_iter *iter)
{
struct iomap *iomap = &iter->iomap;
@@ -856,6 +1104,8 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
status = iomap_write_begin(iter, pos, bytes, &folio);
if (unlikely(status))
return status;
+ if (iter->iomap.flags & IOMAP_F_STALE)
+ break;
status = iomap_write_end(iter, pos, bytes, bytes, folio);
if (WARN_ON_ONCE(status == 0))
@@ -911,6 +1161,8 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
status = iomap_write_begin(iter, pos, bytes, &folio);
if (status)
return status;
+ if (iter->iomap.flags & IOMAP_F_STALE)
+ break;
offset = offset_in_folio(folio, pos);
if (bytes > folio_size(folio) - offset)
@@ -1360,6 +1612,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
error = wpc->ops->map_blocks(wpc, inode, pos);
if (error)
break;
+ trace_iomap_writepage_map(inode, &wpc->iomap);
if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE))
continue;
if (wpc->iomap.type == IOMAP_HOLE)
@@ -1421,7 +1674,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
if (!count)
folio_end_writeback(folio);
done:
- mapping_set_error(folio->mapping, error);
+ mapping_set_error(inode->i_mapping, error);
return error;
}
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 4eb559a16c9e..9804714b1751 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -240,7 +240,6 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
{
const struct iomap *iomap = &iter->iomap;
struct inode *inode = iter->inode;
- unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
unsigned int fs_block_size = i_blocksize(inode), pad;
loff_t length = iomap_length(iter);
loff_t pos = iter->pos;
@@ -252,7 +251,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
size_t copied = 0;
size_t orig_count;
- if ((pos | length) & ((1 << blkbits) - 1) ||
+ if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
!bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
return -EINVAL;
diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c
index a1c7592d2ade..79a0614eaab7 100644
--- a/fs/iomap/iter.c
+++ b/fs/iomap/iter.c
@@ -7,12 +7,28 @@
#include <linux/iomap.h>
#include "trace.h"
+/*
+ * Advance to the next range we need to map.
+ *
+ * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully
+ * processed - it was aborted because the extent the iomap spanned may have been
+ * changed during the operation. In this case, the iteration behaviour is to
+ * remap the unprocessed range of the iter, and that means we may need to remap
+ * even when we've made no progress (i.e. iter->processed = 0). Hence the
+ * "finished iterating" case needs to distinguish between
+ * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we
+ * need to remap the entire remaining range.
+ */
static inline int iomap_iter_advance(struct iomap_iter *iter)
{
+ bool stale = iter->iomap.flags & IOMAP_F_STALE;
+
/* handle the previous iteration (if any) */
if (iter->iomap.length) {
- if (iter->processed <= 0)
+ if (iter->processed < 0)
return iter->processed;
+ if (!iter->processed && !stale)
+ return 0;
if (WARN_ON_ONCE(iter->processed > iomap_length(iter)))
return -EIO;
iter->pos += iter->processed;
@@ -33,6 +49,7 @@ static inline void iomap_iter_done(struct iomap_iter *iter)
WARN_ON_ONCE(iter->iomap.offset > iter->pos);
WARN_ON_ONCE(iter->iomap.length == 0);
WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos);
+ WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE);
trace_iomap_iter_dstmap(iter->inode, &iter->iomap);
if (iter->srcmap.type != IOMAP_HOLE)
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index d48868fc40d7..f6ea9540d082 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -148,6 +148,7 @@ DEFINE_EVENT(iomap_class, name, \
TP_ARGS(inode, iomap))
DEFINE_IOMAP_EVENT(iomap_iter_dstmap);
DEFINE_IOMAP_EVENT(iomap_iter_srcmap);
+DEFINE_IOMAP_EVENT(iomap_writepage_map);
TRACE_EVENT(iomap_iter,
TP_PROTO(struct iomap_iter *iter, const void *ops,
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index b466172eec25..c4da3f634b92 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -67,8 +67,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
for ( i = 0 ; i < pcount ; i++ ) {
if (!pages[i])
continue;
- memset(page_address(pages[i]), 0, PAGE_SIZE);
- flush_dcache_page(pages[i]);
+ memzero_page(pages[i], 0, PAGE_SIZE);
SetPageUptodate(pages[i]);
}
return ((loff_t)pcount) << PAGE_SHIFT;
@@ -82,7 +81,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
return 0;
}
haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks);
- ll_rw_block(REQ_OP_READ, haveblocks, bhs);
+ bh_read_batch(haveblocks, bhs);
curbh = 0;
curpage = 0;
@@ -120,7 +119,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
zerr != Z_STREAM_END) {
if (!stream.avail_out) {
if (pages[curpage]) {
- stream.next_out = page_address(pages[curpage])
+ stream.next_out = kmap_local_page(pages[curpage])
+ poffset;
stream.avail_out = PAGE_SIZE - poffset;
poffset = 0;
@@ -176,6 +175,10 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
flush_dcache_page(pages[curpage]);
SetPageUptodate(pages[curpage]);
}
+ if (stream.next_out != (unsigned char *)zisofs_sink_page) {
+ kunmap_local(stream.next_out);
+ stream.next_out = NULL;
+ }
curpage++;
}
if (!stream.avail_in)
@@ -183,6 +186,8 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
}
inflate_out:
zlib_inflateEnd(&stream);
+ if (stream.next_out && stream.next_out != (unsigned char *)zisofs_sink_page)
+ kunmap_local(stream.next_out);
z_eio:
mutex_unlock(&zisofs_zlib_lock);
@@ -283,9 +288,7 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount,
}
if (poffset && *pages) {
- memset(page_address(*pages) + poffset, 0,
- PAGE_SIZE - poffset);
- flush_dcache_page(*pages);
+ memzero_page(*pages, poffset, PAGE_SIZE - poffset);
SetPageUptodate(*pages);
}
return 0;
@@ -343,10 +346,8 @@ static int zisofs_read_folio(struct file *file, struct folio *folio)
for (i = 0; i < pcount; i++, index++) {
if (i != full_page)
pages[i] = grab_cache_page_nowait(mapping, index);
- if (pages[i]) {
+ if (pages[i])
ClearPageError(pages[i]);
- kmap(pages[i]);
- }
}
err = zisofs_fill_pages(inode, full_page, pcount, pages);
@@ -357,7 +358,6 @@ static int zisofs_read_folio(struct file *file, struct folio *folio)
flush_dcache_page(pages[i]);
if (i == full_page && err)
SetPageError(pages[i]);
- kunmap(pages[i]);
unlock_page(pages[i]);
if (i != full_page)
put_page(pages[i]);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 88bf20303466..df9d70588b60 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1277,13 +1277,11 @@ static int isofs_read_level3_size(struct inode *inode)
} while (more_entries);
out:
kfree(tmpde);
- if (bh)
- brelse(bh);
+ brelse(bh);
return 0;
out_nomem:
- if (bh)
- brelse(bh);
+ brelse(bh);
return -ENOMEM;
out_noread:
@@ -1486,8 +1484,7 @@ static int isofs_read_inode(struct inode *inode, int relocated)
ret = 0;
out:
kfree(tmpde);
- if (bh)
- brelse(bh);
+ brelse(bh);
return ret;
out_badread:
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index b2b2bc9b88d9..4810438b7856 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -122,8 +122,8 @@ static int journal_submit_commit_record(journal_t *journal,
{
struct commit_header *tmp;
struct buffer_head *bh;
- int ret;
struct timespec64 now;
+ blk_opf_t write_flags = REQ_OP_WRITE | REQ_SYNC;
*cbh = NULL;
@@ -155,13 +155,11 @@ static int journal_submit_commit_record(journal_t *journal,
if (journal->j_flags & JBD2_BARRIER &&
!jbd2_has_feature_async_commit(journal))
- ret = submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH |
- REQ_FUA, bh);
- else
- ret = submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
+ write_flags |= REQ_PREFLUSH | REQ_FUA;
+ submit_bh(write_flags, bh);
*cbh = bh;
- return ret;
+ return 0;
}
/*
@@ -209,14 +207,13 @@ int jbd2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
}
/* Send all the data buffers related to an inode */
-int jbd2_submit_inode_data(struct jbd2_inode *jinode)
+int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode)
{
-
if (!jinode || !(jinode->i_flags & JI_WRITE_DATA))
return 0;
trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
- return jbd2_journal_submit_inode_data_buffers(jinode);
+ return journal->j_submit_inode_data_buffers(jinode);
}
EXPORT_SYMBOL(jbd2_submit_inode_data);
@@ -570,7 +567,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
journal->j_running_transaction = NULL;
start_time = ktime_get();
commit_transaction->t_log_start = journal->j_head;
- wake_up(&journal->j_wait_transaction_locked);
+ wake_up_all(&journal->j_wait_transaction_locked);
write_unlock(&journal->j_state_lock);
jbd2_debug(3, "JBD2: commit phase 2a\n");
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 6350d3857c89..2696f43e7239 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -923,10 +923,16 @@ int jbd2_fc_wait_bufs(journal_t *journal, int num_blks)
for (i = j_fc_off - 1; i >= j_fc_off - num_blks; i--) {
bh = journal->j_fc_wbuf[i];
wait_on_buffer(bh);
+ /*
+ * Update j_fc_off so jbd2_fc_release_bufs can release remain
+ * buffer head.
+ */
+ if (unlikely(!buffer_uptodate(bh))) {
+ journal->j_fc_off = i + 1;
+ return -EIO;
+ }
put_bh(bh);
journal->j_fc_wbuf[i] = NULL;
- if (unlikely(!buffer_uptodate(bh)))
- return -EIO;
}
return 0;
@@ -1606,7 +1612,7 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags)
{
struct buffer_head *bh = journal->j_sb_buffer;
journal_superblock_t *sb = journal->j_superblock;
- int ret;
+ int ret = 0;
/* Buffer got discarded which means block device got invalidated */
if (!buffer_mapped(bh)) {
@@ -1636,7 +1642,7 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags)
sb->s_checksum = jbd2_superblock_csum(journal, sb);
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
- ret = submit_bh(REQ_OP_WRITE | write_flags, bh);
+ submit_bh(REQ_OP_WRITE | write_flags, bh);
wait_on_buffer(bh);
if (buffer_write_io_error(bh)) {
clear_buffer_write_io_error(bh);
@@ -1644,9 +1650,8 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags)
ret = -EIO;
}
if (ret) {
- printk(KERN_ERR "JBD2: Error %d detected when updating "
- "journal superblock for %s.\n", ret,
- journal->j_devname);
+ printk(KERN_ERR "JBD2: I/O error when updating journal superblock for %s.\n",
+ journal->j_devname);
if (!is_journal_aborted(journal))
jbd2_journal_abort(journal, ret);
}
@@ -1893,19 +1898,16 @@ static int journal_get_superblock(journal_t *journal)
{
struct buffer_head *bh;
journal_superblock_t *sb;
- int err = -EIO;
+ int err;
bh = journal->j_sb_buffer;
J_ASSERT(bh != NULL);
- if (!buffer_uptodate(bh)) {
- ll_rw_block(REQ_OP_READ, 1, &bh);
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
- printk(KERN_ERR
- "JBD2: IO error reading journal superblock\n");
- goto out;
- }
+ err = bh_read(bh, 0);
+ if (err < 0) {
+ printk(KERN_ERR
+ "JBD2: IO error reading journal superblock\n");
+ goto out;
}
if (buffer_verified(bh))
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index f548479615c6..8286a9ec122f 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -100,7 +100,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
bufs[nbufs++] = bh;
if (nbufs == MAXBUF) {
- ll_rw_block(REQ_OP_READ, nbufs, bufs);
+ bh_readahead_batch(nbufs, bufs, 0);
journal_brelse_array(bufs, nbufs);
nbufs = 0;
}
@@ -109,7 +109,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
}
if (nbufs)
- ll_rw_block(REQ_OP_READ, nbufs, bufs);
+ bh_readahead_batch(nbufs, bufs, 0);
err = 0;
failed:
@@ -152,9 +152,14 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
return -ENOMEM;
if (!buffer_uptodate(bh)) {
- /* If this is a brand new buffer, start readahead.
- Otherwise, we assume we are already reading it. */
- if (!buffer_req(bh))
+ /*
+ * If this is a brand new buffer, start readahead.
+ * Otherwise, we assume we are already reading it.
+ */
+ bool need_readahead = !buffer_req(bh);
+
+ bh_read_nowait(bh, 0);
+ if (need_readahead)
do_readahead(journal, offset);
wait_on_buffer(bh);
}
@@ -256,6 +261,7 @@ static int fc_do_one_pass(journal_t *journal,
err = journal->j_fc_replay_callback(journal, bh, pass,
next_fc_block - journal->j_fc_first,
expected_commit_id);
+ brelse(bh);
next_fc_block++;
if (err < 0 || err == JBD2_FC_REPLAY_STOP)
break;
@@ -687,7 +693,6 @@ static int do_one_pass(journal_t *journal,
mark_buffer_dirty(nbh);
BUFFER_TRACE(nbh, "marking uptodate");
++info->nr_replays;
- /* ll_rw_block(WRITE, 1, &nbh); */
unlock_buffer(nbh);
brelse(obh);
brelse(nbh);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e1be93ccd81c..6a404ac1c178 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -168,7 +168,7 @@ static void wait_transaction_locked(journal_t *journal)
int need_to_start;
tid_t tid = journal->j_running_transaction->t_tid;
- prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
+ prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait,
TASK_UNINTERRUPTIBLE);
need_to_start = !tid_geq(journal->j_commit_request, tid);
read_unlock(&journal->j_state_lock);
@@ -194,7 +194,7 @@ static void wait_transaction_switching(journal_t *journal)
read_unlock(&journal->j_state_lock);
return;
}
- prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
+ prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait,
TASK_UNINTERRUPTIBLE);
read_unlock(&journal->j_state_lock);
/*
@@ -920,7 +920,7 @@ void jbd2_journal_unlock_updates (journal_t *journal)
write_lock(&journal->j_state_lock);
--journal->j_barrier_count;
write_unlock(&journal->j_state_lock);
- wake_up(&journal->j_wait_transaction_locked);
+ wake_up_all(&journal->j_wait_transaction_locked);
}
static void warn_dirty_buffer(struct buffer_head *bh)
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index e945e3484788..8bb58ce5c06c 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -229,10 +229,11 @@ static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *a
return rc;
}
-int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int jffs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int rc, xprefix;
+ struct inode *inode = d_inode(dentry);
switch (type) {
case ACL_TYPE_ACCESS:
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 9d9fb7cf093e..ca36a6eca594 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -28,7 +28,7 @@ struct jffs2_acl_header {
#ifdef CONFIG_JFFS2_FS_POSIX_ACL
struct posix_acl *jffs2_get_acl(struct inode *inode, int type, bool rcu);
-int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int jffs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *);
extern int jffs2_init_acl_post(struct inode *);
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index c0aabbcbfd58..f399b390b5f6 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -62,7 +62,7 @@ const struct inode_operations jffs2_dir_inode_operations =
.rmdir = jffs2_rmdir,
.mknod = jffs2_mknod,
.rename = jffs2_rename,
- .get_acl = jffs2_get_acl,
+ .get_inode_acl = jffs2_get_acl,
.set_acl = jffs2_set_acl,
.setattr = jffs2_setattr,
.listxattr = jffs2_listxattr,
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index ba86acbe12d3..3cf71befa475 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -64,7 +64,7 @@ const struct file_operations jffs2_file_operations =
const struct inode_operations jffs2_file_inode_operations =
{
- .get_acl = jffs2_get_acl,
+ .get_inode_acl = jffs2_get_acl,
.set_acl = jffs2_set_acl,
.setattr = jffs2_setattr,
.listxattr = jffs2_listxattr,
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 39cec28096a7..66af51c41619 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -202,7 +202,7 @@ int jffs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
rc = jffs2_do_setattr(inode, iattr);
if (!rc && (iattr->ia_valid & ATTR_MODE))
- rc = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
+ rc = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode);
return rc;
}
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index c6821a509481..4061e0ba7010 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1035,7 +1035,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c,
{
int i, ret;
int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
ops.mode = MTD_OPS_AUTO_OOB;
ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail;
@@ -1076,7 +1076,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c,
int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c,
struct jffs2_eraseblock *jeb)
{
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
ops.mode = MTD_OPS_AUTO_OOB;
@@ -1101,7 +1101,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c,
struct jffs2_eraseblock *jeb)
{
int ret;
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
ops.mode = MTD_OPS_AUTO_OOB;
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index a653f34c6e26..3b667eccc73b 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -94,12 +94,13 @@ out:
return rc;
}
-int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int jfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int rc;
tid_t tid;
int update_mode = 0;
+ struct inode *inode = d_inode(dentry);
umode_t mode = inode->i_mode;
tid = txBegin(inode->i_sb, 0);
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 332dc9ac47a9..88663465aecd 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -123,7 +123,7 @@ int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
mark_inode_dirty(inode);
if (iattr->ia_valid & ATTR_MODE)
- rc = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
+ rc = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode);
return rc;
}
@@ -133,7 +133,7 @@ const struct inode_operations jfs_file_inode_operations = {
.fileattr_get = jfs_fileattr_get,
.fileattr_set = jfs_fileattr_set,
#ifdef CONFIG_JFS_POSIX_ACL
- .get_acl = jfs_get_acl,
+ .get_inode_acl = jfs_get_acl,
.set_acl = jfs_set_acl,
#endif
};
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index d1ec920aa030..8ac10e396050 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -264,11 +264,6 @@ int jfs_get_block(struct inode *ip, sector_t lblock,
return rc;
}
-static int jfs_writepage(struct page *page, struct writeback_control *wbc)
-{
- return block_write_full_page(page, jfs_get_block, wbc);
-}
-
static int jfs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -355,12 +350,12 @@ const struct address_space_operations jfs_aops = {
.invalidate_folio = block_invalidate_folio,
.read_folio = jfs_read_folio,
.readahead = jfs_readahead,
- .writepage = jfs_writepage,
.writepages = jfs_writepages,
.write_begin = jfs_write_begin,
.write_end = jfs_write_end,
.bmap = jfs_bmap,
.direct_IO = jfs_direct_IO,
+ .migrate_folio = buffer_migrate_folio,
};
/*
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 3de40286d31f..f0704a25835f 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -8,7 +8,7 @@
#ifdef CONFIG_JFS_POSIX_ACL
struct posix_acl *jfs_get_acl(struct inode *inode, int type, bool rcu);
-int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int jfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
int jfs_init_acl(tid_t, struct inode *, struct inode *);
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 6b838d3ae7c2..765838578a72 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -155,7 +155,7 @@ int dbMount(struct inode *ipbmap)
struct bmap *bmp;
struct dbmap_disk *dbmp_le;
struct metapage *mp;
- int i;
+ int i, err;
/*
* allocate/initialize the in-memory bmap descriptor
@@ -170,8 +170,8 @@ int dbMount(struct inode *ipbmap)
BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
PSIZE, 0);
if (mp == NULL) {
- kfree(bmp);
- return -EIO;
+ err = -EIO;
+ goto err_kfree_bmp;
}
/* copy the on-disk bmap descriptor to its in-memory version. */
@@ -181,9 +181,8 @@ int dbMount(struct inode *ipbmap)
bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
if (!bmp->db_numag) {
- release_metapage(mp);
- kfree(bmp);
- return -EINVAL;
+ err = -EINVAL;
+ goto err_release_metapage;
}
bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel);
@@ -194,6 +193,16 @@ int dbMount(struct inode *ipbmap)
bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
+ if (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG) {
+ err = -EINVAL;
+ goto err_release_metapage;
+ }
+
+ if (((bmp->db_mapsize - 1) >> bmp->db_agl2size) > MAXAG) {
+ err = -EINVAL;
+ goto err_release_metapage;
+ }
+
for (i = 0; i < MAXAG; i++)
bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]);
bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize);
@@ -214,6 +223,12 @@ int dbMount(struct inode *ipbmap)
BMAP_LOCK_INIT(bmp);
return (0);
+
+err_release_metapage:
+ release_metapage(mp);
+err_kfree_bmp:
+ kfree(bmp);
+ return err;
}
diff --git a/fs/jfs/jfs_extent.h b/fs/jfs/jfs_extent.h
index 1c984214e95e..a0ee4ccea66e 100644
--- a/fs/jfs/jfs_extent.h
+++ b/fs/jfs/jfs_extent.h
@@ -10,9 +10,7 @@
(addressPXD(&(JFS_IP(ip)->ixpxd)) + lengthPXD(&(JFS_IP(ip)->ixpxd)) - 1)
extern int extAlloc(struct inode *, s64, s64, xad_t *, bool);
-extern int extFill(struct inode *, xad_t *);
extern int extHint(struct inode *, s64, xad_t *);
-extern int extRealloc(struct inode *, s64, xad_t *, bool);
extern int extRecord(struct inode *, xad_t *);
#endif /* _H_JFS_EXTENT */
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 799d3837e7c2..390cbfce391f 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -310,8 +310,8 @@ int diRead(struct inode *ip)
iagno = INOTOIAG(ip->i_ino);
/* read the iag */
- imap = JFS_IP(ipimap)->i_imap;
IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
+ imap = JFS_IP(ipimap)->i_imap;
rc = diIAGRead(imap, iagno, &mp);
IREAD_UNLOCK(ipimap);
if (rc) {
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 48d1f70f786c..b83aae56a1f2 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -234,11 +234,15 @@ int jfs_mount_rw(struct super_block *sb, int remount)
truncate_inode_pages(sbi->ipimap->i_mapping, 0);
truncate_inode_pages(sbi->ipbmap->i_mapping, 0);
+
+ IWRITE_LOCK(sbi->ipimap, RDWRLOCK_IMAP);
diUnmount(sbi->ipimap, 1);
if ((rc = diMount(sbi->ipimap))) {
+ IWRITE_UNLOCK(sbi->ipimap);
jfs_err("jfs_mount_rw: diMount failed!");
return rc;
}
+ IWRITE_UNLOCK(sbi->ipimap);
dbUnmount(sbi->ipbmap, 1);
if ((rc = dbMount(sbi->ipbmap))) {
diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c
index 3e8b13e6aa01..8ec43f53f686 100644
--- a/fs/jfs/jfs_umount.c
+++ b/fs/jfs/jfs_umount.c
@@ -68,7 +68,6 @@ int jfs_umount(struct super_block *sb)
/*
* close secondary aggregate inode allocation map
*/
- ipaimap2 = sbi->ipaimap2;
if (ipaimap2) {
diUnmount(ipaimap2, 0);
diFreeSpecial(ipaimap2);
@@ -78,7 +77,6 @@ int jfs_umount(struct super_block *sb)
/*
* close aggregate inode allocation map
*/
- ipaimap = sbi->ipaimap;
diUnmount(ipaimap, 0);
diFreeSpecial(ipaimap);
sbi->ipaimap = NULL;
@@ -89,7 +87,7 @@ int jfs_umount(struct super_block *sb)
dbUnmount(ipbmap, 0);
diFreeSpecial(ipbmap);
- sbi->ipimap = NULL;
+ sbi->ipbmap = NULL;
/*
* Make sure all metadata makes it to disk before we mark
diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h
index c50167a7bc50..0d33816d251d 100644
--- a/fs/jfs/jfs_xattr.h
+++ b/fs/jfs/jfs_xattr.h
@@ -25,7 +25,7 @@ struct jfs_ea_list {
struct jfs_ea ea[]; /* Variable length list */
};
-/* Macros for defining maxiumum number of bytes supported for EAs */
+/* Macros for defining maximum number of bytes supported for EAs */
#define MAXEASIZE 65535
#define MAXEALISTSIZE MAXEASIZE
diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h
index 142caafc73b1..ad7592191d76 100644
--- a/fs/jfs/jfs_xtree.h
+++ b/fs/jfs/jfs_xtree.h
@@ -96,12 +96,8 @@ extern int xtInsert(tid_t tid, struct inode *ip,
extern int xtExtend(tid_t tid, struct inode *ip, s64 xoff, int xlen,
int flag);
extern int xtUpdate(tid_t tid, struct inode *ip, struct xad *nxad);
-extern int xtDelete(tid_t tid, struct inode *ip, s64 xoff, int xlen,
- int flag);
extern s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int type);
extern s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size);
-extern int xtRelocate(tid_t tid, struct inode *ip,
- xad_t * oxad, s64 nxaddr, int xtype);
extern int xtAppend(tid_t tid,
struct inode *ip, int xflag, s64 xoff, int maxblocks,
int *xlenp, s64 * xaddrp, int flag);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 9db4f5789c0e..a38d14eed047 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -946,7 +946,7 @@ static int jfs_symlink(struct user_namespace *mnt_userns, struct inode *dip,
if (ssize <= IDATASIZE) {
ip->i_op = &jfs_fast_symlink_inode_operations;
- ip->i_link = JFS_IP(ip)->i_inline;
+ ip->i_link = JFS_IP(ip)->i_inline_all;
memcpy(ip->i_link, name, ssize);
ip->i_size = ssize - 1;
@@ -1525,7 +1525,7 @@ const struct inode_operations jfs_dir_inode_operations = {
.fileattr_get = jfs_fileattr_get,
.fileattr_set = jfs_fileattr_set,
#ifdef CONFIG_JFS_POSIX_ACL
- .get_acl = jfs_get_acl,
+ .get_inode_acl = jfs_get_acl,
.set_acl = jfs_set_acl,
#endif
};
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 85d4f44f2ac4..d2f82cb7db1b 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -745,8 +745,7 @@ static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data,
len = i_size-off;
toread = len;
while (toread > 0) {
- tocopy = sb->s_blocksize - offset < toread ?
- sb->s_blocksize - offset : toread;
+ tocopy = min_t(size_t, sb->s_blocksize - offset, toread);
tmp_bh.b_state = 0;
tmp_bh.b_size = i_blocksize(inode);
@@ -785,8 +784,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type,
inode_lock(inode);
while (towrite > 0) {
- tocopy = sb->s_blocksize - offset < towrite ?
- sb->s_blocksize - offset : towrite;
+ tocopy = min_t(size_t, sb->s_blocksize - offset, towrite);
tmp_bh.b_state = 0;
tmp_bh.b_size = i_blocksize(inode);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 1cc88ba6de90..935ef8cb02b2 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -31,10 +31,15 @@ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */
#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
+static bool __kernfs_active(struct kernfs_node *kn)
+{
+ return atomic_read(&kn->active) >= 0;
+}
+
static bool kernfs_active(struct kernfs_node *kn)
{
lockdep_assert_held(&kernfs_root(kn)->kernfs_rwsem);
- return atomic_read(&kn->active) >= 0;
+ return __kernfs_active(kn);
}
static bool kernfs_lockdep(struct kernfs_node *kn)
@@ -120,9 +125,9 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a,
* kn_to: /n1/n2/n3 [depth=3]
* result: /../..
*
- * [3] when @kn_to is NULL result will be "(null)"
+ * [3] when @kn_to is %NULL result will be "(null)"
*
- * Returns the length of the full path. If the full length is equal to or
+ * Return: the length of the full path. If the full length is equal to or
* greater than @buflen, @buf contains the truncated path with the trailing
* '\0'. On error, -errno is returned.
*/
@@ -180,10 +185,12 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
* @buflen: size of @buf
*
* Copies the name of @kn into @buf of @buflen bytes. The behavior is
- * similar to strlcpy(). It returns the length of @kn's name and if @buf
- * isn't long enough, it's filled upto @buflen-1 and nul terminated.
+ * similar to strlcpy().
+ *
+ * Fills buffer with "(null)" if @kn is %NULL.
*
- * Fills buffer with "(null)" if @kn is NULL.
+ * Return: the length of @kn's name and if @buf isn't long enough,
+ * it's filled up to @buflen-1 and nul terminated.
*
* This function can be called from any context.
*/
@@ -210,7 +217,7 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
* path (which includes '..'s) as needed to reach from @from to @to is
* returned.
*
- * Returns the length of the full path. If the full length is equal to or
+ * Return: the length of the full path. If the full length is equal to or
* greater than @buflen, @buf contains the truncated path with the trailing
* '\0'. On error, -errno is returned.
*/
@@ -282,6 +289,8 @@ out:
*
* Determines @kn's parent, pins and returns it. This function can be
* called from any context.
+ *
+ * Return: parent node of @kn
*/
struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
{
@@ -297,11 +306,11 @@ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
}
/**
- * kernfs_name_hash
+ * kernfs_name_hash - calculate hash of @ns + @name
* @name: Null terminated string to hash
* @ns: Namespace tag to hash
*
- * Returns 31 bit hash of ns + name (so it fits in an off_t )
+ * Return: 31-bit hash of ns + name (so it fits in an off_t)
*/
static unsigned int kernfs_name_hash(const char *name, const void *ns)
{
@@ -349,8 +358,8 @@ static int kernfs_sd_compare(const struct kernfs_node *left,
* Locking:
* kernfs_rwsem held exclusive
*
- * RETURNS:
- * 0 on susccess -EEXIST on failure.
+ * Return:
+ * %0 on success, -EEXIST on failure.
*/
static int kernfs_link_sibling(struct kernfs_node *kn)
{
@@ -389,8 +398,10 @@ static int kernfs_link_sibling(struct kernfs_node *kn)
* @kn: kernfs_node of interest
*
* Try to unlink @kn from its sibling rbtree which starts from
- * kn->parent->dir.children. Returns %true if @kn was actually
- * removed, %false if @kn wasn't on the rbtree.
+ * kn->parent->dir.children.
+ *
+ * Return: %true if @kn was actually removed,
+ * %false if @kn wasn't on the rbtree.
*
* Locking:
* kernfs_rwsem held exclusive
@@ -414,10 +425,10 @@ static bool kernfs_unlink_sibling(struct kernfs_node *kn)
* @kn: kernfs_node to get an active reference to
*
* Get an active reference of @kn. This function is noop if @kn
- * is NULL.
+ * is %NULL.
*
- * RETURNS:
- * Pointer to @kn on success, NULL on failure.
+ * Return:
+ * Pointer to @kn on success, %NULL on failure.
*/
struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
{
@@ -437,7 +448,7 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
* @kn: kernfs_node to put an active reference to
*
* Put an active reference to @kn. This function is noop if @kn
- * is NULL.
+ * is %NULL.
*/
void kernfs_put_active(struct kernfs_node *kn)
{
@@ -459,7 +470,7 @@ void kernfs_put_active(struct kernfs_node *kn)
* kernfs_drain - drain kernfs_node
* @kn: kernfs_node to drain
*
- * Drain existing usages and nuke all existing mmaps of @kn. Mutiple
+ * Drain existing usages and nuke all existing mmaps of @kn. Multiple
* removers may invoke this function concurrently on @kn and all will
* return after draining is complete.
*/
@@ -472,6 +483,16 @@ static void kernfs_drain(struct kernfs_node *kn)
lockdep_assert_held_write(&root->kernfs_rwsem);
WARN_ON_ONCE(kernfs_active(kn));
+ /*
+ * Skip draining if already fully drained. This avoids draining and its
+ * lockdep annotations for nodes which have never been activated
+ * allowing embedding kernfs_remove() in create error paths without
+ * worrying about draining.
+ */
+ if (atomic_read(&kn->active) == KN_DEACTIVATED_BIAS &&
+ !kernfs_should_drain_open_files(kn))
+ return;
+
up_write(&root->kernfs_rwsem);
if (kernfs_lockdep(kn)) {
@@ -480,7 +501,6 @@ static void kernfs_drain(struct kernfs_node *kn)
lock_contended(&kn->dep_map, _RET_IP_);
}
- /* but everyone should wait for draining */
wait_event(root->deactivate_waitq,
atomic_read(&kn->active) == KN_DEACTIVATED_BIAS);
@@ -489,7 +509,8 @@ static void kernfs_drain(struct kernfs_node *kn)
rwsem_release(&kn->dep_map, _RET_IP_);
}
- kernfs_drain_open_files(kn);
+ if (kernfs_should_drain_open_files(kn))
+ kernfs_drain_open_files(kn);
down_write(&root->kernfs_rwsem);
}
@@ -562,7 +583,7 @@ EXPORT_SYMBOL_GPL(kernfs_put);
* kernfs_node_from_dentry - determine kernfs_node associated with a dentry
* @dentry: the dentry in question
*
- * Return the kernfs_node associated with @dentry. If @dentry is not a
+ * Return: the kernfs_node associated with @dentry. If @dentry is not a
* kernfs one, %NULL is returned.
*
* While the returned kernfs_node will stay accessible as long as @dentry
@@ -669,8 +690,8 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
* @id's lower 32bits encode ino and upper gen. If the gen portion is
* zero, all generations are matched.
*
- * RETURNS:
- * NULL on failure. Return a kernfs node with reference counter incremented
+ * Return: %NULL on failure,
+ * otherwise a kernfs node with reference counter incremented.
*/
struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
u64 id)
@@ -696,12 +717,11 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
}
/*
- * ACTIVATED is protected with kernfs_mutex but it was clear when
- * @kn was added to idr and we just wanna see it set. No need to
- * grab kernfs_mutex.
+ * We should fail if @kn has never been activated and guarantee success
+ * if the caller knows that @kn is active. Both can be achieved by
+ * __kernfs_active() which tests @kn->active without kernfs_rwsem.
*/
- if (unlikely(!(kn->flags & KERNFS_ACTIVATED) ||
- !atomic_inc_not_zero(&kn->count)))
+ if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count)))
goto err_unlock;
spin_unlock(&kernfs_idr_lock);
@@ -719,8 +739,8 @@ err_unlock:
* function increments nlink of the parent's inode if @kn is a
* directory and link into the children list of the parent.
*
- * RETURNS:
- * 0 on success, -EEXIST if entry with the given name already
+ * Return:
+ * %0 on success, -EEXIST if entry with the given name already
* exists.
*/
int kernfs_add_one(struct kernfs_node *kn)
@@ -743,10 +763,7 @@ int kernfs_add_one(struct kernfs_node *kn)
goto out_unlock;
ret = -ENOENT;
- if (parent->flags & KERNFS_EMPTY_DIR)
- goto out_unlock;
-
- if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent))
+ if (parent->flags & (KERNFS_REMOVING | KERNFS_EMPTY_DIR))
goto out_unlock;
kn->hash = kernfs_name_hash(kn->name, kn->ns);
@@ -786,8 +803,9 @@ out_unlock:
* @name: name to look for
* @ns: the namespace tag to use
*
- * Look for kernfs_node with name @name under @parent. Returns pointer to
- * the found kernfs_node on success, %NULL on failure.
+ * Look for kernfs_node with name @name under @parent.
+ *
+ * Return: pointer to the found kernfs_node on success, %NULL on failure.
*/
static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
const unsigned char *name,
@@ -860,8 +878,9 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
* @ns: the namespace tag to use
*
* Look for kernfs_node with name @name under @parent and get a reference
- * if found. This function may sleep and returns pointer to the found
- * kernfs_node on success, %NULL on failure.
+ * if found. This function may sleep.
+ *
+ * Return: pointer to the found kernfs_node on success, %NULL on failure.
*/
struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
const char *name, const void *ns)
@@ -885,8 +904,9 @@ EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);
* @ns: the namespace tag to use
*
* Look for kernfs_node with path @path under @parent and get a reference
- * if found. This function may sleep and returns pointer to the found
- * kernfs_node on success, %NULL on failure.
+ * if found. This function may sleep.
+ *
+ * Return: pointer to the found kernfs_node on success, %NULL on failure.
*/
struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
const char *path, const void *ns)
@@ -908,7 +928,7 @@ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
* @flags: KERNFS_ROOT_* flags
* @priv: opaque data associated with the new directory
*
- * Returns the root of the new hierarchy on success, ERR_PTR() value on
+ * Return: the root of the new hierarchy on success, ERR_PTR() value on
* failure.
*/
struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
@@ -980,6 +1000,8 @@ void kernfs_destroy_root(struct kernfs_root *root)
/**
* kernfs_root_to_node - return the kernfs_node associated with a kernfs_root
* @root: root to use to lookup
+ *
+ * Return: @root's kernfs_node
*/
struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root)
{
@@ -996,7 +1018,7 @@ struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root)
* @priv: opaque data associated with the new directory
* @ns: optional namespace tag of the directory
*
- * Returns the created node on success, ERR_PTR() value on failure.
+ * Return: the created node on success, ERR_PTR() value on failure.
*/
struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
const char *name, umode_t mode,
@@ -1030,7 +1052,7 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
* @parent: parent in which to create a new directory
* @name: name of the new directory
*
- * Returns the created node on success, ERR_PTR() value on failure.
+ * Return: the created node on success, ERR_PTR() value on failure.
*/
struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
const char *name)
@@ -1072,20 +1094,30 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
/* If the kernfs parent node has changed discard and
* proceed to ->lookup.
+ *
+ * There's nothing special needed here when getting the
+ * dentry parent, even if a concurrent rename is in
+ * progress. That's because the dentry is negative so
+ * it can only be the target of the rename and it will
+ * be doing a d_move() not a replace. Consequently the
+ * dentry d_parent won't change over the d_move().
+ *
+ * Also kernfs negative dentries transitioning from
+ * negative to positive during revalidate won't happen
+ * because they are invalidated on containing directory
+ * changes and the lookup re-done so that a new positive
+ * dentry can be properly created.
*/
- spin_lock(&dentry->d_lock);
+ root = kernfs_root_from_sb(dentry->d_sb);
+ down_read(&root->kernfs_rwsem);
parent = kernfs_dentry_node(dentry->d_parent);
if (parent) {
- spin_unlock(&dentry->d_lock);
- root = kernfs_root(parent);
- down_read(&root->kernfs_rwsem);
if (kernfs_dir_changed(parent, dentry)) {
up_read(&root->kernfs_rwsem);
return 0;
}
- up_read(&root->kernfs_rwsem);
- } else
- spin_unlock(&dentry->d_lock);
+ }
+ up_read(&root->kernfs_rwsem);
/* The kernfs parent node hasn't changed, leave the
* dentry negative and return success.
@@ -1279,6 +1311,8 @@ static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos)
* Find the next descendant to visit for post-order traversal of @root's
* descendants. @root is included in the iteration and the last node to be
* visited.
+ *
+ * Return: the next descendant to visit or %NULL when done.
*/
static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
struct kernfs_node *root)
@@ -1304,6 +1338,21 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
return pos->parent;
}
+static void kernfs_activate_one(struct kernfs_node *kn)
+{
+ lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem);
+
+ kn->flags |= KERNFS_ACTIVATED;
+
+ if (kernfs_active(kn) || (kn->flags & (KERNFS_HIDDEN | KERNFS_REMOVING)))
+ return;
+
+ WARN_ON_ONCE(kn->parent && RB_EMPTY_NODE(&kn->rb));
+ WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
+
+ atomic_sub(KN_DEACTIVATED_BIAS, &kn->active);
+}
+
/**
* kernfs_activate - activate a node which started deactivated
* @kn: kernfs_node whose subtree is to be activated
@@ -1325,15 +1374,42 @@ void kernfs_activate(struct kernfs_node *kn)
down_write(&root->kernfs_rwsem);
pos = NULL;
- while ((pos = kernfs_next_descendant_post(pos, kn))) {
- if (pos->flags & KERNFS_ACTIVATED)
- continue;
+ while ((pos = kernfs_next_descendant_post(pos, kn)))
+ kernfs_activate_one(pos);
+
+ up_write(&root->kernfs_rwsem);
+}
+
+/**
+ * kernfs_show - show or hide a node
+ * @kn: kernfs_node to show or hide
+ * @show: whether to show or hide
+ *
+ * If @show is %false, @kn is marked hidden and deactivated. A hidden node is
+ * ignored in future activaitons. If %true, the mark is removed and activation
+ * state is restored. This function won't implicitly activate a new node in a
+ * %KERNFS_ROOT_CREATE_DEACTIVATED root which hasn't been activated yet.
+ *
+ * To avoid recursion complexities, directories aren't supported for now.
+ */
+void kernfs_show(struct kernfs_node *kn, bool show)
+{
+ struct kernfs_root *root = kernfs_root(kn);
+
+ if (WARN_ON_ONCE(kernfs_type(kn) == KERNFS_DIR))
+ return;
- WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb));
- WARN_ON_ONCE(atomic_read(&pos->active) != KN_DEACTIVATED_BIAS);
+ down_write(&root->kernfs_rwsem);
- atomic_sub(KN_DEACTIVATED_BIAS, &pos->active);
- pos->flags |= KERNFS_ACTIVATED;
+ if (show) {
+ kn->flags &= ~KERNFS_HIDDEN;
+ if (kn->flags & KERNFS_ACTIVATED)
+ kernfs_activate_one(kn);
+ } else {
+ kn->flags |= KERNFS_HIDDEN;
+ if (kernfs_active(kn))
+ atomic_add(KN_DEACTIVATED_BIAS, &kn->active);
+ kernfs_drain(kn);
}
up_write(&root->kernfs_rwsem);
@@ -1358,34 +1434,27 @@ static void __kernfs_remove(struct kernfs_node *kn)
pr_debug("kernfs %s: removing\n", kn->name);
- /* prevent any new usage under @kn by deactivating all nodes */
+ /* prevent new usage by marking all nodes removing and deactivating */
pos = NULL;
- while ((pos = kernfs_next_descendant_post(pos, kn)))
+ while ((pos = kernfs_next_descendant_post(pos, kn))) {
+ pos->flags |= KERNFS_REMOVING;
if (kernfs_active(pos))
atomic_add(KN_DEACTIVATED_BIAS, &pos->active);
+ }
/* deactivate and unlink the subtree node-by-node */
do {
pos = kernfs_leftmost_descendant(kn);
/*
- * kernfs_drain() drops kernfs_rwsem temporarily and @pos's
+ * kernfs_drain() may drop kernfs_rwsem temporarily and @pos's
* base ref could have been put by someone else by the time
* the function returns. Make sure it doesn't go away
* underneath us.
*/
kernfs_get(pos);
- /*
- * Drain iff @kn was activated. This avoids draining and
- * its lockdep annotations for nodes which have never been
- * activated and allows embedding kernfs_remove() in create
- * error paths without worrying about draining.
- */
- if (kn->flags & KERNFS_ACTIVATED)
- kernfs_drain(pos);
- else
- WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
+ kernfs_drain(pos);
/*
* kernfs_unlink_sibling() succeeds once per node. Use it
@@ -1507,6 +1576,8 @@ void kernfs_unbreak_active_protection(struct kernfs_node *kn)
* the whole kernfs_ops which won the arbitration. This can be used to
* guarantee, for example, all concurrent writes to a "delete" file to
* finish only after the whole operation is complete.
+ *
+ * Return: %true if @kn is removed by this call, otherwise %false.
*/
bool kernfs_remove_self(struct kernfs_node *kn)
{
@@ -1567,7 +1638,8 @@ bool kernfs_remove_self(struct kernfs_node *kn)
* @ns: namespace tag of the kernfs_node to remove
*
* Look for the kernfs_node with @name and @ns under @parent and remove it.
- * Returns 0 on success, -ENOENT if such entry doesn't exist.
+ *
+ * Return: %0 on success, -ENOENT if such entry doesn't exist.
*/
int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
const void *ns)
@@ -1585,8 +1657,11 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
down_write(&root->kernfs_rwsem);
kn = kernfs_find_ns(parent, name, ns);
- if (kn)
+ if (kn) {
+ kernfs_get(kn);
__kernfs_remove(kn);
+ kernfs_put(kn);
+ }
up_write(&root->kernfs_rwsem);
@@ -1602,6 +1677,8 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
* @new_parent: new parent to put @sd under
* @new_name: new name
* @new_ns: new namespace tag
+ *
+ * Return: %0 on success, -errno on failure.
*/
int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
const char *new_name, const void *new_ns)
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index b3ec34386b43..e4a50e4ff0d2 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -23,6 +23,8 @@ struct kernfs_open_node {
atomic_t event;
wait_queue_head_t poll;
struct list_head files; /* goes through kernfs_open_file.list */
+ unsigned int nr_mmapped;
+ unsigned int nr_to_release;
};
/*
@@ -31,7 +33,7 @@ struct kernfs_open_node {
* pending queue is implemented as a singly linked list of kernfs_nodes.
* The list is terminated with the self pointer so that whether a
* kernfs_node is on the list or not can be determined by testing the next
- * pointer for NULL.
+ * pointer for %NULL.
*/
#define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list)
@@ -57,31 +59,19 @@ static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn)
}
/**
- * kernfs_deref_open_node - Get kernfs_open_node corresponding to @kn.
+ * of_on - Get the kernfs_open_node of the specified kernfs_open_file
+ * @of: target kernfs_open_file
*
- * @of: associated kernfs_open_file instance.
- * @kn: target kernfs_node.
- *
- * Fetch and return ->attr.open of @kn if @of->list is non empty.
- * If @of->list is not empty we can safely assume that @of is on
- * @kn->attr.open->files list and this guarantees that @kn->attr.open
- * will not vanish i.e. dereferencing outside RCU read-side critical
- * section is safe here.
- *
- * The caller needs to make sure that @of->list is not empty.
+ * Return: the kernfs_open_node of the kernfs_open_file
*/
-static struct kernfs_open_node *
-kernfs_deref_open_node(struct kernfs_open_file *of, struct kernfs_node *kn)
+static struct kernfs_open_node *of_on(struct kernfs_open_file *of)
{
- struct kernfs_open_node *on;
-
- on = rcu_dereference_check(kn->attr.open, !list_empty(&of->list));
-
- return on;
+ return rcu_dereference_protected(of->kn->attr.open,
+ !list_empty(&of->list));
}
/**
- * kernfs_deref_open_node_protected - Get kernfs_open_node corresponding to @kn
+ * kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn
*
* @kn: target kernfs_node.
*
@@ -94,9 +84,11 @@ kernfs_deref_open_node(struct kernfs_open_file *of, struct kernfs_node *kn)
* outside RCU read-side critical section.
*
* The caller needs to make sure that kernfs_open_file_mutex is held.
+ *
+ * Return: @kn->attr.open when kernfs_open_file_mutex is held.
*/
static struct kernfs_open_node *
-kernfs_deref_open_node_protected(struct kernfs_node *kn)
+kernfs_deref_open_node_locked(struct kernfs_node *kn)
{
return rcu_dereference_protected(kn->attr.open,
lockdep_is_held(kernfs_open_file_mutex_ptr(kn)));
@@ -207,12 +199,8 @@ static void kernfs_seq_stop(struct seq_file *sf, void *v)
static int kernfs_seq_show(struct seq_file *sf, void *v)
{
struct kernfs_open_file *of = sf->private;
- struct kernfs_open_node *on = kernfs_deref_open_node(of, of->kn);
-
- if (!on)
- return -EINVAL;
- of->event = atomic_read(&on->event);
+ of->event = atomic_read(&of_on(of)->event);
return of->kn->attr.ops->seq_show(sf, v);
}
@@ -235,7 +223,6 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
struct kernfs_open_file *of = kernfs_of(iocb->ki_filp);
ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE);
const struct kernfs_ops *ops;
- struct kernfs_open_node *on;
char *buf;
buf = of->prealloc_buf;
@@ -257,14 +244,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
goto out_free;
}
- on = kernfs_deref_open_node(of, of->kn);
- if (!on) {
- len = -EINVAL;
- mutex_unlock(&of->mutex);
- goto out_free;
- }
-
- of->event = atomic_read(&on->event);
+ of->event = atomic_read(&of_on(of)->event);
ops = kernfs_ops(of->kn);
if (ops->read)
@@ -553,6 +533,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
rc = 0;
of->mmapped = true;
+ of_on(of)->nr_mmapped++;
of->vm_ops = vma->vm_ops;
vma->vm_ops = &kernfs_vm_ops;
out_put:
@@ -571,40 +552,39 @@ out_unlock:
* If @kn->attr.open exists, increment its reference count; otherwise,
* create one. @of is chained to the files list.
*
- * LOCKING:
+ * Locking:
* Kernel thread context (may sleep).
*
- * RETURNS:
- * 0 on success, -errno on failure.
+ * Return:
+ * %0 on success, -errno on failure.
*/
static int kernfs_get_open_node(struct kernfs_node *kn,
struct kernfs_open_file *of)
{
- struct kernfs_open_node *on, *new_on = NULL;
- struct mutex *mutex = NULL;
+ struct kernfs_open_node *on;
+ struct mutex *mutex;
mutex = kernfs_open_file_mutex_lock(kn);
- on = kernfs_deref_open_node_protected(kn);
+ on = kernfs_deref_open_node_locked(kn);
- if (on) {
- list_add_tail(&of->list, &on->files);
- mutex_unlock(mutex);
- return 0;
- } else {
+ if (!on) {
/* not there, initialize a new one */
- new_on = kmalloc(sizeof(*new_on), GFP_KERNEL);
- if (!new_on) {
+ on = kzalloc(sizeof(*on), GFP_KERNEL);
+ if (!on) {
mutex_unlock(mutex);
return -ENOMEM;
}
- atomic_set(&new_on->event, 1);
- init_waitqueue_head(&new_on->poll);
- INIT_LIST_HEAD(&new_on->files);
- list_add_tail(&of->list, &new_on->files);
- rcu_assign_pointer(kn->attr.open, new_on);
+ atomic_set(&on->event, 1);
+ init_waitqueue_head(&on->poll);
+ INIT_LIST_HEAD(&on->files);
+ rcu_assign_pointer(kn->attr.open, on);
}
- mutex_unlock(mutex);
+ list_add_tail(&of->list, &on->files);
+ if (kn->flags & KERNFS_HAS_RELEASE)
+ on->nr_to_release++;
+
+ mutex_unlock(mutex);
return 0;
}
@@ -613,6 +593,7 @@ static int kernfs_get_open_node(struct kernfs_node *kn,
*
* @kn: target kernfs_node
* @of: associated kernfs_open_file
+ * @open_failed: ->open() failed, cancel ->release()
*
* Unlink @of from list of @kn's associated open files. If list of
* associated open files becomes empty, disassociate and free
@@ -622,21 +603,30 @@ static int kernfs_get_open_node(struct kernfs_node *kn,
* None.
*/
static void kernfs_unlink_open_file(struct kernfs_node *kn,
- struct kernfs_open_file *of)
+ struct kernfs_open_file *of,
+ bool open_failed)
{
struct kernfs_open_node *on;
- struct mutex *mutex = NULL;
+ struct mutex *mutex;
mutex = kernfs_open_file_mutex_lock(kn);
- on = kernfs_deref_open_node_protected(kn);
+ on = kernfs_deref_open_node_locked(kn);
if (!on) {
mutex_unlock(mutex);
return;
}
- if (of)
+ if (of) {
+ if (kn->flags & KERNFS_HAS_RELEASE) {
+ WARN_ON_ONCE(of->released == open_failed);
+ if (open_failed)
+ on->nr_to_release--;
+ }
+ if (of->mmapped)
+ on->nr_mmapped--;
list_del(&of->list);
+ }
if (list_empty(&on->files)) {
rcu_assign_pointer(kn->attr.open, NULL);
@@ -763,7 +753,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
return 0;
err_put_node:
- kernfs_unlink_open_file(kn, of);
+ kernfs_unlink_open_file(kn, of, true);
err_seq_release:
seq_release(inode, file);
err_free:
@@ -795,6 +785,7 @@ static void kernfs_release_file(struct kernfs_node *kn,
*/
kn->attr.ops->release(of);
of->released = true;
+ of_on(of)->nr_to_release--;
}
}
@@ -802,15 +793,16 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp)
{
struct kernfs_node *kn = inode->i_private;
struct kernfs_open_file *of = kernfs_of(filp);
- struct mutex *mutex = NULL;
if (kn->flags & KERNFS_HAS_RELEASE) {
+ struct mutex *mutex;
+
mutex = kernfs_open_file_mutex_lock(kn);
kernfs_release_file(kn, of);
mutex_unlock(mutex);
}
- kernfs_unlink_open_file(kn, of);
+ kernfs_unlink_open_file(kn, of, false);
seq_release(inode, filp);
kfree(of->prealloc_buf);
kfree(of);
@@ -818,28 +810,33 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp)
return 0;
}
-void kernfs_drain_open_files(struct kernfs_node *kn)
+bool kernfs_should_drain_open_files(struct kernfs_node *kn)
{
struct kernfs_open_node *on;
- struct kernfs_open_file *of;
- struct mutex *mutex = NULL;
-
- if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE)))
- return;
+ bool ret;
/*
- * lockless opportunistic check is safe below because no one is adding to
- * ->attr.open at this point of time. This check allows early bail out
- * if ->attr.open is already NULL. kernfs_unlink_open_file makes
- * ->attr.open NULL only while holding kernfs_open_file_mutex so below
- * check under kernfs_open_file_mutex_ptr(kn) will ensure bailing out if
- * ->attr.open became NULL while waiting for the mutex.
+ * @kn being deactivated guarantees that @kn->attr.open can't change
+ * beneath us making the lockless test below safe.
*/
- if (!rcu_access_pointer(kn->attr.open))
- return;
+ WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
+
+ rcu_read_lock();
+ on = rcu_dereference(kn->attr.open);
+ ret = on && (on->nr_mmapped || on->nr_to_release);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+void kernfs_drain_open_files(struct kernfs_node *kn)
+{
+ struct kernfs_open_node *on;
+ struct kernfs_open_file *of;
+ struct mutex *mutex;
mutex = kernfs_open_file_mutex_lock(kn);
- on = kernfs_deref_open_node_protected(kn);
+ on = kernfs_deref_open_node_locked(kn);
if (!on) {
mutex_unlock(mutex);
return;
@@ -848,13 +845,17 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
list_for_each_entry(of, &on->files, list) {
struct inode *inode = file_inode(of->file);
- if (kn->flags & KERNFS_HAS_MMAP)
+ if (of->mmapped) {
unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+ of->mmapped = false;
+ on->nr_mmapped--;
+ }
if (kn->flags & KERNFS_HAS_RELEASE)
kernfs_release_file(kn, of);
}
+ WARN_ON_ONCE(on->nr_mmapped || on->nr_to_release);
mutex_unlock(mutex);
}
@@ -874,11 +875,7 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
*/
__poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait)
{
- struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry);
- struct kernfs_open_node *on = kernfs_deref_open_node(of, kn);
-
- if (!on)
- return EPOLLERR;
+ struct kernfs_open_node *on = of_on(of);
poll_wait(of->file, &on->poll, wait);
@@ -1031,7 +1028,7 @@ const struct file_operations kernfs_file_fops = {
* @ns: optional namespace tag of the file
* @key: lockdep key for the file's active_ref, %NULL to disable lockdep
*
- * Returns the created node on success, ERR_PTR() value on error.
+ * Return: the created node on success, ERR_PTR() value on error.
*/
struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
const char *name,
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 3d783d80f5da..eac0f210299a 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -94,7 +94,7 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
* @kn: target node
* @iattr: iattr to set
*
- * Returns 0 on success, -errno on failure.
+ * Return: %0 on success, -errno on failure.
*/
int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
{
@@ -190,10 +190,8 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns,
struct kernfs_root *root = kernfs_root(kn);
down_read(&root->kernfs_rwsem);
- spin_lock(&inode->i_lock);
kernfs_refresh_inode(kn, inode);
generic_fillattr(&init_user_ns, inode, stat);
- spin_unlock(&inode->i_lock);
up_read(&root->kernfs_rwsem);
return 0;
@@ -241,11 +239,11 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
* allocated and basics are initialized. New inode is returned
* locked.
*
- * LOCKING:
+ * Locking:
* Kernel thread context (may sleep).
*
- * RETURNS:
- * Pointer to allocated inode on success, NULL on failure.
+ * Return:
+ * Pointer to allocated inode on success, %NULL on failure.
*/
struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
{
@@ -288,10 +286,8 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
root = kernfs_root(kn);
down_read(&root->kernfs_rwsem);
- spin_lock(&inode->i_lock);
kernfs_refresh_inode(kn, inode);
ret = generic_permission(&init_user_ns, inode, mask);
- spin_unlock(&inode->i_lock);
up_read(&root->kernfs_rwsem);
return ret;
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 3ae214d02d44..9046d9f39e63 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -58,7 +58,7 @@ struct kernfs_root {
* kernfs_root - find out the kernfs_root a kernfs_node belongs to
* @kn: kernfs_node of interest
*
- * Return the kernfs_root @kn belongs to.
+ * Return: the kernfs_root @kn belongs to.
*/
static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
{
@@ -157,6 +157,7 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
*/
extern const struct file_operations kernfs_file_fops;
+bool kernfs_should_drain_open_files(struct kernfs_node *kn);
void kernfs_drain_open_files(struct kernfs_node *kn);
/*
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index d0859f72d2d6..e08e8d999807 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -153,7 +153,7 @@ static const struct export_operations kernfs_export_ops = {
* kernfs_root_from_sb - determine kernfs_root associated with a super_block
* @sb: the super_block in question
*
- * Return the kernfs_root associated with @sb. If @sb is not a kernfs one,
+ * Return: the kernfs_root associated with @sb. If @sb is not a kernfs one,
* %NULL is returned.
*/
struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
@@ -167,7 +167,7 @@ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
* find the next ancestor in the path down to @child, where @parent was the
* ancestor whose descendant we want to find.
*
- * Say the path is /a/b/c/d. @child is d, @parent is NULL. We return the root
+ * Say the path is /a/b/c/d. @child is d, @parent is %NULL. We return the root
* node. If @parent is b, then we return the node for c.
* Passing in d as @parent is not ok.
*/
@@ -192,6 +192,8 @@ static struct kernfs_node *find_next_ancestor(struct kernfs_node *child,
* kernfs_node_dentry - get a dentry for the given kernfs_node
* @kn: kernfs_node for which a dentry is needed
* @sb: the kernfs super_block
+ *
+ * Return: the dentry pointer
*/
struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
struct super_block *sb)
@@ -296,7 +298,7 @@ static int kernfs_set_super(struct super_block *sb, struct fs_context *fc)
* kernfs_super_ns - determine the namespace tag of a kernfs super_block
* @sb: super_block of interest
*
- * Return the namespace tag associated with kernfs super_block @sb.
+ * Return: the namespace tag associated with kernfs super_block @sb.
*/
const void *kernfs_super_ns(struct super_block *sb)
{
@@ -313,6 +315,8 @@ const void *kernfs_super_ns(struct super_block *sb)
* implementation, which should set the specified ->@fs_type and ->@flags, and
* specify the hierarchy and namespace tag to mount via ->@root and ->@ns,
* respectively.
+ *
+ * Return: %0 on success, -errno on failure.
*/
int kernfs_get_tree(struct fs_context *fc)
{
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index 0ab13824822f..45371a70caa7 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -19,7 +19,7 @@
* @name: name of the symlink
* @target: target node for the symlink to point to
*
- * Returns the created node on success, ERR_PTR() value on error.
+ * Return: the created node on success, ERR_PTR() value on error.
* Ownership of the link matches ownership of the target.
*/
struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c
index c5a5c7b90d72..2a39ffb8423b 100644
--- a/fs/ksmbd/auth.c
+++ b/fs/ksmbd/auth.c
@@ -424,6 +424,9 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob,
NTLMSSP_NEGOTIATE_56);
}
+ if (cflags & NTLMSSP_NEGOTIATE_SEAL && smb3_encryption_negotiated(conn))
+ flags |= NTLMSSP_NEGOTIATE_SEAL;
+
if (cflags & NTLMSSP_NEGOTIATE_ALWAYS_SIGN)
flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
@@ -984,13 +987,16 @@ out:
return rc;
}
-static int ksmbd_get_encryption_key(struct ksmbd_conn *conn, __u64 ses_id,
+static int ksmbd_get_encryption_key(struct ksmbd_work *work, __u64 ses_id,
int enc, u8 *key)
{
struct ksmbd_session *sess;
u8 *ses_enc_key;
- sess = ksmbd_session_lookup_all(conn, ses_id);
+ if (enc)
+ sess = work->sess;
+ else
+ sess = ksmbd_session_lookup_all(work->conn, ses_id);
if (!sess)
return -EINVAL;
@@ -1078,9 +1084,10 @@ static struct scatterlist *ksmbd_init_sg(struct kvec *iov, unsigned int nvec,
return sg;
}
-int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov,
+int ksmbd_crypt_message(struct ksmbd_work *work, struct kvec *iov,
unsigned int nvec, int enc)
{
+ struct ksmbd_conn *conn = work->conn;
struct smb2_transform_hdr *tr_hdr = smb2_get_msg(iov[0].iov_base);
unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20;
int rc;
@@ -1094,7 +1101,7 @@ int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov,
unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
struct ksmbd_crypto_ctx *ctx;
- rc = ksmbd_get_encryption_key(conn,
+ rc = ksmbd_get_encryption_key(work,
le64_to_cpu(tr_hdr->SessionId),
enc,
key);
diff --git a/fs/ksmbd/auth.h b/fs/ksmbd/auth.h
index 25b772653de0..362b6159a6cf 100644
--- a/fs/ksmbd/auth.h
+++ b/fs/ksmbd/auth.h
@@ -33,9 +33,10 @@
struct ksmbd_session;
struct ksmbd_conn;
+struct ksmbd_work;
struct kvec;
-int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov,
+int ksmbd_crypt_message(struct ksmbd_work *work, struct kvec *iov,
unsigned int nvec, int enc);
void ksmbd_copy_gss_neg_header(void *buf);
int ksmbd_auth_ntlmv2(struct ksmbd_conn *conn, struct ksmbd_session *sess,
diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c
index 756ad631c019..12be8386446a 100644
--- a/fs/ksmbd/connection.c
+++ b/fs/ksmbd/connection.c
@@ -60,6 +60,12 @@ struct ksmbd_conn *ksmbd_conn_alloc(void)
conn->local_nls = load_nls("utf8");
if (!conn->local_nls)
conn->local_nls = load_nls_default();
+ if (IS_ENABLED(CONFIG_UNICODE))
+ conn->um = utf8_load(UNICODE_AGE(12, 1, 0));
+ else
+ conn->um = ERR_PTR(-EOPNOTSUPP);
+ if (IS_ERR(conn->um))
+ conn->um = NULL;
atomic_set(&conn->req_running, 0);
atomic_set(&conn->r_count, 0);
conn->total_credits = 1;
@@ -350,6 +356,8 @@ out:
wait_event(conn->r_count_q, atomic_read(&conn->r_count) == 0);
+ if (IS_ENABLED(CONFIG_UNICODE))
+ utf8_unload(conn->um);
unload_nls(conn->local_nls);
if (default_conn_ops.terminate_fn)
default_conn_ops.terminate_fn(conn);
diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h
index e7f7d5707951..3643354a3fa7 100644
--- a/fs/ksmbd/connection.h
+++ b/fs/ksmbd/connection.h
@@ -14,6 +14,7 @@
#include <net/request_sock.h>
#include <linux/kthread.h>
#include <linux/nls.h>
+#include <linux/unicode.h>
#include "smb_common.h"
#include "ksmbd_work.h"
@@ -46,6 +47,7 @@ struct ksmbd_conn {
char *request_buf;
struct ksmbd_transport *transport;
struct nls_table *local_nls;
+ struct unicode_map *um;
struct list_head conns_list;
/* smb session 1 per user */
struct xarray sessions;
diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h
index e0cbcfa98c7e..b6bd8311e6b4 100644
--- a/fs/ksmbd/ksmbd_netlink.h
+++ b/fs/ksmbd/ksmbd_netlink.h
@@ -74,6 +74,7 @@ struct ksmbd_heartbeat {
#define KSMBD_GLOBAL_FLAG_SMB2_LEASES BIT(0)
#define KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION BIT(1)
#define KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL BIT(2)
+#define KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF BIT(3)
/*
* IPC request for ksmbd server startup
@@ -163,7 +164,8 @@ struct ksmbd_share_config_response {
__u16 force_directory_mode;
__u16 force_uid;
__u16 force_gid;
- __u32 reserved[128]; /* Reserved room */
+ __s8 share_name[KSMBD_REQ_MAX_SHARE_NAME];
+ __u32 reserved[112]; /* Reserved room */
__u32 veto_list_sz;
__s8 ____payload[];
};
diff --git a/fs/ksmbd/mgmt/share_config.c b/fs/ksmbd/mgmt/share_config.c
index c9bca1c2c834..328a412259dc 100644
--- a/fs/ksmbd/mgmt/share_config.c
+++ b/fs/ksmbd/mgmt/share_config.c
@@ -16,6 +16,7 @@
#include "user_config.h"
#include "user_session.h"
#include "../transport_ipc.h"
+#include "../misc.h"
#define SHARE_HASH_BITS 3
static DEFINE_HASHTABLE(shares_table, SHARE_HASH_BITS);
@@ -26,7 +27,7 @@ struct ksmbd_veto_pattern {
struct list_head list;
};
-static unsigned int share_name_hash(char *name)
+static unsigned int share_name_hash(const char *name)
{
return jhash(name, strlen(name), 0);
}
@@ -72,7 +73,7 @@ __get_share_config(struct ksmbd_share_config *share)
return share;
}
-static struct ksmbd_share_config *__share_lookup(char *name)
+static struct ksmbd_share_config *__share_lookup(const char *name)
{
struct ksmbd_share_config *share;
unsigned int key = share_name_hash(name);
@@ -119,7 +120,8 @@ static int parse_veto_list(struct ksmbd_share_config *share,
return 0;
}
-static struct ksmbd_share_config *share_config_request(char *name)
+static struct ksmbd_share_config *share_config_request(struct unicode_map *um,
+ const char *name)
{
struct ksmbd_share_config_response *resp;
struct ksmbd_share_config *share = NULL;
@@ -133,6 +135,19 @@ static struct ksmbd_share_config *share_config_request(char *name)
if (resp->flags == KSMBD_SHARE_FLAG_INVALID)
goto out;
+ if (*resp->share_name) {
+ char *cf_resp_name;
+ bool equal;
+
+ cf_resp_name = ksmbd_casefold_sharename(um, resp->share_name);
+ if (IS_ERR(cf_resp_name))
+ goto out;
+ equal = !strcmp(cf_resp_name, name);
+ kfree(cf_resp_name);
+ if (!equal)
+ goto out;
+ }
+
share = kzalloc(sizeof(struct ksmbd_share_config), GFP_KERNEL);
if (!share)
goto out;
@@ -190,20 +205,11 @@ out:
return share;
}
-static void strtolower(char *share_name)
-{
- while (*share_name) {
- *share_name = tolower(*share_name);
- share_name++;
- }
-}
-
-struct ksmbd_share_config *ksmbd_share_config_get(char *name)
+struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um,
+ const char *name)
{
struct ksmbd_share_config *share;
- strtolower(name);
-
down_read(&shares_table_lock);
share = __share_lookup(name);
if (share)
@@ -212,7 +218,7 @@ struct ksmbd_share_config *ksmbd_share_config_get(char *name)
if (share)
return share;
- return share_config_request(name);
+ return share_config_request(um, name);
}
bool ksmbd_share_veto_filename(struct ksmbd_share_config *share,
diff --git a/fs/ksmbd/mgmt/share_config.h b/fs/ksmbd/mgmt/share_config.h
index 902f2cb1963a..3fd338293942 100644
--- a/fs/ksmbd/mgmt/share_config.h
+++ b/fs/ksmbd/mgmt/share_config.h
@@ -9,6 +9,7 @@
#include <linux/workqueue.h>
#include <linux/hashtable.h>
#include <linux/path.h>
+#include <linux/unicode.h>
struct ksmbd_share_config {
char *name;
@@ -74,7 +75,8 @@ static inline void ksmbd_share_config_put(struct ksmbd_share_config *share)
__ksmbd_share_config_put(share);
}
-struct ksmbd_share_config *ksmbd_share_config_get(char *name);
+struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um,
+ const char *name);
bool ksmbd_share_veto_filename(struct ksmbd_share_config *share,
const char *filename);
#endif /* __SHARE_CONFIG_MANAGEMENT_H__ */
diff --git a/fs/ksmbd/mgmt/tree_connect.c b/fs/ksmbd/mgmt/tree_connect.c
index 97ab7987df6e..8ce17b3fb8da 100644
--- a/fs/ksmbd/mgmt/tree_connect.c
+++ b/fs/ksmbd/mgmt/tree_connect.c
@@ -17,7 +17,7 @@
struct ksmbd_tree_conn_status
ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess,
- char *share_name)
+ const char *share_name)
{
struct ksmbd_tree_conn_status status = {-ENOENT, NULL};
struct ksmbd_tree_connect_response *resp = NULL;
@@ -26,7 +26,7 @@ ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess,
struct sockaddr *peer_addr;
int ret;
- sc = ksmbd_share_config_get(share_name);
+ sc = ksmbd_share_config_get(conn->um, share_name);
if (!sc)
return status;
@@ -61,7 +61,7 @@ ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess,
struct ksmbd_share_config *new_sc;
ksmbd_share_config_del(sc);
- new_sc = ksmbd_share_config_get(share_name);
+ new_sc = ksmbd_share_config_get(conn->um, share_name);
if (!new_sc) {
pr_err("Failed to update stale share config\n");
status.ret = -ESTALE;
diff --git a/fs/ksmbd/mgmt/tree_connect.h b/fs/ksmbd/mgmt/tree_connect.h
index 71e50271dccf..0f97ddc1e39c 100644
--- a/fs/ksmbd/mgmt/tree_connect.h
+++ b/fs/ksmbd/mgmt/tree_connect.h
@@ -42,7 +42,7 @@ struct ksmbd_session;
struct ksmbd_tree_conn_status
ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess,
- char *share_name);
+ const char *share_name);
int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess,
struct ksmbd_tree_connect *tree_conn);
diff --git a/fs/ksmbd/mgmt/user_session.c b/fs/ksmbd/mgmt/user_session.c
index 3fa2139a0b30..92b1603b5abe 100644
--- a/fs/ksmbd/mgmt/user_session.c
+++ b/fs/ksmbd/mgmt/user_session.c
@@ -108,15 +108,17 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name)
entry->method = method;
entry->id = ksmbd_ipc_id_alloc();
if (entry->id < 0)
- goto error;
+ goto free_entry;
resp = ksmbd_rpc_open(sess, entry->id);
if (!resp)
- goto error;
+ goto free_id;
kvfree(resp);
return entry->id;
-error:
+free_id:
+ ksmbd_rpc_id_free(entry->id);
+free_entry:
list_del(&entry->list);
kfree(entry);
return -EINVAL;
diff --git a/fs/ksmbd/misc.c b/fs/ksmbd/misc.c
index df991107ad2c..9e8afaa686e3 100644
--- a/fs/ksmbd/misc.c
+++ b/fs/ksmbd/misc.c
@@ -7,6 +7,7 @@
#include <linux/kernel.h>
#include <linux/xattr.h>
#include <linux/fs.h>
+#include <linux/unicode.h>
#include "misc.h"
#include "smb_common.h"
@@ -159,7 +160,7 @@ out:
*/
char *convert_to_nt_pathname(struct ksmbd_share_config *share,
- struct path *path)
+ const struct path *path)
{
char *pathname, *ab_pathname, *nt_pathname;
int share_path_len = share->path_sz;
@@ -226,26 +227,53 @@ void ksmbd_conv_path_to_windows(char *path)
strreplace(path, '/', '\\');
}
+char *ksmbd_casefold_sharename(struct unicode_map *um, const char *name)
+{
+ char *cf_name;
+ int cf_len;
+
+ cf_name = kzalloc(KSMBD_REQ_MAX_SHARE_NAME, GFP_KERNEL);
+ if (!cf_name)
+ return ERR_PTR(-ENOMEM);
+
+ if (IS_ENABLED(CONFIG_UNICODE) && um) {
+ const struct qstr q_name = {.name = name, .len = strlen(name)};
+
+ cf_len = utf8_casefold(um, &q_name, cf_name,
+ KSMBD_REQ_MAX_SHARE_NAME);
+ if (cf_len < 0)
+ goto out_ascii;
+
+ return cf_name;
+ }
+
+out_ascii:
+ cf_len = strscpy(cf_name, name, KSMBD_REQ_MAX_SHARE_NAME);
+ if (cf_len < 0) {
+ kfree(cf_name);
+ return ERR_PTR(-E2BIG);
+ }
+
+ for (; *cf_name; ++cf_name)
+ *cf_name = isascii(*cf_name) ? tolower(*cf_name) : *cf_name;
+ return cf_name - cf_len;
+}
+
/**
* ksmbd_extract_sharename() - get share name from tree connect request
* @treename: buffer containing tree name and share name
*
* Return: share name on success, otherwise error
*/
-char *ksmbd_extract_sharename(char *treename)
+char *ksmbd_extract_sharename(struct unicode_map *um, const char *treename)
{
- char *name = treename;
- char *dst;
- char *pos = strrchr(name, '\\');
+ const char *name = treename, *pos = strrchr(name, '\\');
if (pos)
name = (pos + 1);
/* caller has to free the memory */
- dst = kstrdup(name, GFP_KERNEL);
- if (!dst)
- return ERR_PTR(-ENOMEM);
- return dst;
+ return ksmbd_casefold_sharename(um, name);
}
/**
diff --git a/fs/ksmbd/misc.h b/fs/ksmbd/misc.h
index aae2a252945f..1facfcd21200 100644
--- a/fs/ksmbd/misc.h
+++ b/fs/ksmbd/misc.h
@@ -15,12 +15,13 @@ int match_pattern(const char *str, size_t len, const char *pattern);
int ksmbd_validate_filename(char *filename);
int parse_stream_name(char *filename, char **stream_name, int *s_type);
char *convert_to_nt_pathname(struct ksmbd_share_config *share,
- struct path *path);
+ const struct path *path);
int get_nlink(struct kstat *st);
void ksmbd_conv_path_to_unix(char *path);
void ksmbd_strip_last_slash(char *path);
void ksmbd_conv_path_to_windows(char *path);
-char *ksmbd_extract_sharename(char *treename);
+char *ksmbd_casefold_sharename(struct unicode_map *um, const char *name);
+char *ksmbd_extract_sharename(struct unicode_map *um, const char *treename);
char *convert_to_unix_name(struct ksmbd_share_config *share, const char *name);
#define KSMBD_DIR_INFO_ALIGNMENT 8
diff --git a/fs/ksmbd/ndr.c b/fs/ksmbd/ndr.c
index 5052be9261d9..0ae8d08d85a8 100644
--- a/fs/ksmbd/ndr.c
+++ b/fs/ksmbd/ndr.c
@@ -345,6 +345,8 @@ int ndr_encode_posix_acl(struct ndr *n,
{
unsigned int ref_id = 0x00020000;
int ret;
+ vfsuid_t vfsuid;
+ vfsgid_t vfsgid;
n->offset = 0;
n->length = 1024;
@@ -372,10 +374,12 @@ int ndr_encode_posix_acl(struct ndr *n,
if (ret)
return ret;
- ret = ndr_write_int64(n, from_kuid(&init_user_ns, i_uid_into_mnt(user_ns, inode)));
+ vfsuid = i_uid_into_vfsuid(user_ns, inode);
+ ret = ndr_write_int64(n, from_kuid(&init_user_ns, vfsuid_into_kuid(vfsuid)));
if (ret)
return ret;
- ret = ndr_write_int64(n, from_kgid(&init_user_ns, i_gid_into_mnt(user_ns, inode)));
+ vfsgid = i_gid_into_vfsgid(user_ns, inode);
+ ret = ndr_write_int64(n, from_kgid(&init_user_ns, vfsgid_into_kgid(vfsgid)));
if (ret)
return ret;
ret = ndr_write_int32(n, inode->i_mode);
diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c
index 9046cff4374b..d7d47b82451d 100644
--- a/fs/ksmbd/oplock.c
+++ b/fs/ksmbd/oplock.c
@@ -1609,12 +1609,18 @@ void create_posix_rsp_buf(char *cc, struct ksmbd_file *fp)
struct create_posix_rsp *buf;
struct inode *inode = file_inode(fp->filp);
struct user_namespace *user_ns = file_mnt_user_ns(fp->filp);
+ vfsuid_t vfsuid = i_uid_into_vfsuid(user_ns, inode);
+ vfsgid_t vfsgid = i_gid_into_vfsgid(user_ns, inode);
buf = (struct create_posix_rsp *)cc;
memset(buf, 0, sizeof(struct create_posix_rsp));
buf->ccontext.DataOffset = cpu_to_le16(offsetof
(struct create_posix_rsp, nlink));
- buf->ccontext.DataLength = cpu_to_le32(52);
+ /*
+ * DataLength = nlink(4) + reparse_tag(4) + mode(4) +
+ * domain sid(28) + unix group sid(16).
+ */
+ buf->ccontext.DataLength = cpu_to_le32(56);
buf->ccontext.NameOffset = cpu_to_le16(offsetof
(struct create_posix_rsp, Name));
buf->ccontext.NameLength = cpu_to_le16(POSIX_CTXT_DATA_LEN);
@@ -1638,13 +1644,18 @@ void create_posix_rsp_buf(char *cc, struct ksmbd_file *fp)
buf->nlink = cpu_to_le32(inode->i_nlink);
buf->reparse_tag = cpu_to_le32(fp->volatile_id);
- buf->mode = cpu_to_le32(inode->i_mode);
- id_to_sid(from_kuid_munged(&init_user_ns,
- i_uid_into_mnt(user_ns, inode)),
- SIDNFS_USER, (struct smb_sid *)&buf->SidBuffer[0]);
- id_to_sid(from_kgid_munged(&init_user_ns,
- i_gid_into_mnt(user_ns, inode)),
- SIDNFS_GROUP, (struct smb_sid *)&buf->SidBuffer[20]);
+ buf->mode = cpu_to_le32(inode->i_mode & 0777);
+ /*
+ * SidBuffer(44) contain two sids(Domain sid(28), UNIX group sid(16)).
+ * Domain sid(28) = revision(1) + num_subauth(1) + authority(6) +
+ * sub_auth(4 * 4(num_subauth)) + RID(4).
+ * UNIX group id(16) = revision(1) + num_subauth(1) + authority(6) +
+ * sub_auth(4 * 1(num_subauth)) + RID(4).
+ */
+ id_to_sid(from_kuid_munged(&init_user_ns, vfsuid_into_kuid(vfsuid)),
+ SIDOWNER, (struct smb_sid *)&buf->SidBuffer[0]);
+ id_to_sid(from_kgid_munged(&init_user_ns, vfsgid_into_kgid(vfsgid)),
+ SIDUNIX_GROUP, (struct smb_sid *)&buf->SidBuffer[28]);
}
/*
diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c
index ce42bff42ef9..394b6ceac431 100644
--- a/fs/ksmbd/server.c
+++ b/fs/ksmbd/server.c
@@ -235,10 +235,8 @@ send:
if (work->sess && work->sess->enc && work->encrypted &&
conn->ops->encrypt_resp) {
rc = conn->ops->encrypt_resp(work);
- if (rc < 0) {
+ if (rc < 0)
conn->ops->set_rsp_status(work, STATUS_DATA_ERROR);
- goto send;
- }
}
ksmbd_conn_write(work);
@@ -434,11 +432,9 @@ static ssize_t stats_show(struct class *class, struct class_attribute *attr,
"reset",
"shutdown"
};
-
- ssize_t sz = scnprintf(buf, PAGE_SIZE, "%d %s %d %lu\n", stats_version,
- state[server_conf.state], server_conf.tcp_port,
- server_conf.ipc_last_active / HZ);
- return sz;
+ return sysfs_emit(buf, "%d %s %d %lu\n", stats_version,
+ state[server_conf.state], server_conf.tcp_port,
+ server_conf.ipc_last_active / HZ);
}
static ssize_t kill_server_store(struct class *class,
@@ -470,19 +466,13 @@ static ssize_t debug_show(struct class *class, struct class_attribute *attr,
for (i = 0; i < ARRAY_SIZE(debug_type_strings); i++) {
if ((ksmbd_debug_types >> i) & 1) {
- pos = scnprintf(buf + sz,
- PAGE_SIZE - sz,
- "[%s] ",
- debug_type_strings[i]);
+ pos = sysfs_emit_at(buf, sz, "[%s] ", debug_type_strings[i]);
} else {
- pos = scnprintf(buf + sz,
- PAGE_SIZE - sz,
- "%s ",
- debug_type_strings[i]);
+ pos = sysfs_emit_at(buf, sz, "%s ", debug_type_strings[i]);
}
sz += pos;
}
- sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n");
+ sz += sysfs_emit_at(buf, sz, "\n");
return sz;
}
diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c
index ab23da2120b9..e401302478c3 100644
--- a/fs/ksmbd/smb2ops.c
+++ b/fs/ksmbd/smb2ops.c
@@ -247,8 +247,9 @@ void init_smb3_02_server(struct ksmbd_conn *conn)
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES)
conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING;
- if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION &&
- conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION)
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION ||
+ (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) &&
+ conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION))
conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION;
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL)
@@ -271,6 +272,11 @@ int init_smb3_11_server(struct ksmbd_conn *conn)
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES)
conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING;
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION ||
+ (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) &&
+ conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION))
+ conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION;
+
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL)
conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL;
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index 19412ac701a6..14d7f3599c63 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -903,7 +903,7 @@ static void decode_encrypt_ctxt(struct ksmbd_conn *conn,
return;
}
- if (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION))
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF)
return;
for (i = 0; i < cph_cnt; i++) {
@@ -925,7 +925,7 @@ static void decode_encrypt_ctxt(struct ksmbd_conn *conn,
*
* Return: true if connection should be encrypted, else false
*/
-static bool smb3_encryption_negotiated(struct ksmbd_conn *conn)
+bool smb3_encryption_negotiated(struct ksmbd_conn *conn)
{
if (!conn->ops->generate_encryptionkey)
return false;
@@ -1508,7 +1508,8 @@ static int ntlm_authenticate(struct ksmbd_work *work)
return -EINVAL;
}
sess->enc = true;
- rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE;
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION)
+ rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE;
/*
* signing is disable if encryption is enable
* on this session
@@ -1599,7 +1600,8 @@ static int krb5_authenticate(struct ksmbd_work *work)
return -EINVAL;
}
sess->enc = true;
- rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE;
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION)
+ rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE;
sess->sign = false;
}
@@ -1883,7 +1885,7 @@ int smb2_tree_connect(struct ksmbd_work *work)
goto out_err1;
}
- name = ksmbd_extract_sharename(treename);
+ name = ksmbd_extract_sharename(conn->um, treename);
if (IS_ERR(name)) {
status.ret = KSMBD_TREE_CONN_STATUS_ERROR;
goto out_err1;
@@ -2185,7 +2187,7 @@ out:
* Return: 0 on success, otherwise error
*/
static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len,
- struct path *path)
+ const struct path *path)
{
struct user_namespace *user_ns = mnt_user_ns(path->mnt);
char *attr_name = NULL, *value;
@@ -2272,7 +2274,7 @@ next:
return rc;
}
-static noinline int smb2_set_stream_name_xattr(struct path *path,
+static noinline int smb2_set_stream_name_xattr(const struct path *path,
struct ksmbd_file *fp,
char *stream_name, int s_type)
{
@@ -2311,7 +2313,7 @@ static noinline int smb2_set_stream_name_xattr(struct path *path,
return 0;
}
-static int smb2_remove_smb_xattrs(struct path *path)
+static int smb2_remove_smb_xattrs(const struct path *path)
{
struct user_namespace *user_ns = mnt_user_ns(path->mnt);
char *name, *xattr_list = NULL;
@@ -2345,7 +2347,7 @@ out:
return err;
}
-static int smb2_create_truncate(struct path *path)
+static int smb2_create_truncate(const struct path *path)
{
int rc = vfs_truncate(path, 0);
@@ -2364,7 +2366,7 @@ static int smb2_create_truncate(struct path *path)
return rc;
}
-static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, struct path *path,
+static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, const struct path *path,
struct ksmbd_file *fp)
{
struct xattr_dos_attrib da = {0};
@@ -2387,7 +2389,7 @@ static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, struct path *path,
}
static void smb2_update_xattrs(struct ksmbd_tree_connect *tcon,
- struct path *path, struct ksmbd_file *fp)
+ const struct path *path, struct ksmbd_file *fp)
{
struct xattr_dos_attrib da;
int rc;
@@ -2447,7 +2449,7 @@ static int smb2_creat(struct ksmbd_work *work, struct path *path, char *name,
static int smb2_create_sd_buffer(struct ksmbd_work *work,
struct smb2_create_req *req,
- struct path *path)
+ const struct path *path)
{
struct create_context *context;
struct create_sd_buf_req *sd_buf;
@@ -2477,16 +2479,19 @@ static void ksmbd_acls_fattr(struct smb_fattr *fattr,
struct user_namespace *mnt_userns,
struct inode *inode)
{
- fattr->cf_uid = i_uid_into_mnt(mnt_userns, inode);
- fattr->cf_gid = i_gid_into_mnt(mnt_userns, inode);
+ vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+
+ fattr->cf_uid = vfsuid_into_kuid(vfsuid);
+ fattr->cf_gid = vfsgid_into_kgid(vfsgid);
fattr->cf_mode = inode->i_mode;
fattr->cf_acls = NULL;
fattr->cf_dacls = NULL;
if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) {
- fattr->cf_acls = get_acl(inode, ACL_TYPE_ACCESS);
+ fattr->cf_acls = get_inode_acl(inode, ACL_TYPE_ACCESS);
if (S_ISDIR(inode->i_mode))
- fattr->cf_dacls = get_acl(inode, ACL_TYPE_DEFAULT);
+ fattr->cf_dacls = get_inode_acl(inode, ACL_TYPE_DEFAULT);
}
}
@@ -2761,7 +2766,6 @@ int smb2_open(struct ksmbd_work *work)
} else {
file_present = true;
user_ns = mnt_user_ns(path.mnt);
- generic_fillattr(user_ns, d_inode(path.dentry), &stat);
}
if (stream_name) {
if (req->CreateOptions & FILE_DIRECTORY_FILE_LE) {
@@ -2770,7 +2774,8 @@ int smb2_open(struct ksmbd_work *work)
rsp->hdr.Status = STATUS_NOT_A_DIRECTORY;
}
} else {
- if (S_ISDIR(stat.mode) && s_type == DATA_STREAM) {
+ if (file_present && S_ISDIR(d_inode(path.dentry)->i_mode) &&
+ s_type == DATA_STREAM) {
rc = -EIO;
rsp->hdr.Status = STATUS_FILE_IS_A_DIRECTORY;
}
@@ -2787,7 +2792,8 @@ int smb2_open(struct ksmbd_work *work)
}
if (file_present && req->CreateOptions & FILE_NON_DIRECTORY_FILE_LE &&
- S_ISDIR(stat.mode) && !(req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) {
+ S_ISDIR(d_inode(path.dentry)->i_mode) &&
+ !(req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) {
ksmbd_debug(SMB, "open() argument is a directory: %s, %x\n",
name, req->CreateOptions);
rsp->hdr.Status = STATUS_FILE_IS_A_DIRECTORY;
@@ -2797,7 +2803,7 @@ int smb2_open(struct ksmbd_work *work)
if (file_present && (req->CreateOptions & FILE_DIRECTORY_FILE_LE) &&
!(req->CreateDisposition == FILE_CREATE_LE) &&
- !S_ISDIR(stat.mode)) {
+ !S_ISDIR(d_inode(path.dentry)->i_mode)) {
rsp->hdr.Status = STATUS_NOT_A_DIRECTORY;
rc = -EIO;
goto err_out;
@@ -2952,7 +2958,7 @@ int smb2_open(struct ksmbd_work *work)
struct inode *inode = d_inode(path.dentry);
posix_acl_rc = ksmbd_vfs_inherit_posix_acl(user_ns,
- inode,
+ path.dentry,
d_inode(path.dentry->d_parent));
if (posix_acl_rc)
ksmbd_debug(SMB, "inherit posix acl failed : %d\n", posix_acl_rc);
@@ -2968,7 +2974,7 @@ int smb2_open(struct ksmbd_work *work)
if (rc) {
if (posix_acl_rc)
ksmbd_vfs_set_init_posix_acl(user_ns,
- inode);
+ path.dentry);
if (test_share_config_flag(work->tcon->share_conf,
KSMBD_SHARE_FLAG_ACL_XATTR)) {
@@ -3434,7 +3440,7 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level,
goto free_conv_name;
}
- struct_sz = readdir_info_level_struct_sz(info_level) - 1 + conv_len;
+ struct_sz = readdir_info_level_struct_sz(info_level) + conv_len;
next_entry_offset = ALIGN(struct_sz, KSMBD_DIR_INFO_ALIGNMENT);
d_info->last_entry_off_align = next_entry_offset - struct_sz;
@@ -3561,17 +3567,22 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level,
posix_info->AllocationSize = cpu_to_le64(ksmbd_kstat->kstat->blocks << 9);
posix_info->DeviceId = cpu_to_le32(ksmbd_kstat->kstat->rdev);
posix_info->HardLinks = cpu_to_le32(ksmbd_kstat->kstat->nlink);
- posix_info->Mode = cpu_to_le32(ksmbd_kstat->kstat->mode);
+ posix_info->Mode = cpu_to_le32(ksmbd_kstat->kstat->mode & 0777);
posix_info->Inode = cpu_to_le64(ksmbd_kstat->kstat->ino);
posix_info->DosAttributes =
S_ISDIR(ksmbd_kstat->kstat->mode) ?
FILE_ATTRIBUTE_DIRECTORY_LE : FILE_ATTRIBUTE_ARCHIVE_LE;
if (d_info->hide_dot_file && d_info->name[0] == '.')
posix_info->DosAttributes |= FILE_ATTRIBUTE_HIDDEN_LE;
+ /*
+ * SidBuffer(32) contain two sids(Domain sid(16), UNIX group sid(16)).
+ * UNIX sid(16) = revision(1) + num_subauth(1) + authority(6) +
+ * sub_auth(4 * 1(num_subauth)) + RID(4).
+ */
id_to_sid(from_kuid_munged(&init_user_ns, ksmbd_kstat->kstat->uid),
- SIDNFS_USER, (struct smb_sid *)&posix_info->SidBuffer[0]);
+ SIDUNIX_USER, (struct smb_sid *)&posix_info->SidBuffer[0]);
id_to_sid(from_kgid_munged(&init_user_ns, ksmbd_kstat->kstat->gid),
- SIDNFS_GROUP, (struct smb_sid *)&posix_info->SidBuffer[20]);
+ SIDUNIX_GROUP, (struct smb_sid *)&posix_info->SidBuffer[16]);
memcpy(posix_info->name, conv_name, conv_len);
posix_info->name_len = cpu_to_le32(conv_len);
posix_info->NextEntryOffset = cpu_to_le32(next_entry_offset);
@@ -3681,7 +3692,7 @@ static int reserve_populate_dentry(struct ksmbd_dir_info *d_info,
return -EOPNOTSUPP;
conv_len = (d_info->name_len + 1) * 2;
- next_entry_offset = ALIGN(struct_sz - 1 + conv_len,
+ next_entry_offset = ALIGN(struct_sz + conv_len,
KSMBD_DIR_INFO_ALIGNMENT);
if (next_entry_offset > d_info->out_buf_len) {
@@ -3776,7 +3787,7 @@ static int reserve_populate_dentry(struct ksmbd_dir_info *d_info,
return 0;
}
-static int __query_dir(struct dir_context *ctx, const char *name, int namlen,
+static bool __query_dir(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct ksmbd_readdir_data *buf;
@@ -3790,27 +3801,20 @@ static int __query_dir(struct dir_context *ctx, const char *name, int namlen,
/* dot and dotdot entries are already reserved */
if (!strcmp(".", name) || !strcmp("..", name))
- return 0;
+ return true;
if (ksmbd_share_veto_filename(priv->work->tcon->share_conf, name))
- return 0;
+ return true;
if (!match_pattern(name, namlen, priv->search_pattern))
- return 0;
+ return true;
d_info->name = name;
d_info->name_len = namlen;
rc = reserve_populate_dentry(d_info, priv->info_level);
if (rc)
- return rc;
- if (d_info->flags & SMB2_RETURN_SINGLE_ENTRY) {
+ return false;
+ if (d_info->flags & SMB2_RETURN_SINGLE_ENTRY)
d_info->out_buf_len = 0;
- return 0;
- }
- return 0;
-}
-
-static void restart_ctx(struct dir_context *ctx)
-{
- ctx->pos = 0;
+ return true;
}
static int verify_info_level(int info_level)
@@ -3894,8 +3898,7 @@ int smb2_query_dir(struct ksmbd_work *work)
inode_permission(file_mnt_user_ns(dir_fp->filp),
file_inode(dir_fp->filp),
MAY_READ | MAY_EXEC)) {
- pr_err("no right to enumerate directory (%pd)\n",
- dir_fp->filp->f_path.dentry);
+ pr_err("no right to enumerate directory (%pD)\n", dir_fp->filp);
rc = -EACCES;
goto err_out2;
}
@@ -3921,7 +3924,6 @@ int smb2_query_dir(struct ksmbd_work *work)
if (srch_flag & SMB2_REOPEN || srch_flag & SMB2_RESTART_SCANS) {
ksmbd_debug(SMB, "Restart directory scan\n");
generic_file_llseek(dir_fp->filp, 0, SEEK_SET);
- restart_ctx(&dir_fp->readdir_data.ctx);
}
memset(&d_info, 0, sizeof(struct ksmbd_dir_info));
@@ -3968,11 +3970,9 @@ int smb2_query_dir(struct ksmbd_work *work)
*/
if (!d_info.out_buf_len && !d_info.num_entry)
goto no_buf_len;
- if (rc == 0)
- restart_ctx(&dir_fp->readdir_data.ctx);
- if (rc == -ENOSPC)
+ if (rc > 0 || rc == -ENOSPC)
rc = 0;
- if (rc)
+ else if (rc)
goto err_out;
d_info.wptr = d_info.rptr;
@@ -4029,6 +4029,8 @@ err_out2:
rsp->hdr.Status = STATUS_NO_MEMORY;
else if (rc == -EFAULT)
rsp->hdr.Status = STATUS_INVALID_INFO_CLASS;
+ else if (rc == -EIO)
+ rsp->hdr.Status = STATUS_FILE_CORRUPT_ERROR;
if (!rsp->hdr.Status)
rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR;
@@ -4158,7 +4160,7 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp,
int rc, name_len, value_len, xattr_list_len, idx;
ssize_t buf_free_len, alignment_bytes, next_offset, rsp_data_cnt = 0;
struct smb2_ea_info_req *ea_req = NULL;
- struct path *path;
+ const struct path *path;
struct user_namespace *user_ns = file_mnt_user_ns(fp->filp);
if (!(fp->daccess & FILE_READ_EA_LE)) {
@@ -4495,7 +4497,7 @@ static void get_file_stream_info(struct ksmbd_work *work,
struct smb2_file_stream_info *file_info;
char *stream_name, *xattr_list = NULL, *stream_buf;
struct kstat stat;
- struct path *path = &fp->filp->f_path;
+ const struct path *path = &fp->filp->f_path;
ssize_t xattr_list_len;
int nbytes = 0, streamlen, stream_name_len, next, idx = 0;
int buf_free_len;
@@ -4720,7 +4722,11 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp,
{
struct smb311_posix_qinfo *file_info;
struct inode *inode = file_inode(fp->filp);
+ struct user_namespace *user_ns = file_mnt_user_ns(fp->filp);
+ vfsuid_t vfsuid = i_uid_into_vfsuid(user_ns, inode);
+ vfsgid_t vfsgid = i_gid_into_vfsgid(user_ns, inode);
u64 time;
+ int out_buf_len = sizeof(struct smb311_posix_qinfo) + 32;
file_info = (struct smb311_posix_qinfo *)rsp->Buffer;
file_info->CreationTime = cpu_to_le64(fp->create_time);
@@ -4735,12 +4741,22 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp,
file_info->EndOfFile = cpu_to_le64(inode->i_size);
file_info->AllocationSize = cpu_to_le64(inode->i_blocks << 9);
file_info->HardLinks = cpu_to_le32(inode->i_nlink);
- file_info->Mode = cpu_to_le32(inode->i_mode);
+ file_info->Mode = cpu_to_le32(inode->i_mode & 0777);
file_info->DeviceId = cpu_to_le32(inode->i_rdev);
- rsp->OutputBufferLength =
- cpu_to_le32(sizeof(struct smb311_posix_qinfo));
- inc_rfc1001_len(rsp_org, sizeof(struct smb311_posix_qinfo));
- return 0;
+
+ /*
+ * Sids(32) contain two sids(Domain sid(16), UNIX group sid(16)).
+ * UNIX sid(16) = revision(1) + num_subauth(1) + authority(6) +
+ * sub_auth(4 * 1(num_subauth)) + RID(4).
+ */
+ id_to_sid(from_kuid_munged(&init_user_ns, vfsuid_into_kuid(vfsuid)),
+ SIDUNIX_USER, (struct smb_sid *)&file_info->Sids[0]);
+ id_to_sid(from_kgid_munged(&init_user_ns, vfsgid_into_kgid(vfsgid)),
+ SIDUNIX_GROUP, (struct smb_sid *)&file_info->Sids[16]);
+
+ rsp->OutputBufferLength = cpu_to_le32(out_buf_len);
+ inc_rfc1001_len(rsp_org, out_buf_len);
+ return out_buf_len;
}
static int smb2_get_info_file(struct ksmbd_work *work,
@@ -4860,8 +4876,8 @@ static int smb2_get_info_file(struct ksmbd_work *work,
pr_err("client doesn't negotiate with SMB3.1.1 POSIX Extensions\n");
rc = -EOPNOTSUPP;
} else {
- rc = find_file_posix_info(rsp, fp, work->response_buf);
- file_infoclass_size = sizeof(struct smb311_posix_qinfo);
+ file_infoclass_size = find_file_posix_info(rsp, fp,
+ work->response_buf);
}
break;
default:
@@ -5413,7 +5429,7 @@ static int smb2_rename(struct ksmbd_work *work,
if (!pathname)
return -ENOMEM;
- abs_oldname = d_path(&fp->filp->f_path, pathname, PATH_MAX);
+ abs_oldname = file_path(fp->filp, pathname, PATH_MAX);
if (IS_ERR(abs_oldname)) {
rc = -EINVAL;
goto out;
@@ -5548,7 +5564,7 @@ static int smb2_create_link(struct ksmbd_work *work,
}
ksmbd_debug(SMB, "link name is %s\n", link_name);
- target_name = d_path(&filp->f_path, pathname, PATH_MAX);
+ target_name = file_path(filp, pathname, PATH_MAX);
if (IS_ERR(target_name)) {
rc = -EINVAL;
goto out;
@@ -6266,8 +6282,8 @@ int smb2_read(struct ksmbd_work *work)
goto out;
}
- ksmbd_debug(SMB, "filename %pd, offset %lld, len %zu\n",
- fp->filp->f_path.dentry, offset, length);
+ ksmbd_debug(SMB, "filename %pD, offset %lld, len %zu\n",
+ fp->filp, offset, length);
work->aux_payload_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO);
if (!work->aux_payload_buf) {
@@ -6531,8 +6547,8 @@ int smb2_write(struct ksmbd_work *work)
data_buf = (char *)(((char *)&req->hdr.ProtocolId) +
le16_to_cpu(req->DataOffset));
- ksmbd_debug(SMB, "filename %pd, offset %lld, len %zu\n",
- fp->filp->f_path.dentry, offset, length);
+ ksmbd_debug(SMB, "filename %pD, offset %lld, len %zu\n",
+ fp->filp, offset, length);
err = ksmbd_vfs_write(work, fp, data_buf, length, &offset,
writethrough, &nbytes);
if (err < 0)
@@ -6737,7 +6753,7 @@ static int smb2_set_flock_flags(struct file_lock *flock, int flags)
case SMB2_LOCKFLAG_UNLOCK:
ksmbd_debug(SMB, "received unlock request\n");
flock->fl_type = F_UNLCK;
- cmd = 0;
+ cmd = F_SETLK;
break;
}
@@ -6841,6 +6857,7 @@ int smb2_lock(struct ksmbd_work *work)
if (lock_start > U64_MAX - lock_length) {
pr_err("Invalid lock range requested\n");
rsp->hdr.Status = STATUS_INVALID_LOCK_RANGE;
+ locks_free_lock(flock);
goto out;
}
@@ -6860,6 +6877,7 @@ int smb2_lock(struct ksmbd_work *work)
"the end offset(%llx) is smaller than the start offset(%llx)\n",
flock->fl_end, flock->fl_start);
rsp->hdr.Status = STATUS_INVALID_LOCK_RANGE;
+ locks_free_lock(flock);
goto out;
}
@@ -6871,6 +6889,7 @@ int smb2_lock(struct ksmbd_work *work)
flock->fl_type != F_UNLCK) {
pr_err("conflict two locks in one request\n");
err = -EINVAL;
+ locks_free_lock(flock);
goto out;
}
}
@@ -6879,6 +6898,7 @@ int smb2_lock(struct ksmbd_work *work)
smb_lock = smb2_lock_init(flock, cmd, flags, &lock_list);
if (!smb_lock) {
err = -EINVAL;
+ locks_free_lock(flock);
goto out;
}
}
@@ -7115,7 +7135,7 @@ out:
rlock->fl_start = smb_lock->start;
rlock->fl_end = smb_lock->end;
- rc = vfs_lock_file(filp, 0, rlock, NULL);
+ rc = vfs_lock_file(filp, F_SETLK, rlock, NULL);
if (rc)
pr_err("rollback unlock fail : %d\n", rc);
@@ -7643,11 +7663,16 @@ int smb2_ioctl(struct ksmbd_work *work)
goto out;
}
- if (in_buf_len < sizeof(struct validate_negotiate_info_req))
- return -EINVAL;
+ if (in_buf_len < offsetof(struct validate_negotiate_info_req,
+ Dialects)) {
+ ret = -EINVAL;
+ goto out;
+ }
- if (out_buf_len < sizeof(struct validate_negotiate_info_rsp))
- return -EINVAL;
+ if (out_buf_len < sizeof(struct validate_negotiate_info_rsp)) {
+ ret = -EINVAL;
+ goto out;
+ }
ret = fsctl_validate_negotiate_info(conn,
(struct validate_negotiate_info_req *)&req->Buffer[0],
@@ -8573,7 +8598,7 @@ int smb3_encrypt_resp(struct ksmbd_work *work)
buf_size += iov[1].iov_len;
work->resp_hdr_sz = iov[1].iov_len;
- rc = ksmbd_crypt_message(work->conn, iov, rq_nvec, 1);
+ rc = ksmbd_crypt_message(work, iov, rq_nvec, 1);
if (rc)
return rc;
@@ -8592,7 +8617,6 @@ bool smb3_is_transform_hdr(void *buf)
int smb3_decrypt_req(struct ksmbd_work *work)
{
- struct ksmbd_conn *conn = work->conn;
struct ksmbd_session *sess;
char *buf = work->request_buf;
unsigned int pdu_length = get_rfc1002_len(buf);
@@ -8612,7 +8636,7 @@ int smb3_decrypt_req(struct ksmbd_work *work)
return -ECONNABORTED;
}
- sess = ksmbd_session_lookup_all(conn, le64_to_cpu(tr_hdr->SessionId));
+ sess = ksmbd_session_lookup_all(work->conn, le64_to_cpu(tr_hdr->SessionId));
if (!sess) {
pr_err("invalid session id(%llx) in transform header\n",
le64_to_cpu(tr_hdr->SessionId));
@@ -8623,7 +8647,7 @@ int smb3_decrypt_req(struct ksmbd_work *work)
iov[0].iov_len = sizeof(struct smb2_transform_hdr) + 4;
iov[1].iov_base = buf + sizeof(struct smb2_transform_hdr) + 4;
iov[1].iov_len = buf_data_size;
- rc = ksmbd_crypt_message(conn, iov, 2, 0);
+ rc = ksmbd_crypt_message(work, iov, 2, 0);
if (rc)
return rc;
diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h
index af455278d005..aa5dbe54f5a1 100644
--- a/fs/ksmbd/smb2pdu.h
+++ b/fs/ksmbd/smb2pdu.h
@@ -158,7 +158,8 @@ struct create_posix_rsp {
__le32 nlink;
__le32 reparse_tag;
__le32 mode;
- u8 SidBuffer[40];
+ /* SidBuffer contain two sids(Domain sid(28), UNIX group sid(16)) */
+ u8 SidBuffer[44];
} __packed;
struct smb2_buffer_desc_v1 {
@@ -439,9 +440,10 @@ struct smb2_posix_info {
__le32 HardLinks;
__le32 ReparseTag;
__le32 Mode;
- u8 SidBuffer[40];
+ /* SidBuffer contain two sids (UNIX user sid(16), UNIX group sid(16)) */
+ u8 SidBuffer[32];
__le32 name_len;
- u8 name[1];
+ u8 name[];
/*
* var sized owner SID
* var sized group SID
@@ -492,6 +494,7 @@ int smb3_decrypt_req(struct ksmbd_work *work);
int smb3_encrypt_resp(struct ksmbd_work *work);
bool smb3_11_final_sess_setup_resp(struct ksmbd_work *work);
int smb2_set_rsp_credits(struct ksmbd_work *work);
+bool smb3_encryption_negotiated(struct ksmbd_conn *conn);
/* smb2 misc functions */
int ksmbd_smb2_check_message(struct ksmbd_work *work);
diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c
index 7f8ab14fb8ec..2a4fbbd55b91 100644
--- a/fs/ksmbd/smb_common.c
+++ b/fs/ksmbd/smb_common.c
@@ -4,6 +4,8 @@
* Copyright (C) 2018 Namjae Jeon <linkinjeon@kernel.org>
*/
+#include <linux/user_namespace.h>
+
#include "smb_common.h"
#include "server.h"
#include "misc.h"
@@ -621,12 +623,12 @@ int ksmbd_override_fsids(struct ksmbd_work *work)
if (share->force_gid != KSMBD_SHARE_INVALID_GID)
gid = share->force_gid;
- cred = prepare_kernel_cred(NULL);
+ cred = prepare_kernel_cred(&init_task);
if (!cred)
return -ENOMEM;
- cred->fsuid = make_kuid(current_user_ns(), uid);
- cred->fsgid = make_kgid(current_user_ns(), gid);
+ cred->fsuid = make_kuid(&init_user_ns, uid);
+ cred->fsgid = make_kgid(&init_user_ns, gid);
gi = groups_alloc(0);
if (!gi) {
diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h
index 318c16fa81da..e663ab9ea759 100644
--- a/fs/ksmbd/smb_common.h
+++ b/fs/ksmbd/smb_common.h
@@ -277,14 +277,14 @@ struct file_directory_info {
__le64 AllocationSize;
__le32 ExtFileAttributes;
__le32 FileNameLength;
- char FileName[1];
+ char FileName[];
} __packed; /* level 0x101 FF resp data */
struct file_names_info {
__le32 NextEntryOffset;
__u32 FileIndex;
__le32 FileNameLength;
- char FileName[1];
+ char FileName[];
} __packed; /* level 0xc FF resp data */
struct file_full_directory_info {
@@ -299,7 +299,7 @@ struct file_full_directory_info {
__le32 ExtFileAttributes;
__le32 FileNameLength;
__le32 EaSize;
- char FileName[1];
+ char FileName[];
} __packed; /* level 0x102 FF resp */
struct file_both_directory_info {
@@ -317,7 +317,7 @@ struct file_both_directory_info {
__u8 ShortNameLength;
__u8 Reserved;
__u8 ShortName[24];
- char FileName[1];
+ char FileName[];
} __packed; /* level 0x104 FFrsp data */
struct file_id_both_directory_info {
@@ -337,7 +337,7 @@ struct file_id_both_directory_info {
__u8 ShortName[24];
__le16 Reserved2;
__le64 UniqueId;
- char FileName[1];
+ char FileName[];
} __packed;
struct file_id_full_dir_info {
@@ -354,7 +354,7 @@ struct file_id_full_dir_info {
__le32 EaSize; /* EA size */
__le32 Reserved;
__le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/
- char FileName[1];
+ char FileName[];
} __packed; /* level 0x105 FF rsp data */
struct smb_version_values {
diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c
index 3781bca2c8fc..ab5c68cc0e13 100644
--- a/fs/ksmbd/smbacl.c
+++ b/fs/ksmbd/smbacl.c
@@ -275,7 +275,8 @@ static int sid_to_id(struct user_namespace *user_ns,
uid_t id;
id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]);
- uid = mapped_kuid_user(user_ns, &init_user_ns, KUIDT_INIT(id));
+ uid = KUIDT_INIT(id);
+ uid = from_vfsuid(user_ns, &init_user_ns, VFSUIDT_INIT(uid));
if (uid_valid(uid)) {
fattr->cf_uid = uid;
rc = 0;
@@ -285,7 +286,8 @@ static int sid_to_id(struct user_namespace *user_ns,
gid_t id;
id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]);
- gid = mapped_kgid_user(user_ns, &init_user_ns, KGIDT_INIT(id));
+ gid = KGIDT_INIT(id);
+ gid = from_vfsgid(user_ns, &init_user_ns, VFSGIDT_INIT(gid));
if (gid_valid(gid)) {
fattr->cf_gid = gid;
rc = 0;
@@ -991,7 +993,7 @@ static void smb_set_ace(struct smb_ace *ace, const struct smb_sid *sid, u8 type,
}
int smb_inherit_dacl(struct ksmbd_conn *conn,
- struct path *path,
+ const struct path *path,
unsigned int uid, unsigned int gid)
{
const struct smb_sid *psid, *creator = NULL;
@@ -1185,7 +1187,7 @@ bool smb_inherit_flags(int flags, bool is_dir)
return false;
}
-int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path,
+int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path,
__le32 *pdaccess, int uid)
{
struct user_namespace *user_ns = mnt_user_ns(path->mnt);
@@ -1287,7 +1289,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path,
}
if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) {
- posix_acls = get_acl(d_inode(path->dentry), ACL_TYPE_ACCESS);
+ posix_acls = get_inode_acl(d_inode(path->dentry), ACL_TYPE_ACCESS);
if (posix_acls && !found) {
unsigned int id = -1;
@@ -1352,7 +1354,7 @@ err_out:
}
int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
- struct path *path, struct smb_ntsd *pntsd, int ntsd_len,
+ const struct path *path, struct smb_ntsd *pntsd, int ntsd_len,
bool type_check)
{
int rc;
@@ -1384,14 +1386,14 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
ksmbd_vfs_remove_acl_xattrs(user_ns, path->dentry);
/* Update posix acls */
if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && fattr.cf_dacls) {
- rc = set_posix_acl(user_ns, inode,
+ rc = set_posix_acl(user_ns, path->dentry,
ACL_TYPE_ACCESS, fattr.cf_acls);
if (rc < 0)
ksmbd_debug(SMB,
"Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n",
rc);
if (S_ISDIR(inode->i_mode) && fattr.cf_dacls) {
- rc = set_posix_acl(user_ns, inode,
+ rc = set_posix_acl(user_ns, path->dentry,
ACL_TYPE_DEFAULT, fattr.cf_dacls);
if (rc)
ksmbd_debug(SMB,
diff --git a/fs/ksmbd/smbacl.h b/fs/ksmbd/smbacl.h
index fcb2c83f2992..618f2e0236b3 100644
--- a/fs/ksmbd/smbacl.h
+++ b/fs/ksmbd/smbacl.h
@@ -201,12 +201,12 @@ void posix_state_to_acl(struct posix_acl_state *state,
struct posix_acl_entry *pace);
int compare_sids(const struct smb_sid *ctsid, const struct smb_sid *cwsid);
bool smb_inherit_flags(int flags, bool is_dir);
-int smb_inherit_dacl(struct ksmbd_conn *conn, struct path *path,
+int smb_inherit_dacl(struct ksmbd_conn *conn, const struct path *path,
unsigned int uid, unsigned int gid);
-int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path,
+int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path,
__le32 *pdaccess, int uid);
int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
- struct path *path, struct smb_ntsd *pntsd, int ntsd_len,
+ const struct path *path, struct smb_ntsd *pntsd, int ntsd_len,
bool type_check);
void id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid);
void ksmbd_init_domain(u32 *sub_auth);
@@ -214,25 +214,25 @@ void ksmbd_init_domain(u32 *sub_auth);
static inline uid_t posix_acl_uid_translate(struct user_namespace *mnt_userns,
struct posix_acl_entry *pace)
{
- kuid_t kuid;
+ vfsuid_t vfsuid;
/* If this is an idmapped mount, apply the idmapping. */
- kuid = mapped_kuid_fs(mnt_userns, &init_user_ns, pace->e_uid);
+ vfsuid = make_vfsuid(mnt_userns, &init_user_ns, pace->e_uid);
/* Translate the kuid into a userspace id ksmbd would see. */
- return from_kuid(&init_user_ns, kuid);
+ return from_kuid(&init_user_ns, vfsuid_into_kuid(vfsuid));
}
static inline gid_t posix_acl_gid_translate(struct user_namespace *mnt_userns,
struct posix_acl_entry *pace)
{
- kgid_t kgid;
+ vfsgid_t vfsgid;
/* If this is an idmapped mount, apply the idmapping. */
- kgid = mapped_kgid_fs(mnt_userns, &init_user_ns, pace->e_gid);
+ vfsgid = make_vfsgid(mnt_userns, &init_user_ns, pace->e_gid);
/* Translate the kgid into a userspace id ksmbd would see. */
- return from_kgid(&init_user_ns, kgid);
+ return from_kgid(&init_user_ns, vfsgid_into_kgid(vfsgid));
}
#endif /* _SMBACL_H */
diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c
index 7cb0eeb07c80..c9aca21637d5 100644
--- a/fs/ksmbd/transport_ipc.c
+++ b/fs/ksmbd/transport_ipc.c
@@ -197,6 +197,7 @@ static struct genl_family ksmbd_genl_family = {
.module = THIS_MODULE,
.ops = ksmbd_genl_ops,
.n_ops = ARRAY_SIZE(ksmbd_genl_ops),
+ .resv_start_op = KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE + 1,
};
static void ksmbd_nl_init_fixup(void)
diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c
index 35b55ee94fe5..096eda9ef873 100644
--- a/fs/ksmbd/transport_rdma.c
+++ b/fs/ksmbd/transport_rdma.c
@@ -32,7 +32,7 @@
/* SMB_DIRECT negotiation timeout in seconds */
#define SMB_DIRECT_NEGOTIATE_TIMEOUT 120
-#define SMB_DIRECT_MAX_SEND_SGES 8
+#define SMB_DIRECT_MAX_SEND_SGES 6
#define SMB_DIRECT_MAX_RECV_SGES 1
/*
@@ -62,13 +62,13 @@ static int smb_direct_receive_credit_max = 255;
static int smb_direct_send_credit_target = 255;
/* The maximum single message size can be sent to remote peer */
-static int smb_direct_max_send_size = 8192;
+static int smb_direct_max_send_size = 1364;
/* The maximum fragmented upper-layer payload receive size supported */
static int smb_direct_max_fragmented_recv_size = 1024 * 1024;
/* The maximum single-message size which can be received */
-static int smb_direct_max_receive_size = 8192;
+static int smb_direct_max_receive_size = 1364;
static int smb_direct_max_read_write_size = SMBD_DEFAULT_IOSIZE;
@@ -1527,6 +1527,8 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id,
}
case RDMA_CM_EVENT_DEVICE_REMOVAL:
case RDMA_CM_EVENT_DISCONNECTED: {
+ ib_drain_qp(t->qp);
+
t->status = SMB_DIRECT_CS_DISCONNECTED;
wake_up_interruptible(&t->wait_status);
wake_up_interruptible(&t->wait_reassembly_queue);
diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c
index 143bba4e4db8..63d55f543bd2 100644
--- a/fs/ksmbd/transport_tcp.c
+++ b/fs/ksmbd/transport_tcp.c
@@ -399,7 +399,8 @@ static int create_socket(struct interface *iface)
ret = sock_create(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &ksmbd_socket);
if (ret) {
- pr_err("Can't create socket for ipv6, try ipv4: %d\n", ret);
+ if (ret != -EAFNOSUPPORT)
+ pr_err("Can't create socket for ipv6, fallback to ipv4: %d\n", ret);
ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP,
&ksmbd_socket);
if (ret) {
diff --git a/fs/ksmbd/unicode.h b/fs/ksmbd/unicode.h
index 5593024230ae..076f6034a789 100644
--- a/fs/ksmbd/unicode.h
+++ b/fs/ksmbd/unicode.h
@@ -24,6 +24,7 @@
#include <asm/byteorder.h>
#include <linux/types.h>
#include <linux/nls.h>
+#include <linux/unicode.h>
#define UNIUPR_NOLOWER /* Example to not expand lower case tables */
@@ -69,7 +70,7 @@ char *smb_strndup_from_utf16(const char *src, const int maxlen,
const struct nls_table *codepage);
int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
const struct nls_table *cp, int mapchars);
-char *ksmbd_extract_sharename(char *treename);
+char *ksmbd_extract_sharename(struct unicode_map *um, const char *treename);
#endif
/*
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index 78d01033604c..ff0e7a4fcd4d 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -321,7 +321,7 @@ static int check_lock_range(struct file *filp, loff_t start, loff_t end,
unsigned char type)
{
struct file_lock *flock;
- struct file_lock_context *ctx = file_inode(filp)->i_flctx;
+ struct file_lock_context *ctx = locks_inode_context(file_inode(filp));
int error = 0;
if (!ctx || list_empty_careful(&ctx->flc_posix))
@@ -377,8 +377,7 @@ int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, size_t count,
if (work->conn->connection_type) {
if (!(fp->daccess & (FILE_READ_DATA_LE | FILE_EXECUTE_LE))) {
- pr_err("no right to read(%pd)\n",
- fp->filp->f_path.dentry);
+ pr_err("no right to read(%pD)\n", fp->filp);
return -EACCES;
}
}
@@ -487,8 +486,7 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp,
if (work->conn->connection_type) {
if (!(fp->daccess & FILE_WRITE_DATA_LE)) {
- pr_err("no right to write(%pd)\n",
- fp->filp->f_path.dentry);
+ pr_err("no right to write(%pD)\n", fp->filp);
err = -EACCES;
goto out;
}
@@ -527,8 +525,8 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp,
if (sync) {
err = vfs_fsync_range(filp, offset, offset + *written, 0);
if (err < 0)
- pr_err("fsync failed for filename = %pd, err = %d\n",
- fp->filp->f_path.dentry, err);
+ pr_err("fsync failed for filename = %pD, err = %d\n",
+ fp->filp, err);
}
out:
@@ -543,7 +541,7 @@ out:
*
* Return: 0 on success, otherwise error
*/
-int ksmbd_vfs_getattr(struct path *path, struct kstat *stat)
+int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat)
{
int err;
@@ -1105,7 +1103,7 @@ int ksmbd_vfs_unlink(struct user_namespace *user_ns,
return err;
}
-static int __dir_empty(struct dir_context *ctx, const char *name, int namlen,
+static bool __dir_empty(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct ksmbd_readdir_data *buf;
@@ -1113,9 +1111,7 @@ static int __dir_empty(struct dir_context *ctx, const char *name, int namlen,
buf = container_of(ctx, struct ksmbd_readdir_data, ctx);
buf->dirent_count++;
- if (buf->dirent_count > 2)
- return -ENOTEMPTY;
- return 0;
+ return buf->dirent_count <= 2;
}
/**
@@ -1142,22 +1138,33 @@ int ksmbd_vfs_empty_dir(struct ksmbd_file *fp)
return err;
}
-static int __caseless_lookup(struct dir_context *ctx, const char *name,
+static bool __caseless_lookup(struct dir_context *ctx, const char *name,
int namlen, loff_t offset, u64 ino,
unsigned int d_type)
{
struct ksmbd_readdir_data *buf;
+ int cmp = -EINVAL;
buf = container_of(ctx, struct ksmbd_readdir_data, ctx);
if (buf->used != namlen)
- return 0;
- if (!strncasecmp((char *)buf->private, name, namlen)) {
+ return true;
+ if (IS_ENABLED(CONFIG_UNICODE) && buf->um) {
+ const struct qstr q_buf = {.name = buf->private,
+ .len = buf->used};
+ const struct qstr q_name = {.name = name,
+ .len = namlen};
+
+ cmp = utf8_strncasecmp(buf->um, &q_buf, &q_name);
+ }
+ if (cmp < 0)
+ cmp = strncasecmp((char *)buf->private, name, namlen);
+ if (!cmp) {
memcpy((char *)buf->private, name, namlen);
buf->dirent_count = 1;
- return -EEXIST;
+ return false;
}
- return 0;
+ return true;
}
/**
@@ -1168,7 +1175,8 @@ static int __caseless_lookup(struct dir_context *ctx, const char *name,
*
* Return: 0 on success, otherwise error
*/
-static int ksmbd_vfs_lookup_in_dir(struct path *dir, char *name, size_t namelen)
+static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name,
+ size_t namelen, struct unicode_map *um)
{
int ret;
struct file *dfilp;
@@ -1178,6 +1186,7 @@ static int ksmbd_vfs_lookup_in_dir(struct path *dir, char *name, size_t namelen)
.private = name,
.used = namelen,
.dirent_count = 0,
+ .um = um,
};
dfilp = dentry_open(dir, flags, current_cred());
@@ -1240,7 +1249,8 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name,
break;
err = ksmbd_vfs_lookup_in_dir(&parent, filename,
- filename_len);
+ filename_len,
+ work->conn->um);
path_put(&parent);
if (err)
goto out;
@@ -1311,7 +1321,7 @@ int ksmbd_vfs_remove_acl_xattrs(struct user_namespace *user_ns,
sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1) ||
!strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT,
sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1)) {
- err = ksmbd_vfs_remove_xattr(user_ns, dentry, name);
+ err = vfs_remove_acl(user_ns, dentry, name);
if (err)
ksmbd_debug(SMB,
"remove acl xattr failed : %s\n", name);
@@ -1365,7 +1375,7 @@ static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct user_namespac
if (!IS_ENABLED(CONFIG_FS_POSIX_ACL))
return NULL;
- posix_acls = get_acl(inode, acl_type);
+ posix_acls = get_inode_acl(inode, acl_type);
if (!posix_acls)
return NULL;
@@ -1743,11 +1753,11 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work,
*total_size_written = 0;
if (!(src_fp->daccess & (FILE_READ_DATA_LE | FILE_EXECUTE_LE))) {
- pr_err("no right to read(%pd)\n", src_fp->filp->f_path.dentry);
+ pr_err("no right to read(%pD)\n", src_fp->filp);
return -EACCES;
}
if (!(dst_fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE))) {
- pr_err("no right to write(%pd)\n", dst_fp->filp->f_path.dentry);
+ pr_err("no right to write(%pD)\n", dst_fp->filp);
return -EACCES;
}
@@ -1784,9 +1794,9 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work,
ret = vfs_copy_file_range(src_fp->filp, src_off,
dst_fp->filp, dst_off, len, 0);
if (ret == -EOPNOTSUPP || ret == -EXDEV)
- ret = generic_copy_file_range(src_fp->filp, src_off,
- dst_fp->filp, dst_off,
- len, 0);
+ ret = vfs_copy_file_range(src_fp->filp, src_off,
+ dst_fp->filp, dst_off, len,
+ COPY_FILE_SPLICE);
if (ret < 0)
return ret;
@@ -1814,10 +1824,11 @@ void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock)
}
int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns,
- struct inode *inode)
+ struct dentry *dentry)
{
struct posix_acl_state acl_state;
struct posix_acl *acls;
+ struct inode *inode = d_inode(dentry);
int rc;
if (!IS_ENABLED(CONFIG_FS_POSIX_ACL))
@@ -1846,14 +1857,13 @@ int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns,
return -ENOMEM;
}
posix_state_to_acl(&acl_state, acls->a_entries);
- rc = set_posix_acl(user_ns, inode, ACL_TYPE_ACCESS, acls);
+ rc = set_posix_acl(user_ns, dentry, ACL_TYPE_ACCESS, acls);
if (rc < 0)
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n",
rc);
else if (S_ISDIR(inode->i_mode)) {
posix_state_to_acl(&acl_state, acls->a_entries);
- rc = set_posix_acl(user_ns, inode, ACL_TYPE_DEFAULT,
- acls);
+ rc = set_posix_acl(user_ns, dentry, ACL_TYPE_DEFAULT, acls);
if (rc < 0)
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n",
rc);
@@ -1864,16 +1874,17 @@ int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns,
}
int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns,
- struct inode *inode, struct inode *parent_inode)
+ struct dentry *dentry, struct inode *parent_inode)
{
struct posix_acl *acls;
struct posix_acl_entry *pace;
+ struct inode *inode = d_inode(dentry);
int rc, i;
if (!IS_ENABLED(CONFIG_FS_POSIX_ACL))
return -EOPNOTSUPP;
- acls = get_acl(parent_inode, ACL_TYPE_DEFAULT);
+ acls = get_inode_acl(parent_inode, ACL_TYPE_DEFAULT);
if (!acls)
return -ENOENT;
pace = acls->a_entries;
@@ -1885,12 +1896,12 @@ int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns,
}
}
- rc = set_posix_acl(user_ns, inode, ACL_TYPE_ACCESS, acls);
+ rc = set_posix_acl(user_ns, dentry, ACL_TYPE_ACCESS, acls);
if (rc < 0)
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n",
rc);
if (S_ISDIR(inode->i_mode)) {
- rc = set_posix_acl(user_ns, inode, ACL_TYPE_DEFAULT,
+ rc = set_posix_acl(user_ns, dentry, ACL_TYPE_DEFAULT,
acls);
if (rc < 0)
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n",
diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h
index 70da4c0ba7ad..0d73d735cc39 100644
--- a/fs/ksmbd/vfs.h
+++ b/fs/ksmbd/vfs.h
@@ -12,6 +12,7 @@
#include <linux/namei.h>
#include <uapi/linux/xattr.h>
#include <linux/posix_acl.h>
+#include <linux/unicode.h>
#include "smbacl.h"
#include "xattr.h"
@@ -60,6 +61,7 @@ struct ksmbd_readdir_data {
unsigned int used;
unsigned int dirent_count;
unsigned int file_attr;
+ struct unicode_map *um;
};
/* ksmbd kstat wrapper to get valid create time when reading dir entry */
@@ -85,7 +87,7 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id);
int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name);
int ksmbd_vfs_link(struct ksmbd_work *work,
const char *oldname, const char *newname);
-int ksmbd_vfs_getattr(struct path *path, struct kstat *stat);
+int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat);
int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
char *newname);
int ksmbd_vfs_truncate(struct ksmbd_work *work,
@@ -158,8 +160,8 @@ int ksmbd_vfs_get_dos_attrib_xattr(struct user_namespace *user_ns,
struct dentry *dentry,
struct xattr_dos_attrib *da);
int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns,
- struct inode *inode);
+ struct dentry *dentry);
int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns,
- struct inode *inode,
+ struct dentry *dentry,
struct inode *parent_inode);
#endif /* __KSMBD_VFS_H__ */
diff --git a/fs/libfs.c b/fs/libfs.c
index 31b0ddf01c31..aada4e7c8713 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -15,6 +15,7 @@
#include <linux/mutex.h>
#include <linux/namei.h>
#include <linux/exportfs.h>
+#include <linux/iversion.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h> /* sync_mapping_buffers */
#include <linux/fs_context.h>
@@ -994,8 +995,8 @@ out:
EXPORT_SYMBOL_GPL(simple_attr_read);
/* interpret the buffer as a number to call the set function with */
-ssize_t simple_attr_write(struct file *file, const char __user *buf,
- size_t len, loff_t *ppos)
+static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos, bool is_signed)
{
struct simple_attr *attr;
unsigned long long val;
@@ -1016,7 +1017,10 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
goto out;
attr->set_buf[size] = '\0';
- ret = kstrtoull(attr->set_buf, 0, &val);
+ if (is_signed)
+ ret = kstrtoll(attr->set_buf, 0, &val);
+ else
+ ret = kstrtoull(attr->set_buf, 0, &val);
if (ret)
goto out;
ret = attr->set(attr->data, val);
@@ -1026,8 +1030,21 @@ out:
mutex_unlock(&attr->mutex);
return ret;
}
+
+ssize_t simple_attr_write(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ return simple_attr_write_xsigned(file, buf, len, ppos, false);
+}
EXPORT_SYMBOL_GPL(simple_attr_write);
+ssize_t simple_attr_write_signed(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ return simple_attr_write_xsigned(file, buf, len, ppos, true);
+}
+EXPORT_SYMBOL_GPL(simple_attr_write_signed);
+
/**
* generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
* @sb: filesystem to do the file handle conversion on
@@ -1520,3 +1537,48 @@ void generic_set_encrypted_ci_d_ops(struct dentry *dentry)
#endif
}
EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops);
+
+/**
+ * inode_maybe_inc_iversion - increments i_version
+ * @inode: inode with the i_version that should be updated
+ * @force: increment the counter even if it's not necessary?
+ *
+ * Every time the inode is modified, the i_version field must be seen to have
+ * changed by any observer.
+ *
+ * If "force" is set or the QUERIED flag is set, then ensure that we increment
+ * the value, and clear the queried flag.
+ *
+ * In the common case where neither is set, then we can return "false" without
+ * updating i_version.
+ *
+ * If this function returns false, and no other metadata has changed, then we
+ * can avoid logging the metadata.
+ */
+bool inode_maybe_inc_iversion(struct inode *inode, bool force)
+{
+ u64 cur, new;
+
+ /*
+ * The i_version field is not strictly ordered with any other inode
+ * information, but the legacy inode_inc_iversion code used a spinlock
+ * to serialize increments.
+ *
+ * Here, we add full memory barriers to ensure that any de-facto
+ * ordering with other info is preserved.
+ *
+ * This barrier pairs with the barrier in inode_query_iversion()
+ */
+ smp_mb();
+ cur = inode_peek_iversion_raw(inode);
+ do {
+ /* If flag is clear then we needn't do anything */
+ if (!force && !(cur & I_VERSION_QUERIED))
+ return false;
+
+ /* Since lowest bit is flag, add 2 to avoid it */
+ new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT;
+ } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new));
+ return true;
+}
+EXPORT_SYMBOL(inode_maybe_inc_iversion);
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index f802223e71ab..cdc8e12cdac4 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -164,7 +164,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
host->h_addrbuf = nsm->sm_addrbuf;
host->net = ni->net;
host->h_cred = get_cred(ni->cred);
- strlcpy(host->nodename, utsname()->nodename, sizeof(host->nodename));
+ strscpy(host->nodename, utsname()->nodename, sizeof(host->nodename));
out:
return host;
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index bf274f23969b..b72023a6b4c1 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -52,6 +52,7 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
*filp = file;
/* Set up the missing parts of the file_lock structure */
+ lock->fl.fl_flags = FL_POSIX;
lock->fl.fl_file = file->f_file[mode];
lock->fl.fl_pid = current->tgid;
lock->fl.fl_start = (loff_t)lock->lock_start;
@@ -521,6 +522,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_void,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_void),
+ .pc_argzero = sizeof(struct nlm_void),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "NULL",
@@ -530,6 +532,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_testargs,
.pc_encode = nlm4svc_encode_testres,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St+2+No+Rg,
.pc_name = "TEST",
@@ -539,6 +542,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_lockargs,
.pc_encode = nlm4svc_encode_res,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
.pc_name = "LOCK",
@@ -548,6 +552,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_cancargs,
.pc_encode = nlm4svc_encode_res,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
.pc_name = "CANCEL",
@@ -557,6 +562,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_unlockargs,
.pc_encode = nlm4svc_encode_res,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
.pc_name = "UNLOCK",
@@ -566,6 +572,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_testargs,
.pc_encode = nlm4svc_encode_res,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
.pc_name = "GRANTED",
@@ -575,6 +582,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_testargs,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "TEST_MSG",
@@ -584,6 +592,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_lockargs,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "LOCK_MSG",
@@ -593,6 +602,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_cancargs,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "CANCEL_MSG",
@@ -602,6 +612,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_unlockargs,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "UNLOCK_MSG",
@@ -611,6 +622,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_testargs,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "GRANTED_MSG",
@@ -620,6 +632,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_void,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_res),
+ .pc_argzero = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "TEST_RES",
@@ -629,6 +642,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_void,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_res),
+ .pc_argzero = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "LOCK_RES",
@@ -638,6 +652,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_void,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_res),
+ .pc_argzero = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "CANCEL_RES",
@@ -647,6 +662,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_void,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_res),
+ .pc_argzero = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "UNLOCK_RES",
@@ -656,6 +672,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_res,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_res),
+ .pc_argzero = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "GRANTED_RES",
@@ -665,6 +682,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_reboot,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_reboot),
+ .pc_argzero = sizeof(struct nlm_reboot),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "SM_NOTIFY",
@@ -674,6 +692,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_void,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_void),
+ .pc_argzero = sizeof(struct nlm_void),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = 0,
.pc_name = "UNUSED",
@@ -683,6 +702,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_void,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_void),
+ .pc_argzero = sizeof(struct nlm_void),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = 0,
.pc_name = "UNUSED",
@@ -692,6 +712,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_void,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_void),
+ .pc_argzero = sizeof(struct nlm_void),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = 0,
.pc_name = "UNUSED",
@@ -701,6 +722,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_shareargs,
.pc_encode = nlm4svc_encode_shareres,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St+1,
.pc_name = "SHARE",
@@ -710,6 +732,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_shareargs,
.pc_encode = nlm4svc_encode_shareres,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St+1,
.pc_name = "UNSHARE",
@@ -719,6 +742,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_lockargs,
.pc_encode = nlm4svc_encode_res,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
.pc_name = "NM_LOCK",
@@ -728,6 +752,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_notify,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "FREE_ALL",
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 9c1aa75441e1..4e30f3c50970 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -659,11 +659,13 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock)
nlmsvc_cancel_blocked(net, file, lock);
lock->fl.fl_type = F_UNLCK;
- if (file->f_file[O_RDONLY])
- error = vfs_lock_file(file->f_file[O_RDONLY], F_SETLK,
+ lock->fl.fl_file = file->f_file[O_RDONLY];
+ if (lock->fl.fl_file)
+ error = vfs_lock_file(lock->fl.fl_file, F_SETLK,
&lock->fl, NULL);
- if (file->f_file[O_WRONLY])
- error = vfs_lock_file(file->f_file[O_WRONLY], F_SETLK,
+ lock->fl.fl_file = file->f_file[O_WRONLY];
+ if (lock->fl.fl_file)
+ error |= vfs_lock_file(lock->fl.fl_file, F_SETLK,
&lock->fl, NULL);
return (error < 0)? nlm_lck_denied_nolocks : nlm_granted;
@@ -697,9 +699,10 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l
block = nlmsvc_lookup_block(file, lock);
mutex_unlock(&file->f_mutex);
if (block != NULL) {
- mode = lock_to_openmode(&lock->fl);
- vfs_cancel_lock(block->b_file->f_file[mode],
- &block->b_call->a_args.lock.fl);
+ struct file_lock *fl = &block->b_call->a_args.lock.fl;
+
+ mode = lock_to_openmode(fl);
+ vfs_cancel_lock(block->b_file->f_file[mode], fl);
status = nlmsvc_unlink_block(block);
nlmsvc_release_block(block);
}
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index b09ca35b527c..32784f508c81 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -77,6 +77,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Set up the missing parts of the file_lock structure */
mode = lock_to_openmode(&lock->fl);
+ lock->fl.fl_flags = FL_POSIX;
lock->fl.fl_file = file->f_file[mode];
lock->fl.fl_pid = current->tgid;
lock->fl.fl_lmops = &nlmsvc_lock_operations;
@@ -555,6 +556,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_void,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_void),
+ .pc_argzero = sizeof(struct nlm_void),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "NULL",
@@ -564,6 +566,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_testargs,
.pc_encode = nlmsvc_encode_testres,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St+2+No+Rg,
.pc_name = "TEST",
@@ -573,6 +576,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_lockargs,
.pc_encode = nlmsvc_encode_res,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
.pc_name = "LOCK",
@@ -582,6 +586,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_cancargs,
.pc_encode = nlmsvc_encode_res,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
.pc_name = "CANCEL",
@@ -591,6 +596,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_unlockargs,
.pc_encode = nlmsvc_encode_res,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
.pc_name = "UNLOCK",
@@ -600,6 +606,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_testargs,
.pc_encode = nlmsvc_encode_res,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
.pc_name = "GRANTED",
@@ -609,6 +616,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_testargs,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "TEST_MSG",
@@ -618,6 +626,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_lockargs,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "LOCK_MSG",
@@ -627,6 +636,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_cancargs,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "CANCEL_MSG",
@@ -636,6 +646,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_unlockargs,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "UNLOCK_MSG",
@@ -645,6 +656,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_testargs,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "GRANTED_MSG",
@@ -654,6 +666,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_void,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_res),
+ .pc_argzero = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "TEST_RES",
@@ -663,6 +676,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_void,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_res),
+ .pc_argzero = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "LOCK_RES",
@@ -672,6 +686,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_void,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_res),
+ .pc_argzero = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "CANCEL_RES",
@@ -681,6 +696,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_void,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_res),
+ .pc_argzero = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "UNLOCK_RES",
@@ -690,6 +706,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_res,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_res),
+ .pc_argzero = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "GRANTED_RES",
@@ -699,6 +716,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_reboot,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_reboot),
+ .pc_argzero = sizeof(struct nlm_reboot),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "SM_NOTIFY",
@@ -708,6 +726,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_void,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_void),
+ .pc_argzero = sizeof(struct nlm_void),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "UNUSED",
@@ -717,6 +736,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_void,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_void),
+ .pc_argzero = sizeof(struct nlm_void),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "UNUSED",
@@ -726,6 +746,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_void,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_void),
+ .pc_argzero = sizeof(struct nlm_void),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "UNUSED",
@@ -735,6 +756,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_shareargs,
.pc_encode = nlmsvc_encode_shareres,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St+1,
.pc_name = "SHARE",
@@ -744,6 +766,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_shareargs,
.pc_encode = nlmsvc_encode_shareres,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St+1,
.pc_name = "UNSHARE",
@@ -753,6 +776,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_lockargs,
.pc_encode = nlmsvc_encode_res,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
.pc_name = "NM_LOCK",
@@ -762,6 +786,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_notify,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = 0,
.pc_name = "FREE_ALL",
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index e1c4617de771..e3b6229e7ae5 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -176,7 +176,7 @@ nlm_delete_file(struct nlm_file *file)
}
}
-static int nlm_unlock_files(struct nlm_file *file, fl_owner_t owner)
+static int nlm_unlock_files(struct nlm_file *file, const struct file_lock *fl)
{
struct file_lock lock;
@@ -184,12 +184,15 @@ static int nlm_unlock_files(struct nlm_file *file, fl_owner_t owner)
lock.fl_type = F_UNLCK;
lock.fl_start = 0;
lock.fl_end = OFFSET_MAX;
- lock.fl_owner = owner;
- if (file->f_file[O_RDONLY] &&
- vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, &lock, NULL))
+ lock.fl_owner = fl->fl_owner;
+ lock.fl_pid = fl->fl_pid;
+ lock.fl_flags = FL_POSIX;
+
+ lock.fl_file = file->f_file[O_RDONLY];
+ if (lock.fl_file && vfs_lock_file(lock.fl_file, F_SETLK, &lock, NULL))
goto out_err;
- if (file->f_file[O_WRONLY] &&
- vfs_lock_file(file->f_file[O_WRONLY], F_SETLK, &lock, NULL))
+ lock.fl_file = file->f_file[O_WRONLY];
+ if (lock.fl_file && vfs_lock_file(lock.fl_file, F_SETLK, &lock, NULL))
goto out_err;
return 0;
out_err:
@@ -207,7 +210,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
{
struct inode *inode = nlmsvc_file_inode(file);
struct file_lock *fl;
- struct file_lock_context *flctx = inode->i_flctx;
+ struct file_lock_context *flctx = locks_inode_context(inode);
struct nlm_host *lockhost;
if (!flctx || list_empty_careful(&flctx->flc_posix))
@@ -226,7 +229,7 @@ again:
if (match(lockhost, host)) {
spin_unlock(&flctx->flc_lock);
- if (nlm_unlock_files(file, fl->fl_owner))
+ if (nlm_unlock_files(file, fl))
return 1;
goto again;
}
@@ -262,7 +265,7 @@ nlm_file_inuse(struct nlm_file *file)
{
struct inode *inode = nlmsvc_file_inode(file);
struct file_lock *fl;
- struct file_lock_context *flctx = inode->i_flctx;
+ struct file_lock_context *flctx = locks_inode_context(inode);
if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
return 1;
diff --git a/fs/locks.c b/fs/locks.c
index 607f94a0e789..8f01bee17715 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -175,7 +175,7 @@ locks_get_lock_context(struct inode *inode, int type)
struct file_lock_context *ctx;
/* paired with cmpxchg() below */
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
if (likely(ctx) || type == F_UNLCK)
goto out;
@@ -194,7 +194,7 @@ locks_get_lock_context(struct inode *inode, int type)
*/
if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
kmem_cache_free(flctx_cache, ctx);
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
}
out:
trace_locks_get_lock_context(inode, type, ctx);
@@ -247,7 +247,7 @@ locks_check_ctx_file_list(struct file *filp, struct list_head *list,
void
locks_free_lock_context(struct inode *inode)
{
- struct file_lock_context *ctx = inode->i_flctx;
+ struct file_lock_context *ctx = locks_inode_context(inode);
if (unlikely(ctx)) {
locks_check_ctx_lists(inode);
@@ -891,7 +891,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
void *owner;
void (*func)(void);
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
if (!ctx || list_empty_careful(&ctx->flc_posix)) {
fl->fl_type = F_UNLCK;
return;
@@ -1483,7 +1483,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
new_fl->fl_flags = type;
/* typically we will check that ctx is non-NULL before calling */
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
if (!ctx) {
WARN_ON_ONCE(1);
goto free_lock;
@@ -1588,7 +1588,7 @@ void lease_get_mtime(struct inode *inode, struct timespec64 *time)
struct file_lock_context *ctx;
struct file_lock *fl;
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
if (ctx && !list_empty_careful(&ctx->flc_lease)) {
spin_lock(&ctx->flc_lock);
fl = list_first_entry_or_null(&ctx->flc_lease,
@@ -1634,7 +1634,7 @@ int fcntl_getlease(struct file *filp)
int type = F_UNLCK;
LIST_HEAD(dispose);
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
if (ctx && !list_empty_careful(&ctx->flc_lease)) {
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
@@ -1823,7 +1823,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
struct file_lock_context *ctx;
LIST_HEAD(dispose);
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
if (!ctx) {
trace_generic_delete_lease(inode, NULL);
return error;
@@ -2096,7 +2096,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
* throw a warning to let people know that they don't actually work.
*/
if (cmd & LOCK_MAND) {
- pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n");
+ pr_warn_once("%s(%d): Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n", current->comm, current->pid);
return 0;
}
@@ -2146,6 +2146,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
*/
int vfs_test_lock(struct file *filp, struct file_lock *fl)
{
+ WARN_ON_ONCE(filp != fl->fl_file);
if (filp->f_op->lock)
return filp->f_op->lock(filp, F_GETLK, fl);
posix_test_lock(filp, fl);
@@ -2295,6 +2296,7 @@ out:
*/
int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf)
{
+ WARN_ON_ONCE(filp != fl->fl_file);
if (filp->f_op->lock)
return filp->f_op->lock(filp, cmd, fl);
else
@@ -2561,7 +2563,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
* posix_lock_file(). Another process could be setting a lock on this
* file at the same time, but we wouldn't remove that lock anyway.
*/
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
if (!ctx || list_empty(&ctx->flc_posix))
return;
@@ -2634,7 +2636,7 @@ void locks_remove_file(struct file *filp)
{
struct file_lock_context *ctx;
- ctx = smp_load_acquire(&locks_inode(filp)->i_flctx);
+ ctx = locks_inode_context(locks_inode(filp));
if (!ctx)
return;
@@ -2663,12 +2665,36 @@ void locks_remove_file(struct file *filp)
*/
int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
{
+ WARN_ON_ONCE(filp != fl->fl_file);
if (filp->f_op->lock)
return filp->f_op->lock(filp, F_CANCELLK, fl);
return 0;
}
EXPORT_SYMBOL_GPL(vfs_cancel_lock);
+/**
+ * vfs_inode_has_locks - are any file locks held on @inode?
+ * @inode: inode to check for locks
+ *
+ * Return true if there are any FL_POSIX or FL_FLOCK locks currently
+ * set on @inode.
+ */
+bool vfs_inode_has_locks(struct inode *inode)
+{
+ struct file_lock_context *ctx;
+ bool ret;
+
+ ctx = locks_inode_context(inode);
+ if (!ctx)
+ return false;
+
+ spin_lock(&ctx->flc_lock);
+ ret = !list_empty(&ctx->flc_posix) || !list_empty(&ctx->flc_flock);
+ spin_unlock(&ctx->flc_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(vfs_inode_has_locks);
+
#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -2839,7 +2865,7 @@ void show_fd_locks(struct seq_file *f,
struct file_lock_context *ctx;
int id = 0;
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
if (!ctx)
return;
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 47ccfcbe0a22..2a4b8b549e93 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -90,12 +90,19 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
return -ENOMEM;
INIT_LIST_HEAD(&entry->e_list);
- /* Initial hash reference */
- atomic_set(&entry->e_refcnt, 1);
+ /*
+ * We create entry with two references. One reference is kept by the
+ * hash table, the other reference is used to protect us from
+ * mb_cache_entry_delete_or_get() until the entry is fully setup. This
+ * avoids nesting of cache->c_list_lock into hash table bit locks which
+ * is problematic for RT.
+ */
+ atomic_set(&entry->e_refcnt, 2);
entry->e_key = key;
entry->e_value = value;
- entry->e_reusable = reusable;
- entry->e_referenced = 0;
+ entry->e_flags = 0;
+ if (reusable)
+ set_bit(MBE_REUSABLE_B, &entry->e_flags);
head = mb_cache_entry_head(cache, key);
hlist_bl_lock(head);
hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
@@ -106,15 +113,12 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
}
}
hlist_bl_add_head(&entry->e_hash_list, head);
- /*
- * Add entry to LRU list before it can be found by
- * mb_cache_entry_delete() to avoid races
- */
+ hlist_bl_unlock(head);
spin_lock(&cache->c_list_lock);
list_add_tail(&entry->e_list, &cache->c_list);
cache->c_entry_count++;
spin_unlock(&cache->c_list_lock);
- hlist_bl_unlock(head);
+ mb_cache_entry_put(cache, entry);
return 0;
}
@@ -162,7 +166,8 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
while (node) {
entry = hlist_bl_entry(node, struct mb_cache_entry,
e_hash_list);
- if (entry->e_key == key && entry->e_reusable &&
+ if (entry->e_key == key &&
+ test_bit(MBE_REUSABLE_B, &entry->e_flags) &&
atomic_inc_not_zero(&entry->e_refcnt))
goto out;
node = node->next;
@@ -281,7 +286,7 @@ EXPORT_SYMBOL(mb_cache_entry_delete_or_get);
void mb_cache_entry_touch(struct mb_cache *cache,
struct mb_cache_entry *entry)
{
- entry->e_referenced = 1;
+ set_bit(MBE_REFERENCED_B, &entry->e_flags);
}
EXPORT_SYMBOL(mb_cache_entry_touch);
@@ -306,9 +311,9 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
entry = list_first_entry(&cache->c_list,
struct mb_cache_entry, e_list);
/* Drop initial hash reference if there is no user */
- if (entry->e_referenced ||
+ if (test_bit(MBE_REFERENCED_B, &entry->e_flags) ||
atomic_cmpxchg(&entry->e_refcnt, 1, 0) != 1) {
- entry->e_referenced = 0;
+ clear_bit(MBE_REFERENCED_B, &entry->e_flags);
list_move_tail(&entry->e_list, &cache->c_list);
continue;
}
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 937fa5fae2b8..8afdc408ca4f 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -53,16 +53,16 @@ static int minix_mknod(struct user_namespace *mnt_userns, struct inode *dir,
}
static int minix_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+ struct file *file, umode_t mode)
{
int error;
struct inode *inode = minix_new_inode(dir, mode, &error);
if (inode) {
minix_set_inode(inode, 0);
mark_inode_dirty(inode);
- d_tmpfile(dentry, inode);
+ d_tmpfile(file, inode);
}
- return error;
+ return finish_open_simple(file, error);
}
static int minix_create(struct user_namespace *mnt_userns, struct inode *dir,
diff --git a/fs/namei.c b/fs/namei.c
index 53b4bc094db2..309ae6fc8c99 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -297,13 +297,13 @@ static int check_acl(struct user_namespace *mnt_userns,
acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
if (!acl)
return -EAGAIN;
- /* no ->get_acl() calls in RCU mode... */
+ /* no ->get_inode_acl() calls in RCU mode... */
if (is_uncached_acl(acl))
return -ECHILD;
return posix_acl_permission(mnt_userns, inode, acl, mask);
}
- acl = get_acl(inode, ACL_TYPE_ACCESS);
+ acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (acl) {
@@ -336,11 +336,11 @@ static int acl_permission_check(struct user_namespace *mnt_userns,
struct inode *inode, int mask)
{
unsigned int mode = inode->i_mode;
- kuid_t i_uid;
+ vfsuid_t vfsuid;
/* Are we the owner? If so, ACL's don't matter */
- i_uid = i_uid_into_mnt(mnt_userns, inode);
- if (likely(uid_eq(current_fsuid(), i_uid))) {
+ vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) {
mask &= 7;
mode >>= 6;
return (mask & ~mode) ? -EACCES : 0;
@@ -362,8 +362,8 @@ static int acl_permission_check(struct user_namespace *mnt_userns,
* about? Need to check group ownership if so.
*/
if (mask & (mode ^ (mode >> 3))) {
- kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
- if (in_group_p(kgid))
+ vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+ if (vfsgid_in_group_p(vfsgid))
mode >>= 3;
}
@@ -581,7 +581,7 @@ struct nameidata {
struct nameidata *saved;
unsigned root_seq;
int dfd;
- kuid_t dir_uid;
+ vfsuid_t dir_vfsuid;
umode_t dir_mode;
} __randomize_layout;
@@ -986,7 +986,7 @@ static int nd_jump_root(struct nameidata *nd)
* Helper to directly jump to a known parsed path from ->get_link,
* caller must have taken a reference to path beforehand.
*/
-int nd_jump_link(struct path *path)
+int nd_jump_link(const struct path *path)
{
int error = -ELOOP;
struct nameidata *nd = current->nameidata;
@@ -1095,15 +1095,15 @@ fs_initcall(init_fs_namei_sysctls);
static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
{
struct user_namespace *mnt_userns;
- kuid_t i_uid;
+ vfsuid_t vfsuid;
if (!sysctl_protected_symlinks)
return 0;
mnt_userns = mnt_user_ns(nd->path.mnt);
- i_uid = i_uid_into_mnt(mnt_userns, inode);
+ vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
/* Allowed if owner and follower match. */
- if (uid_eq(current_cred()->fsuid, i_uid))
+ if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
return 0;
/* Allowed if parent directory not sticky and world-writable. */
@@ -1111,7 +1111,7 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod
return 0;
/* Allowed if parent directory and link owner match. */
- if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, i_uid))
+ if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid))
return 0;
if (nd->flags & LOOKUP_RCU)
@@ -1178,13 +1178,13 @@ static bool safe_hardlink_source(struct user_namespace *mnt_userns,
*
* Returns 0 if successful, -ve on error.
*/
-int may_linkat(struct user_namespace *mnt_userns, struct path *link)
+int may_linkat(struct user_namespace *mnt_userns, const struct path *link)
{
struct inode *inode = link->dentry->d_inode;
/* Inode writeback is not safe when the uid or gid are invalid. */
- if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
- !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
+ if (!vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) ||
+ !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode)))
return -EOVERFLOW;
if (!sysctl_protected_hardlinks)
@@ -1232,13 +1232,13 @@ static int may_create_in_sticky(struct user_namespace *mnt_userns,
struct nameidata *nd, struct inode *const inode)
{
umode_t dir_mode = nd->dir_mode;
- kuid_t dir_uid = nd->dir_uid;
+ vfsuid_t dir_vfsuid = nd->dir_vfsuid;
if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
(!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
likely(!(dir_mode & S_ISVTX)) ||
- uid_eq(i_uid_into_mnt(mnt_userns, inode), dir_uid) ||
- uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)))
+ vfsuid_eq(i_uid_into_vfsuid(mnt_userns, inode), dir_vfsuid) ||
+ vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), current_fsuid()))
return 0;
if (likely(dir_mode & 0002) ||
@@ -2307,7 +2307,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
OK:
/* pathname or trailing symlink, done */
if (!depth) {
- nd->dir_uid = i_uid_into_mnt(mnt_userns, nd->inode);
+ nd->dir_vfsuid = i_uid_into_vfsuid(mnt_userns, nd->inode);
nd->dir_mode = nd->inode->i_mode;
nd->flags &= ~LOOKUP_PARENT;
return 0;
@@ -2885,9 +2885,9 @@ int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir,
{
kuid_t fsuid = current_fsuid();
- if (uid_eq(i_uid_into_mnt(mnt_userns, inode), fsuid))
+ if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), fsuid))
return 0;
- if (uid_eq(i_uid_into_mnt(mnt_userns, dir), fsuid))
+ if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, dir), fsuid))
return 0;
return !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FOWNER);
}
@@ -2926,8 +2926,8 @@ static int may_delete(struct user_namespace *mnt_userns, struct inode *dir,
BUG_ON(victim->d_parent->d_inode != dir);
/* Inode writeback is not safe when the uid or gid are invalid. */
- if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
- !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
+ if (!vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) ||
+ !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode)))
return -EOVERFLOW;
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
@@ -3211,7 +3211,7 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp)
if (error)
return error;
- error = security_path_truncate(path);
+ error = security_file_truncate(filp);
if (!error) {
error = do_truncate(mnt_userns, path->dentry, 0,
ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
@@ -3583,72 +3583,95 @@ static int do_open(struct nameidata *nd,
* On non-idmapped mounts or if permission checking is to be performed on the
* raw inode simply passs init_user_ns.
*/
-struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
- struct dentry *dentry, umode_t mode, int open_flag)
+static int vfs_tmpfile(struct user_namespace *mnt_userns,
+ const struct path *parentpath,
+ struct file *file, umode_t mode)
{
- struct dentry *child = NULL;
- struct inode *dir = dentry->d_inode;
+ struct dentry *child;
+ struct inode *dir = d_inode(parentpath->dentry);
struct inode *inode;
int error;
+ int open_flag = file->f_flags;
/* we want directory to be writable */
error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
if (error)
- goto out_err;
- error = -EOPNOTSUPP;
+ return error;
if (!dir->i_op->tmpfile)
- goto out_err;
- error = -ENOMEM;
- child = d_alloc(dentry, &slash_name);
+ return -EOPNOTSUPP;
+ child = d_alloc(parentpath->dentry, &slash_name);
if (unlikely(!child))
- goto out_err;
+ return -ENOMEM;
+ file->f_path.mnt = parentpath->mnt;
+ file->f_path.dentry = child;
mode = vfs_prepare_mode(mnt_userns, dir, mode, mode, mode);
- error = dir->i_op->tmpfile(mnt_userns, dir, child, mode);
+ error = dir->i_op->tmpfile(mnt_userns, dir, file, mode);
+ dput(child);
if (error)
- goto out_err;
- error = -ENOENT;
- inode = child->d_inode;
- if (unlikely(!inode))
- goto out_err;
+ return error;
+ /* Don't check for other permissions, the inode was just created */
+ error = may_open(mnt_userns, &file->f_path, 0, file->f_flags);
+ if (error)
+ return error;
+ inode = file_inode(file);
if (!(open_flag & O_EXCL)) {
spin_lock(&inode->i_lock);
inode->i_state |= I_LINKABLE;
spin_unlock(&inode->i_lock);
}
ima_post_create_tmpfile(mnt_userns, inode);
- return child;
+ return 0;
+}
-out_err:
- dput(child);
- return ERR_PTR(error);
+/**
+ * vfs_tmpfile_open - open a tmpfile for kernel internal use
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @parentpath: path of the base directory
+ * @mode: mode of the new tmpfile
+ * @open_flag: flags
+ * @cred: credentials for open
+ *
+ * Create and open a temporary file. The file is not accounted in nr_files,
+ * hence this is only for kernel internal use, and must not be installed into
+ * file tables or such.
+ */
+struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns,
+ const struct path *parentpath,
+ umode_t mode, int open_flag, const struct cred *cred)
+{
+ struct file *file;
+ int error;
+
+ file = alloc_empty_file_noaccount(open_flag, cred);
+ if (!IS_ERR(file)) {
+ error = vfs_tmpfile(mnt_userns, parentpath, file, mode);
+ if (error) {
+ fput(file);
+ file = ERR_PTR(error);
+ }
+ }
+ return file;
}
-EXPORT_SYMBOL(vfs_tmpfile);
+EXPORT_SYMBOL(vfs_tmpfile_open);
static int do_tmpfile(struct nameidata *nd, unsigned flags,
const struct open_flags *op,
struct file *file)
{
struct user_namespace *mnt_userns;
- struct dentry *child;
struct path path;
int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
+
if (unlikely(error))
return error;
error = mnt_want_write(path.mnt);
if (unlikely(error))
goto out;
mnt_userns = mnt_user_ns(path.mnt);
- child = vfs_tmpfile(mnt_userns, path.dentry, op->mode, op->open_flag);
- error = PTR_ERR(child);
- if (IS_ERR(child))
+ error = vfs_tmpfile(mnt_userns, &path, file, op->mode);
+ if (error)
goto out2;
- dput(path.dentry);
- path.dentry = child;
- audit_inode(nd->name, child, 0);
- /* Don't check for other permissions, the inode was just created */
- error = may_open(mnt_userns, &path, 0, op->open_flag);
- if (!error)
- error = vfs_open(&path, file);
+ audit_inode(nd->name, file->f_path.dentry, 0);
out2:
mnt_drop_write(path.mnt);
out:
@@ -5088,7 +5111,7 @@ int page_symlink(struct inode *inode, const char *symname, int len)
const struct address_space_operations *aops = mapping->a_ops;
bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS);
struct page *page;
- void *fsdata;
+ void *fsdata = NULL;
int err;
unsigned int flags;
diff --git a/fs/namespace.c b/fs/namespace.c
index df137ba19d37..ab467ee58341 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -75,6 +75,22 @@ static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
+struct mnt_idmap {
+ struct user_namespace *owner;
+ refcount_t count;
+};
+
+/*
+ * Carries the initial idmapping of 0:0:4294967295 which is an identity
+ * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is
+ * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...].
+ */
+struct mnt_idmap nop_mnt_idmap = {
+ .owner = &init_user_ns,
+ .count = REFCOUNT_INIT(1),
+};
+EXPORT_SYMBOL_GPL(nop_mnt_idmap);
+
struct mount_kattr {
unsigned int attr_set;
unsigned int attr_clr;
@@ -82,6 +98,7 @@ struct mount_kattr {
unsigned int lookup_flags;
bool recurse;
struct user_namespace *mnt_userns;
+ struct mnt_idmap *mnt_idmap;
};
/* /sys/fs */
@@ -193,6 +210,104 @@ int mnt_get_count(struct mount *mnt)
#endif
}
+/**
+ * mnt_idmap_owner - retrieve owner of the mount's idmapping
+ * @idmap: mount idmapping
+ *
+ * This helper will go away once the conversion to use struct mnt_idmap
+ * everywhere has finished at which point the helper will be unexported.
+ *
+ * Only code that needs to perform permission checks based on the owner of the
+ * idmapping will get access to it. All other code will solely rely on
+ * idmappings. This will get us type safety so it's impossible to conflate
+ * filesystems idmappings with mount idmappings.
+ *
+ * Return: The owner of the idmapping.
+ */
+struct user_namespace *mnt_idmap_owner(const struct mnt_idmap *idmap)
+{
+ return idmap->owner;
+}
+EXPORT_SYMBOL_GPL(mnt_idmap_owner);
+
+/**
+ * mnt_user_ns - retrieve owner of an idmapped mount
+ * @mnt: the relevant vfsmount
+ *
+ * This helper will go away once the conversion to use struct mnt_idmap
+ * everywhere has finished at which point the helper will be unexported.
+ *
+ * Only code that needs to perform permission checks based on the owner of the
+ * idmapping will get access to it. All other code will solely rely on
+ * idmappings. This will get us type safety so it's impossible to conflate
+ * filesystems idmappings with mount idmappings.
+ *
+ * Return: The owner of the idmapped.
+ */
+struct user_namespace *mnt_user_ns(const struct vfsmount *mnt)
+{
+ struct mnt_idmap *idmap = mnt_idmap(mnt);
+
+ /* Return the actual owner of the filesystem instead of the nop. */
+ if (idmap == &nop_mnt_idmap &&
+ !initial_idmapping(mnt->mnt_sb->s_user_ns))
+ return mnt->mnt_sb->s_user_ns;
+ return mnt_idmap_owner(idmap);
+}
+EXPORT_SYMBOL_GPL(mnt_user_ns);
+
+/**
+ * alloc_mnt_idmap - allocate a new idmapping for the mount
+ * @mnt_userns: owning userns of the idmapping
+ *
+ * Allocate a new struct mnt_idmap which carries the idmapping of the mount.
+ *
+ * Return: On success a new idmap, on error an error pointer is returned.
+ */
+static struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns)
+{
+ struct mnt_idmap *idmap;
+
+ idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT);
+ if (!idmap)
+ return ERR_PTR(-ENOMEM);
+
+ idmap->owner = get_user_ns(mnt_userns);
+ refcount_set(&idmap->count, 1);
+ return idmap;
+}
+
+/**
+ * mnt_idmap_get - get a reference to an idmapping
+ * @idmap: the idmap to bump the reference on
+ *
+ * If @idmap is not the @nop_mnt_idmap bump the reference count.
+ *
+ * Return: @idmap with reference count bumped if @not_mnt_idmap isn't passed.
+ */
+static inline struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap)
+{
+ if (idmap != &nop_mnt_idmap)
+ refcount_inc(&idmap->count);
+
+ return idmap;
+}
+
+/**
+ * mnt_idmap_put - put a reference to an idmapping
+ * @idmap: the idmap to put the reference on
+ *
+ * If this is a non-initial idmapping, put the reference count when a mount is
+ * released and free it if we're the last user.
+ */
+static inline void mnt_idmap_put(struct mnt_idmap *idmap)
+{
+ if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) {
+ put_user_ns(idmap->owner);
+ kfree(idmap);
+ }
+}
+
static struct mount *alloc_vfsmnt(const char *name)
{
struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -232,7 +347,7 @@ static struct mount *alloc_vfsmnt(const char *name)
INIT_HLIST_NODE(&mnt->mnt_mp_list);
INIT_LIST_HEAD(&mnt->mnt_umounting);
INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
- mnt->mnt.mnt_userns = &init_user_ns;
+ mnt->mnt.mnt_idmap = &nop_mnt_idmap;
}
return mnt;
@@ -602,11 +717,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
static void free_vfsmnt(struct mount *mnt)
{
- struct user_namespace *mnt_userns;
-
- mnt_userns = mnt_user_ns(&mnt->mnt);
- if (!initial_idmapping(mnt_userns))
- put_user_ns(mnt_userns);
+ mnt_idmap_put(mnt_idmap(&mnt->mnt));
kfree_const(mnt->mnt_devname);
#ifdef CONFIG_SMP
free_percpu(mnt->mnt_pcp);
@@ -1009,7 +1120,6 @@ static struct mount *skip_mnt_tree(struct mount *p)
struct vfsmount *vfs_create_mount(struct fs_context *fc)
{
struct mount *mnt;
- struct user_namespace *fs_userns;
if (!fc->root)
return ERR_PTR(-EINVAL);
@@ -1027,10 +1137,6 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc)
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
mnt->mnt_parent = mnt;
- fs_userns = mnt->mnt.mnt_sb->s_user_ns;
- if (!initial_idmapping(fs_userns))
- mnt->mnt.mnt_userns = get_user_ns(fs_userns);
-
lock_mount_hash();
list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
unlock_mount_hash();
@@ -1120,9 +1226,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
atomic_inc(&sb->s_active);
- mnt->mnt.mnt_userns = mnt_user_ns(&old->mnt);
- if (!initial_idmapping(mnt->mnt.mnt_userns))
- mnt->mnt.mnt_userns = get_user_ns(mnt->mnt.mnt_userns);
+ mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
+
mnt->mnt.mnt_sb = sb;
mnt->mnt.mnt_root = dget(root);
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
@@ -3515,8 +3620,9 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
q = next_mnt(q, new);
if (!q)
break;
+ // an mntns binding we'd skipped?
while (p->mnt.mnt_root != q->mnt.mnt_root)
- p = next_mnt(p, old);
+ p = next_mnt(skip_mnt_tree(p), old);
}
namespace_unlock();
@@ -3981,14 +4087,14 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
struct vfsmount *m = &mnt->mnt;
struct user_namespace *fs_userns = m->mnt_sb->s_user_ns;
- if (!kattr->mnt_userns)
+ if (!kattr->mnt_idmap)
return 0;
/*
* Creating an idmapped mount with the filesystem wide idmapping
* doesn't make sense so block that. We don't allow mushy semantics.
*/
- if (kattr->mnt_userns == fs_userns)
+ if (mnt_idmap_owner(kattr->mnt_idmap) == fs_userns)
return -EINVAL;
/*
@@ -4028,7 +4134,7 @@ static inline bool mnt_allow_writers(const struct mount_kattr *kattr,
{
return (!(kattr->attr_set & MNT_READONLY) ||
(mnt->mnt.mnt_flags & MNT_READONLY)) &&
- !kattr->mnt_userns;
+ !kattr->mnt_idmap;
}
static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
@@ -4082,27 +4188,18 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
{
- struct user_namespace *mnt_userns, *old_mnt_userns;
-
- if (!kattr->mnt_userns)
+ if (!kattr->mnt_idmap)
return;
/*
- * We're the only ones able to change the mount's idmapping. So
- * mnt->mnt.mnt_userns is stable and we can retrieve it directly.
- */
- old_mnt_userns = mnt->mnt.mnt_userns;
-
- mnt_userns = get_user_ns(kattr->mnt_userns);
- /* Pairs with smp_load_acquire() in mnt_user_ns(). */
- smp_store_release(&mnt->mnt.mnt_userns, mnt_userns);
-
- /*
- * If this is an idmapped filesystem drop the reference we've taken
- * in vfs_create_mount() before.
+ * Pairs with smp_load_acquire() in mnt_idmap().
+ *
+ * Since we only allow a mount to change the idmapping once and
+ * verified this in can_idmap_mount() we know that the mount has
+ * @nop_mnt_idmap attached to it. So there's no need to drop any
+ * references.
*/
- if (!initial_idmapping(old_mnt_userns))
- put_user_ns(old_mnt_userns);
+ smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap));
}
static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
@@ -4136,6 +4233,15 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
if (path->dentry != mnt->mnt.mnt_root)
return -EINVAL;
+ if (kattr->mnt_userns) {
+ struct mnt_idmap *mnt_idmap;
+
+ mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns);
+ if (IS_ERR(mnt_idmap))
+ return PTR_ERR(mnt_idmap);
+ kattr->mnt_idmap = mnt_idmap;
+ }
+
if (kattr->propagation) {
/*
* Only take namespace_lock() if we're actually changing
@@ -4323,6 +4429,9 @@ static void finish_mount_kattr(struct mount_kattr *kattr)
{
put_user_ns(kattr->mnt_userns);
kattr->mnt_userns = NULL;
+
+ if (kattr->mnt_idmap)
+ mnt_idmap_put(kattr->mnt_idmap);
}
SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 0ce535852151..7679a68e8193 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -17,9 +17,9 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
struct folio *folio;
- unsigned int iopos, account = 0;
pgoff_t start_page = rreq->start / PAGE_SIZE;
pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
+ size_t account = 0;
bool subreq_failed = false;
XA_STATE(xas, &rreq->mapping->i_pages, start_page);
@@ -39,18 +39,23 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
*/
subreq = list_first_entry(&rreq->subrequests,
struct netfs_io_subrequest, rreq_link);
- iopos = 0;
subreq_failed = (subreq->error < 0);
trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);
rcu_read_lock();
xas_for_each(&xas, folio, last_page) {
- unsigned int pgpos = (folio_index(folio) - start_page) * PAGE_SIZE;
- unsigned int pgend = pgpos + folio_size(folio);
+ loff_t pg_end;
bool pg_failed = false;
+ if (xas_retry(&xas, folio))
+ continue;
+
+ pg_end = folio_pos(folio) + folio_size(folio) - 1;
+
for (;;) {
+ loff_t sreq_end;
+
if (!subreq) {
pg_failed = true;
break;
@@ -58,11 +63,11 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
folio_start_fscache(folio);
pg_failed |= subreq_failed;
- if (pgend < iopos + subreq->len)
+ sreq_end = subreq->start + subreq->len - 1;
+ if (pg_end < sreq_end)
break;
account += subreq->transferred;
- iopos += subreq->len;
if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
subreq = list_next_entry(subreq, rreq_link);
subreq_failed = (subreq->error < 0);
@@ -70,7 +75,8 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
subreq = NULL;
subreq_failed = false;
}
- if (pgend == iopos)
+
+ if (pg_end == sreq_end)
break;
}
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index 428925899282..7f753380e047 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -23,7 +23,7 @@ static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
{
struct iov_iter iter;
- iov_iter_xarray(&iter, READ, &subreq->rreq->mapping->i_pages,
+ iov_iter_xarray(&iter, ITER_DEST, &subreq->rreq->mapping->i_pages,
subreq->start + subreq->transferred,
subreq->len - subreq->transferred);
iov_iter_zero(iov_iter_count(&iter), &iter);
@@ -49,7 +49,7 @@ static void netfs_read_from_cache(struct netfs_io_request *rreq,
struct iov_iter iter;
netfs_stat(&netfs_n_rh_read);
- iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages,
+ iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages,
subreq->start + subreq->transferred,
subreq->len - subreq->transferred);
@@ -121,6 +121,9 @@ static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq,
XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE);
xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) {
+ if (xas_retry(&xas, folio))
+ continue;
+
/* We might have multiple writes from the same huge
* folio, but we mustn't unlock a folio more than once.
*/
@@ -205,7 +208,7 @@ static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq)
continue;
}
- iov_iter_xarray(&iter, WRITE, &rreq->mapping->i_pages,
+ iov_iter_xarray(&iter, ITER_SOURCE, &rreq->mapping->i_pages,
subreq->start, subreq->len);
atomic_inc(&rreq->nr_copy_ops);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 14a72224b657..1ead5bd740c2 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -209,8 +209,8 @@ config NFS_DISABLE_UDP_SUPPORT
config NFS_V4_2_READ_PLUS
bool "NFS: Enable support for the NFSv4.2 READ_PLUS operation"
depends on NFS_V4_2
- default n
+ default y
help
- This is intended for developers only. The READ_PLUS operation has
- been shown to have issues under specific conditions and should not
- be used in production.
+ Choose Y here to enable the use of READ_PLUS over NFS v4.2. READ_PLUS
+ attempts to improve read performance by compressing out sparse holes
+ in the file contents.
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 8dcb08e1a885..d0cccddb7d08 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -1065,6 +1065,7 @@ static const struct svc_procedure nfs4_callback_procedures1[] = {
.pc_func = nfs4_callback_compound,
.pc_encode = nfs4_encode_void,
.pc_argsize = 256,
+ .pc_argzero = 256,
.pc_ressize = 256,
.pc_xdrressize = NFS4_CALLBACK_BUFSIZE,
.pc_name = "COMPOUND",
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index da8da5cdbbc1..f50e025ae406 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -280,7 +280,7 @@ EXPORT_SYMBOL_GPL(nfs_put_client);
static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data)
{
struct nfs_client *clp;
- const struct sockaddr *sap = data->addr;
+ const struct sockaddr *sap = (struct sockaddr *)data->addr;
struct nfs_net *nn = net_generic(data->net, nfs_net_id);
int error;
@@ -666,7 +666,7 @@ static int nfs_init_server(struct nfs_server *server,
struct rpc_timeout timeparms;
struct nfs_client_initdata cl_init = {
.hostname = ctx->nfs_server.hostname,
- .addr = (const struct sockaddr *)&ctx->nfs_server.address,
+ .addr = &ctx->nfs_server._address,
.addrlen = ctx->nfs_server.addrlen,
.nfs_mod = ctx->nfs_mod,
.proto = ctx->nfs_server.protocol,
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 5c97cad741a7..cf7365581031 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -146,7 +146,7 @@ static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_state
{
struct inode *inode = state->inode;
struct file_lock *fl;
- struct file_lock_context *flctx = inode->i_flctx;
+ struct file_lock_context *flctx = locks_inode_context(inode);
struct list_head *list;
int status = 0;
@@ -228,8 +228,7 @@ again:
*
*/
void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred,
- fmode_t type,
- const nfs4_stateid *stateid,
+ fmode_t type, const nfs4_stateid *stateid,
unsigned long pagemod_limit)
{
struct nfs_delegation *delegation;
@@ -239,25 +238,24 @@ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred,
delegation = rcu_dereference(NFS_I(inode)->delegation);
if (delegation != NULL) {
spin_lock(&delegation->lock);
- if (nfs4_is_valid_delegation(delegation, 0)) {
- nfs4_stateid_copy(&delegation->stateid, stateid);
- delegation->type = type;
- delegation->pagemod_limit = pagemod_limit;
- oldcred = delegation->cred;
- delegation->cred = get_cred(cred);
- clear_bit(NFS_DELEGATION_NEED_RECLAIM,
- &delegation->flags);
- spin_unlock(&delegation->lock);
- rcu_read_unlock();
- put_cred(oldcred);
- trace_nfs4_reclaim_delegation(inode, type);
- return;
- }
- /* We appear to have raced with a delegation return. */
+ nfs4_stateid_copy(&delegation->stateid, stateid);
+ delegation->type = type;
+ delegation->pagemod_limit = pagemod_limit;
+ oldcred = delegation->cred;
+ delegation->cred = get_cred(cred);
+ clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+ if (test_and_clear_bit(NFS_DELEGATION_REVOKED,
+ &delegation->flags))
+ atomic_long_inc(&nfs_active_delegations);
spin_unlock(&delegation->lock);
+ rcu_read_unlock();
+ put_cred(oldcred);
+ trace_nfs4_reclaim_delegation(inode, type);
+ } else {
+ rcu_read_unlock();
+ nfs_inode_set_delegation(inode, cred, type, stateid,
+ pagemod_limit);
}
- rcu_read_unlock();
- nfs_inode_set_delegation(inode, cred, type, stateid, pagemod_limit);
}
static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 5d6c2ddc7ea6..ea1ceffa1d3a 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1074,6 +1074,8 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc)
return res;
}
+#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL)
+
/*
* Once we've found the start of the dirent within a page: fill 'er up...
*/
@@ -1083,6 +1085,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
struct file *file = desc->file;
struct nfs_cache_array *array;
unsigned int i;
+ bool first_emit = !desc->dir_cookie;
array = kmap_local_page(desc->page);
for (i = desc->cache_entry_index; i < array->size; i++) {
@@ -1106,6 +1109,10 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
desc->ctx->pos = desc->dir_cookie;
else
desc->ctx->pos++;
+ if (first_emit && i > NFS_READDIR_CACHE_MISS_THRESHOLD + 1) {
+ desc->eob = true;
+ break;
+ }
}
if (array->page_is_eof)
desc->eof = !desc->eob;
@@ -1187,8 +1194,6 @@ out:
return status;
}
-#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL)
-
static bool nfs_readdir_handle_cache_misses(struct inode *inode,
struct nfs_readdir_descriptor *desc,
unsigned int cache_misses,
@@ -2022,7 +2027,7 @@ static int nfs_finish_open(struct nfs_open_context *ctx,
err = finish_open(file, dentry, do_open);
if (err)
goto out;
- if (S_ISREG(file->f_path.dentry->d_inode->i_mode))
+ if (S_ISREG(file_inode(file)->i_mode))
nfs_file_set_open_context(file, ctx);
else
err = -EOPENSTALE;
@@ -2489,9 +2494,8 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
spin_unlock(&dentry->d_lock);
goto out;
}
- if (dentry->d_fsdata)
- /* old devname */
- kfree(dentry->d_fsdata);
+ /* old devname */
+ kfree(dentry->d_fsdata);
dentry->d_fsdata = NFS_FSDATA_BLOCKED;
spin_unlock(&dentry->d_lock);
@@ -2949,9 +2953,28 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, co
return NULL;
}
+static u64 nfs_access_login_time(const struct task_struct *task,
+ const struct cred *cred)
+{
+ const struct task_struct *parent;
+ u64 ret;
+
+ rcu_read_lock();
+ for (;;) {
+ parent = rcu_dereference(task->real_parent);
+ if (parent == task || cred_fscmp(parent->cred, cred) != 0)
+ break;
+ task = parent;
+ }
+ ret = task->start_time;
+ rcu_read_unlock();
+ return ret;
+}
+
static int nfs_access_get_cached_locked(struct inode *inode, const struct cred *cred, u32 *mask, bool may_block)
{
struct nfs_inode *nfsi = NFS_I(inode);
+ u64 login_time = nfs_access_login_time(current, cred);
struct nfs_access_entry *cache;
bool retry = true;
int err;
@@ -2979,6 +3002,9 @@ static int nfs_access_get_cached_locked(struct inode *inode, const struct cred *
spin_lock(&inode->i_lock);
retry = false;
}
+ err = -ENOENT;
+ if ((s64)(login_time - cache->timestamp) > 0)
+ goto out;
*mask = cache->mask;
list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru);
err = 0;
@@ -3058,6 +3084,7 @@ static void nfs_access_add_rbtree(struct inode *inode,
else
goto found;
}
+ set->timestamp = ktime_get_ns();
rb_link_node(&set->rb_node, parent, p);
rb_insert_color(&set->rb_node, root_node);
list_add_tail(&set->lru, &nfsi->access_cache_entry_lru);
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index e87d500ad95a..6603b5cee029 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -16,8 +16,9 @@
#include "dns_resolve.h"
ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
- struct sockaddr *sa, size_t salen)
+ struct sockaddr_storage *ss, size_t salen)
{
+ struct sockaddr *sa = (struct sockaddr *)ss;
ssize_t ret;
char *ip_addr = NULL;
int ip_len;
@@ -341,7 +342,7 @@ out:
}
ssize_t nfs_dns_resolve_name(struct net *net, char *name,
- size_t namelen, struct sockaddr *sa, size_t salen)
+ size_t namelen, struct sockaddr_storage *ss, size_t salen)
{
struct nfs_dns_ent key = {
.hostname = name,
@@ -354,7 +355,7 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name,
ret = do_cache_lookup_wait(nn->nfs_dns_resolve, &key, &item);
if (ret == 0) {
if (salen >= item->addrlen) {
- memcpy(sa, &item->addr, item->addrlen);
+ memcpy(ss, &item->addr, item->addrlen);
ret = item->addrlen;
} else
ret = -EOVERFLOW;
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h
index 576ff4b54c82..fe3b172c4de1 100644
--- a/fs/nfs/dns_resolve.h
+++ b/fs/nfs/dns_resolve.h
@@ -32,6 +32,6 @@ extern void nfs_dns_resolver_cache_destroy(struct net *net);
#endif
extern ssize_t nfs_dns_resolve_name(struct net *net, char *name,
- size_t namelen, struct sockaddr *sa, size_t salen);
+ size_t namelen, struct sockaddr_storage *sa, size_t salen);
#endif
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index e032fe201a36..d8ec889a4b3f 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -567,7 +567,8 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf)
}
wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING,
- nfs_wait_bit_killable, TASK_KILLABLE);
+ nfs_wait_bit_killable,
+ TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
lock_page(page);
mapping = page_file_mapping(page);
@@ -655,9 +656,9 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
if (mntflags & NFS_MOUNT_WRITE_WAIT) {
- result = filemap_fdatawait_range(file->f_mapping,
- iocb->ki_pos - written,
- iocb->ki_pos - 1);
+ filemap_fdatawait_range(file->f_mapping,
+ iocb->ki_pos - written,
+ iocb->ki_pos - 1);
}
result = generic_write_sync(iocb, written);
if (result < 0)
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 7d285561e59f..7deb3cd76abe 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -30,14 +30,20 @@
#define FF_LAYOUT_POLL_RETRY_MAX (15*HZ)
#define FF_LAYOUTRETURN_MAXERR 20
+enum nfs4_ff_op_type {
+ NFS4_FF_OP_LAYOUTSTATS,
+ NFS4_FF_OP_LAYOUTRETURN,
+};
+
static unsigned short io_maxretrans;
static const struct pnfs_commit_ops ff_layout_commit_ops;
static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
struct nfs_pgio_header *hdr);
-static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
+static int
+ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
struct nfs42_layoutstat_devinfo *devinfo,
- int dev_limit);
+ int dev_limit, enum nfs4_ff_op_type type);
static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
const struct nfs42_layoutstat_devinfo *devinfo,
struct nfs4_ff_layout_mirror *mirror);
@@ -487,10 +493,10 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
gid = make_kgid(&init_user_ns, id);
if (gfp_flags & __GFP_FS)
- kcred = prepare_kernel_cred(NULL);
+ kcred = prepare_kernel_cred(&init_task);
else {
unsigned int nofs_flags = memalloc_nofs_save();
- kcred = prepare_kernel_cred(NULL);
+ kcred = prepare_kernel_cred(&init_task);
memalloc_nofs_restore(nofs_flags);
}
rc = -ENOMEM;
@@ -1373,6 +1379,11 @@ static int ff_layout_read_prepare_common(struct rpc_task *task,
return -EIO;
}
+ if (!pnfs_is_valid_lseg(hdr->lseg)) {
+ rpc_exit(task, -EAGAIN);
+ return -EAGAIN;
+ }
+
ff_layout_read_record_layoutstats_start(task, hdr);
return 0;
}
@@ -1553,6 +1564,11 @@ static int ff_layout_write_prepare_common(struct rpc_task *task,
return -EIO;
}
+ if (!pnfs_is_valid_lseg(hdr->lseg)) {
+ rpc_exit(task, -EAGAIN);
+ return -EAGAIN;
+ }
+
ff_layout_write_record_layoutstats_start(task, hdr);
return 0;
}
@@ -1645,15 +1661,23 @@ static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task,
set_bit(NFS_LSEG_LAYOUTRETURN, &cdata->lseg->pls_flags);
}
-static void ff_layout_commit_prepare_common(struct rpc_task *task,
- struct nfs_commit_data *cdata)
+static int ff_layout_commit_prepare_common(struct rpc_task *task,
+ struct nfs_commit_data *cdata)
{
+ if (!pnfs_is_valid_lseg(cdata->lseg)) {
+ rpc_exit(task, -EAGAIN);
+ return -EAGAIN;
+ }
+
ff_layout_commit_record_layoutstats_start(task, cdata);
+ return 0;
}
static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
{
- ff_layout_commit_prepare_common(task, data);
+ if (ff_layout_commit_prepare_common(task, data))
+ return;
+
rpc_call_start(task);
}
@@ -1949,6 +1973,65 @@ ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
ff_layout_initiate_commit);
}
+static bool ff_layout_match_rw(const struct rpc_task *task,
+ const struct nfs_pgio_header *hdr,
+ const struct pnfs_layout_segment *lseg)
+{
+ return hdr->lseg == lseg;
+}
+
+static bool ff_layout_match_commit(const struct rpc_task *task,
+ const struct nfs_commit_data *cdata,
+ const struct pnfs_layout_segment *lseg)
+{
+ return cdata->lseg == lseg;
+}
+
+static bool ff_layout_match_io(const struct rpc_task *task, const void *data)
+{
+ const struct rpc_call_ops *ops = task->tk_ops;
+
+ if (ops == &ff_layout_read_call_ops_v3 ||
+ ops == &ff_layout_read_call_ops_v4 ||
+ ops == &ff_layout_write_call_ops_v3 ||
+ ops == &ff_layout_write_call_ops_v4)
+ return ff_layout_match_rw(task, task->tk_calldata, data);
+ if (ops == &ff_layout_commit_call_ops_v3 ||
+ ops == &ff_layout_commit_call_ops_v4)
+ return ff_layout_match_commit(task, task->tk_calldata, data);
+ return false;
+}
+
+static void ff_layout_cancel_io(struct pnfs_layout_segment *lseg)
+{
+ struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
+ struct nfs4_ff_layout_mirror *mirror;
+ struct nfs4_ff_layout_ds *mirror_ds;
+ struct nfs4_pnfs_ds *ds;
+ struct nfs_client *ds_clp;
+ struct rpc_clnt *clnt;
+ u32 idx;
+
+ for (idx = 0; idx < flseg->mirror_array_cnt; idx++) {
+ mirror = flseg->mirror_array[idx];
+ mirror_ds = mirror->mirror_ds;
+ if (!mirror_ds)
+ continue;
+ ds = mirror->mirror_ds->ds;
+ if (!ds)
+ continue;
+ ds_clp = ds->ds_clp;
+ if (!ds_clp)
+ continue;
+ clnt = ds_clp->cl_rpcclient;
+ if (!clnt)
+ continue;
+ if (!rpc_cancel_tasks(clnt, -EAGAIN, ff_layout_match_io, lseg))
+ continue;
+ rpc_clnt_disconnect(clnt);
+ }
+}
+
static struct pnfs_ds_commit_info *
ff_layout_get_ds_info(struct inode *inode)
{
@@ -2161,8 +2244,9 @@ ff_layout_prepare_layoutreturn(struct nfs4_layoutreturn_args *args)
FF_LAYOUTRETURN_MAXERR);
spin_lock(&args->inode->i_lock);
- ff_args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr,
- &ff_args->devinfo[0], ARRAY_SIZE(ff_args->devinfo));
+ ff_args->num_dev = ff_layout_mirror_prepare_stats(
+ &ff_layout->generic_hdr, &ff_args->devinfo[0],
+ ARRAY_SIZE(ff_args->devinfo), NFS4_FF_OP_LAYOUTRETURN);
spin_unlock(&args->inode->i_lock);
args->ld_private->ops = &layoutreturn_ops;
@@ -2396,7 +2480,7 @@ static const struct nfs4_xdr_opaque_ops layoutstat_ops = {
static int
ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
struct nfs42_layoutstat_devinfo *devinfo,
- int dev_limit)
+ int dev_limit, enum nfs4_ff_op_type type)
{
struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
struct nfs4_ff_layout_mirror *mirror;
@@ -2408,7 +2492,9 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
break;
if (IS_ERR_OR_NULL(mirror->mirror_ds))
continue;
- if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags))
+ if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL,
+ &mirror->flags) &&
+ type != NFS4_FF_OP_LAYOUTRETURN)
continue;
/* mirror refcount put in cleanup_layoutstats */
if (!refcount_inc_not_zero(&mirror->ref))
@@ -2448,7 +2534,9 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
spin_lock(&args->inode->i_lock);
ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout);
args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr,
- &args->devinfo[0], dev_count);
+ &args->devinfo[0],
+ dev_count,
+ NFS4_FF_OP_LAYOUTSTATS);
spin_unlock(&args->inode->i_lock);
if (!args->num_dev) {
kfree(args->devinfo);
@@ -2501,6 +2589,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
.prepare_layoutreturn = ff_layout_prepare_layoutreturn,
.sync = pnfs_nfs_generic_sync,
.prepare_layoutstats = ff_layout_prepare_layoutstats,
+ .cancel_io = ff_layout_cancel_io,
};
static int __init nfs4flexfilelayout_init(void)
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index 4da701fd1424..9bcd53d5c7d4 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -273,9 +273,9 @@ static const struct constant_table nfs_secflavor_tokens[] = {
* Address family must be initialized, and address must not be
* the ANY address for that family.
*/
-static int nfs_verify_server_address(struct sockaddr *addr)
+static int nfs_verify_server_address(struct sockaddr_storage *addr)
{
- switch (addr->sa_family) {
+ switch (addr->ss_family) {
case AF_INET: {
struct sockaddr_in *sa = (struct sockaddr_in *)addr;
return sa->sin_addr.s_addr != htonl(INADDR_ANY);
@@ -684,6 +684,8 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
return ret;
break;
case Opt_vers:
+ if (!param->string)
+ goto out_invalid_value;
trace_nfs_mount_assign(param->key, param->string);
ret = nfs_parse_version_string(fc, param->string);
if (ret < 0)
@@ -696,6 +698,8 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
break;
case Opt_proto:
+ if (!param->string)
+ goto out_invalid_value;
trace_nfs_mount_assign(param->key, param->string);
protofamily = AF_INET;
switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) {
@@ -732,6 +736,8 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
break;
case Opt_mountproto:
+ if (!param->string)
+ goto out_invalid_value;
trace_nfs_mount_assign(param->key, param->string);
mountfamily = AF_INET;
switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) {
@@ -969,7 +975,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc,
{
struct nfs_fs_context *ctx = nfs_fc2context(fc);
struct nfs_fh *mntfh = ctx->mntfh;
- struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address;
+ struct sockaddr_storage *sap = &ctx->nfs_server._address;
int extra_flags = NFS_MOUNT_LEGACY_INTERFACE;
int ret;
@@ -1044,7 +1050,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc,
memcpy(sap, &data->addr, sizeof(data->addr));
ctx->nfs_server.addrlen = sizeof(data->addr);
ctx->nfs_server.port = ntohs(data->addr.sin_port);
- if (sap->sa_family != AF_INET ||
+ if (sap->ss_family != AF_INET ||
!nfs_verify_server_address(sap))
goto out_no_address;
@@ -1200,7 +1206,7 @@ static int nfs4_parse_monolithic(struct fs_context *fc,
struct nfs4_mount_data *data)
{
struct nfs_fs_context *ctx = nfs_fc2context(fc);
- struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address;
+ struct sockaddr_storage *sap = &ctx->nfs_server._address;
int ret;
char *c;
@@ -1314,7 +1320,7 @@ static int nfs_fs_context_validate(struct fs_context *fc)
{
struct nfs_fs_context *ctx = nfs_fc2context(fc);
struct nfs_subversion *nfs_mod;
- struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address;
+ struct sockaddr_storage *sap = &ctx->nfs_server._address;
int max_namelen = PAGE_SIZE;
int max_pathlen = NFS_MAXPATHLEN;
int port = 0;
@@ -1540,7 +1546,7 @@ static int nfs_init_fs_context(struct fs_context *fc)
ctx->version = nfss->nfs_client->rpc_ops->version;
ctx->minorversion = nfss->nfs_client->cl_minorversion;
- memcpy(&ctx->nfs_server.address, &nfss->nfs_client->cl_addr,
+ memcpy(&ctx->nfs_server._address, &nfss->nfs_client->cl_addr,
ctx->nfs_server.addrlen);
if (fc->net_ns != net) {
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index e861d7bae305..e731c00a9fcb 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -252,7 +252,7 @@ static int fscache_fallback_read_page(struct inode *inode, struct page *page)
bvec[0].bv_page = page;
bvec[0].bv_offset = 0;
bvec[0].bv_len = PAGE_SIZE;
- iov_iter_bvec(&iter, READ, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
+ iov_iter_bvec(&iter, ITER_DEST, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
ret = fscache_begin_read_operation(&cres, cookie);
if (ret < 0)
@@ -282,7 +282,7 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page,
bvec[0].bv_page = page;
bvec[0].bv_offset = 0;
bvec[0].bv_len = PAGE_SIZE;
- iov_iter_bvec(&iter, WRITE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
+ iov_iter_bvec(&iter, ITER_SOURCE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
ret = fscache_begin_write_operation(&cres, cookie);
if (ret < 0)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index bea7c005119c..e98ee7599eeb 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -72,18 +72,13 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
return nfs_fileid_to_ino_t(fattr->fileid);
}
-static int nfs_wait_killable(int mode)
+int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
{
- freezable_schedule_unsafe();
+ schedule();
if (signal_pending_state(mode, current))
return -ERESTARTSYS;
return 0;
}
-
-int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
-{
- return nfs_wait_killable(mode);
-}
EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
/**
@@ -318,7 +313,7 @@ struct nfs_find_desc {
static int
nfs_find_actor(struct inode *inode, void *opaque)
{
- struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque;
+ struct nfs_find_desc *desc = opaque;
struct nfs_fh *fh = desc->fh;
struct nfs_fattr *fattr = desc->fattr;
@@ -336,7 +331,7 @@ nfs_find_actor(struct inode *inode, void *opaque)
static int
nfs_init_locked(struct inode *inode, void *opaque)
{
- struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque;
+ struct nfs_find_desc *desc = opaque;
struct nfs_fattr *fattr = desc->fattr;
set_nfs_fileid(inode, fattr->fileid);
@@ -1173,7 +1168,8 @@ int nfs_open(struct inode *inode, struct file *filp)
{
struct nfs_open_context *ctx;
- ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp);
+ ctx = alloc_nfs_open_context(file_dentry(filp),
+ flags_to_mode(filp->f_flags), filp);
if (IS_ERR(ctx))
return PTR_ERR(ctx);
nfs_file_set_open_context(filp, ctx);
@@ -1332,7 +1328,8 @@ int nfs_clear_invalid_mapping(struct address_space *mapping)
*/
for (;;) {
ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING,
- nfs_wait_bit_killable, TASK_KILLABLE);
+ nfs_wait_bit_killable,
+ TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
if (ret)
goto out;
spin_lock(&inode->i_lock);
@@ -2271,7 +2268,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
static void init_once(void *foo)
{
- struct nfs_inode *nfsi = (struct nfs_inode *) foo;
+ struct nfs_inode *nfsi = foo;
inode_init_once(&nfsi->vfs_inode);
INIT_LIST_HEAD(&nfsi->open_files);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 898dd95bc7a7..ae7d4a8c728c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -69,7 +69,7 @@ static inline fmode_t flags_to_mode(int flags)
struct nfs_client_initdata {
unsigned long init_flags;
const char *hostname; /* Hostname of the server */
- const struct sockaddr *addr; /* Address of the server */
+ const struct sockaddr_storage *addr; /* Address of the server */
const char *nodename; /* Hostname of the client */
const char *ip_addr; /* IP address of the client */
size_t addrlen;
@@ -180,7 +180,7 @@ static inline struct nfs_fs_context *nfs_fc2context(const struct fs_context *fc)
/* mount_clnt.c */
struct nfs_mount_request {
- struct sockaddr *sap;
+ struct sockaddr_storage *sap;
size_t salen;
char *hostname;
char *dirpath;
@@ -223,7 +223,7 @@ extern void nfs4_server_set_init_caps(struct nfs_server *);
extern struct nfs_server *nfs4_create_server(struct fs_context *);
extern struct nfs_server *nfs4_create_referral_server(struct fs_context *);
extern int nfs4_update_server(struct nfs_server *server, const char *hostname,
- struct sockaddr *sap, size_t salen,
+ struct sockaddr_storage *sap, size_t salen,
struct net *net);
extern void nfs_free_server(struct nfs_server *server);
extern struct nfs_server *nfs_clone_server(struct nfs_server *,
@@ -235,7 +235,7 @@ extern int nfs_client_init_status(const struct nfs_client *clp);
extern int nfs_wait_client_init_complete(const struct nfs_client *clp);
extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
- const struct sockaddr *ds_addr,
+ const struct sockaddr_storage *ds_addr,
int ds_addrlen, int ds_proto,
unsigned int ds_timeo,
unsigned int ds_retrans,
@@ -243,7 +243,7 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
struct inode *);
extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
- const struct sockaddr *ds_addr, int ds_addrlen,
+ const struct sockaddr_storage *ds_addr, int ds_addrlen,
int ds_proto, unsigned int ds_timeo,
unsigned int ds_retrans);
#ifdef CONFIG_PROC_FS
@@ -435,7 +435,6 @@ extern void nfs_zap_acl_cache(struct inode *inode);
extern void nfs_set_cache_invalid(struct inode *inode, unsigned long flags);
extern bool nfs_check_cache_invalid(struct inode *, unsigned long);
extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
-extern int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode);
/* super.c */
extern const struct super_operations nfs_sops;
@@ -503,7 +502,6 @@ extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
const struct nfs_pgio_completion_ops *compl_ops);
extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
extern void nfs_commit_free(struct nfs_commit_data *p);
-extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
extern void nfs_commit_prepare(struct rpc_task *task, void *calldata);
extern int nfs_initiate_commit(struct rpc_clnt *clnt,
struct nfs_commit_data *data,
@@ -741,12 +739,10 @@ unsigned long nfs_io_size(unsigned long iosize, enum xprt_transports proto)
iosize = NFS_DEF_FILE_IO_SIZE;
else if (iosize >= NFS_MAX_FILE_IO_SIZE)
iosize = NFS_MAX_FILE_IO_SIZE;
- else
- iosize = iosize & PAGE_MASK;
- if (proto == XPRT_TRANSPORT_UDP)
+ if (proto == XPRT_TRANSPORT_UDP || iosize < PAGE_SIZE)
return nfs_block_bits(iosize, NULL);
- return iosize;
+ return iosize & PAGE_MASK;
}
/*
@@ -896,13 +892,13 @@ static inline bool nfs_error_is_fatal_on_server(int err)
* Select between a default port value and a user-specified port value.
* If a zero value is set, then autobind will be used.
*/
-static inline void nfs_set_port(struct sockaddr *sap, int *port,
+static inline void nfs_set_port(struct sockaddr_storage *sap, int *port,
const unsigned short default_port)
{
if (*port == NFS_UNSPEC_PORT)
*port = default_port;
- rpc_set_port(sap, *port);
+ rpc_set_port((struct sockaddr *)sap, *port);
}
struct nfs_direct_req {
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index c5e3b6b3366a..68e76b626371 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -158,7 +158,7 @@ int nfs_mount(struct nfs_mount_request *info, int timeo, int retrans)
struct rpc_create_args args = {
.net = info->net,
.protocol = info->protocol,
- .address = info->sap,
+ .address = (struct sockaddr *)info->sap,
.addrsize = info->salen,
.timeout = &mnt_timeout,
.servername = info->hostname,
@@ -245,7 +245,7 @@ void nfs_umount(const struct nfs_mount_request *info)
struct rpc_create_args args = {
.net = info->net,
.protocol = IPPROTO_UDP,
- .address = info->sap,
+ .address = (struct sockaddr *)info->sap,
.addrsize = info->salen,
.timeout = &nfs_umnt_timeout,
.servername = info->hostname,
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 3295af4110f1..b0ef7e7ddb30 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -147,7 +147,7 @@ struct vfsmount *nfs_d_automount(struct path *path)
struct nfs_fs_context *ctx;
struct fs_context *fc;
struct vfsmount *mnt = ERR_PTR(-ENOMEM);
- struct nfs_server *server = NFS_SERVER(d_inode(path->dentry));
+ struct nfs_server *server = NFS_SB(path->dentry->d_sb);
struct nfs_client *client = server->nfs_client;
int timeout = READ_ONCE(nfs_mountpoint_expiry_timeout);
int ret;
@@ -175,7 +175,7 @@ struct vfsmount *nfs_d_automount(struct path *path)
}
/* for submounts we want the same server; referrals will reassign */
- memcpy(&ctx->nfs_server.address, &client->cl_addr, client->cl_addrlen);
+ memcpy(&ctx->nfs_server._address, &client->cl_addr, client->cl_addrlen);
ctx->nfs_server.addrlen = client->cl_addrlen;
ctx->nfs_server.port = server->port;
@@ -354,7 +354,7 @@ static int param_get_nfs_timeout(char *buffer, const struct kernel_param *kp)
num = (num + (HZ - 1)) / HZ;
} else
num = -1;
- return scnprintf(buffer, PAGE_SIZE, "%li\n", num);
+ return sysfs_emit(buffer, "%li\n", num);
}
static const struct kernel_param_ops param_ops_nfs_timeout = {
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
index 03a4e679fd99..df9ca56db347 100644
--- a/fs/nfs/nfs3_fs.h
+++ b/fs/nfs/nfs3_fs.h
@@ -12,7 +12,7 @@
*/
#ifdef CONFIG_NFS_V3_ACL
extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu);
-extern int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+extern int nfs3_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
struct posix_acl *dfacl);
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 93de0b58647a..74d11e3c4205 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -255,23 +255,24 @@ int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
}
-int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int nfs3_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
struct posix_acl *orig = acl, *dfacl = NULL, *alloc;
+ struct inode *inode = d_inode(dentry);
int status;
if (S_ISDIR(inode->i_mode)) {
switch(type) {
case ACL_TYPE_ACCESS:
- alloc = get_acl(inode, ACL_TYPE_DEFAULT);
+ alloc = get_inode_acl(inode, ACL_TYPE_DEFAULT);
if (IS_ERR(alloc))
goto fail;
dfacl = alloc;
break;
case ACL_TYPE_DEFAULT:
- alloc = get_acl(inode, ACL_TYPE_ACCESS);
+ alloc = get_inode_acl(inode, ACL_TYPE_ACCESS);
if (IS_ERR(alloc))
goto fail;
dfacl = acl;
@@ -312,7 +313,7 @@ nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data,
struct posix_acl *acl;
char *p = data + *result;
- acl = get_acl(inode, type);
+ acl = get_inode_acl(inode, type);
if (IS_ERR_OR_NULL(acl))
return 0;
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index b49359afac88..669cda757a5c 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -78,7 +78,7 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source,
* the MDS.
*/
struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
- const struct sockaddr *ds_addr, int ds_addrlen,
+ const struct sockaddr_storage *ds_addr, int ds_addrlen,
int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans)
{
struct rpc_timeout ds_timeout;
@@ -98,7 +98,7 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
char buf[INET6_ADDRSTRLEN + 1];
/* fake a hostname because lockd wants it */
- if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0)
+ if (rpc_ntop((struct sockaddr *)ds_addr, buf, sizeof(buf)) <= 0)
return ERR_PTR(-EINVAL);
cl_init.hostname = buf;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 1597eef40d54..4bf208a0a8e9 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -36,7 +36,8 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
res = rpc_call_sync(clnt, msg, flags);
if (res != -EJUKEBOX)
break;
- freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME);
+ __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
+ schedule_timeout(NFS_JUKEBOX_RETRY_TIME);
res = -ERESTARTSYS;
} while (!fatal_signal_pending(current));
return res;
@@ -997,7 +998,7 @@ static const struct inode_operations nfs3_dir_inode_operations = {
.setattr = nfs_setattr,
#ifdef CONFIG_NFS_V3_ACL
.listxattr = nfs3_listxattr,
- .get_acl = nfs3_get_acl,
+ .get_inode_acl = nfs3_get_acl,
.set_acl = nfs3_set_acl,
#endif
};
@@ -1008,7 +1009,7 @@ static const struct inode_operations nfs3_file_inode_operations = {
.setattr = nfs_setattr,
#ifdef CONFIG_NFS_V3_ACL
.listxattr = nfs3_listxattr,
- .get_acl = nfs3_get_acl,
+ .get_inode_acl = nfs3_get_acl,
.set_acl = nfs3_set_acl,
#endif
};
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 6dab9e408372..ecb428512fe1 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -341,7 +341,7 @@ static ssize_t _nfs42_proc_copy(struct file *src,
return status;
}
}
- status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping,
+ status = nfs_filemap_write_and_wait_range(src->f_mapping,
pos_src, pos_src + (loff_t)count - 1);
if (status)
return status;
@@ -1093,6 +1093,9 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
&args.seq_args, &res.seq_res, 0);
trace_nfs4_clone(src_inode, dst_inode, &args, status);
if (status == 0) {
+ /* a zero-length count means clone to EOF in src */
+ if (count == 0 && res.dst_fattr->valid & NFS_ATTR_FATTR_SIZE)
+ count = nfs_size_to_loff_t(res.dst_fattr->size) - dst_offset;
nfs42_copy_dest_done(dst_inode, dst_offset, count);
status = nfs_post_op_update_inode(dst_inode, res.dst_fattr);
}
@@ -1175,6 +1178,7 @@ static int _nfs42_proc_removexattr(struct inode *inode, const char *name)
ret = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
&res.seq_res, 1);
+ trace_nfs4_removexattr(inode, name, ret);
if (!ret)
nfs4_update_changeattr(inode, &res.cinfo, timestamp, 0);
@@ -1214,6 +1218,7 @@ static int _nfs42_proc_setxattr(struct inode *inode, const char *name,
ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args,
&res.seq_res, 1);
+ trace_nfs4_setxattr(inode, name, ret);
for (; np > 0; np--)
put_page(pages[np - 1]);
@@ -1246,6 +1251,7 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name,
ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args,
&res.seq_res, 0);
+ trace_nfs4_getxattr(inode, name, ret);
if (ret < 0)
return ret;
@@ -1317,6 +1323,7 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf,
ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args,
&res.seq_res, 0);
+ trace_nfs4_listxattr(inode, ret);
if (ret >= 0) {
ret = res.copied;
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
index a9bf09fdf2c3..76ae11834206 100644
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -981,7 +981,7 @@ nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc)
static void nfs4_xattr_cache_init_once(void *p)
{
- struct nfs4_xattr_cache *cache = (struct nfs4_xattr_cache *)p;
+ struct nfs4_xattr_cache *cache = p;
spin_lock_init(&cache->listxattr_lock);
atomic_long_set(&cache->nent, 0);
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index b56f05113d36..d80ee88ca996 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -47,13 +47,14 @@
#define decode_deallocate_maxsz (op_decode_hdr_maxsz)
#define encode_read_plus_maxsz (op_encode_hdr_maxsz + \
encode_stateid_maxsz + 3)
-#define NFS42_READ_PLUS_SEGMENT_SIZE (1 /* data_content4 */ + \
+#define NFS42_READ_PLUS_DATA_SEGMENT_SIZE \
+ (1 /* data_content4 */ + \
2 /* data_info4.di_offset */ + \
- 2 /* data_info4.di_length */)
+ 1 /* data_info4.di_length */)
#define decode_read_plus_maxsz (op_decode_hdr_maxsz + \
1 /* rpr_eof */ + \
1 /* rpr_contents count */ + \
- 2 * NFS42_READ_PLUS_SEGMENT_SIZE)
+ NFS42_READ_PLUS_DATA_SEGMENT_SIZE)
#define encode_seek_maxsz (op_encode_hdr_maxsz + \
encode_stateid_maxsz + \
2 /* offset */ + \
@@ -569,6 +570,14 @@ static int decode_listxattrs(struct xdr_stream *xdr,
*/
if (status == -ETOOSMALL)
status = -ERANGE;
+ /*
+ * Special case: for LISTXATTRS, NFS4ERR_NOXATTR
+ * should be translated to success with zero-length reply.
+ */
+ if (status == -ENODATA) {
+ res->eof = true;
+ status = 0;
+ }
goto out;
}
@@ -1134,7 +1143,7 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
if (!segs)
return -ENOMEM;
- xdr_set_scratch_buffer(xdr, &scratch_buf, 32);
+ xdr_set_scratch_buffer(xdr, &scratch_buf, sizeof(scratch_buf));
status = -EIO;
for (i = 0; i < segments; i++) {
status = decode_read_plus_segment(xdr, &segs[i]);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 79df6e83881b..5edd1704f735 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -149,6 +149,7 @@ struct nfs4_lock_state {
struct nfs4_state * ls_state; /* Pointer to open state */
#define NFS_LOCK_INITIALIZED 0
#define NFS_LOCK_LOST 1
+#define NFS_LOCK_UNLOCKING 2
unsigned long ls_flags;
struct nfs_seqid_counter ls_seqid;
nfs4_stateid ls_stateid;
@@ -281,7 +282,7 @@ struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *,
int nfs4_submount(struct fs_context *, struct nfs_server *);
int nfs4_replace_transport(struct nfs_server *server,
const struct nfs4_fs_locations *locations);
-size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa,
+size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr_storage *ss,
size_t salen, struct net *net, int port);
/* nfs4proc.c */
extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *);
@@ -459,7 +460,6 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *);
/* nfs4renewd.c */
extern void nfs4_schedule_state_renewal(struct nfs_client *);
-extern void nfs4_renewd_prepare_shutdown(struct nfs_server *);
extern void nfs4_kill_renewd(struct nfs_client *);
extern void nfs4_renew_state(struct work_struct *);
extern void nfs4_set_lease_period(struct nfs_client *clp, unsigned long lease);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 3c5678aec006..d3051b051a56 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -254,7 +254,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
goto error;
ip_addr = (const char *)buf;
}
- strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
+ strscpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
err = nfs_idmap_new(clp);
if (err < 0) {
@@ -346,6 +346,7 @@ int nfs40_init_client(struct nfs_client *clp)
ret = nfs4_setup_slot_table(tbl, NFS4_MAX_SLOT_TABLE,
"NFSv4.0 transport Slot table");
if (ret) {
+ nfs4_shutdown_slot_table(tbl);
kfree(tbl);
return ret;
}
@@ -889,7 +890,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
*/
static int nfs4_set_client(struct nfs_server *server,
const char *hostname,
- const struct sockaddr *addr,
+ const struct sockaddr_storage *addr,
const size_t addrlen,
const char *ip_addr,
int proto, const struct rpc_timeout *timeparms,
@@ -924,7 +925,7 @@ static int nfs4_set_client(struct nfs_server *server,
__set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
if (test_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status))
__set_bit(NFS_CS_TSM_POSSIBLE, &cl_init.init_flags);
- server->port = rpc_get_port(addr);
+ server->port = rpc_get_port((struct sockaddr *)addr);
/* Allocate or find a client reference we can use */
clp = nfs_get_client(&cl_init);
@@ -960,7 +961,7 @@ static int nfs4_set_client(struct nfs_server *server,
* the MDS.
*/
struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
- const struct sockaddr *ds_addr, int ds_addrlen,
+ const struct sockaddr_storage *ds_addr, int ds_addrlen,
int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
u32 minor_version)
{
@@ -980,7 +981,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
};
char buf[INET6_ADDRSTRLEN + 1];
- if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0)
+ if (rpc_ntop((struct sockaddr *)ds_addr, buf, sizeof(buf)) <= 0)
return ERR_PTR(-EINVAL);
cl_init.hostname = buf;
@@ -1148,7 +1149,7 @@ static int nfs4_init_server(struct nfs_server *server, struct fs_context *fc)
/* Get a client record */
error = nfs4_set_client(server,
ctx->nfs_server.hostname,
- &ctx->nfs_server.address,
+ &ctx->nfs_server._address,
ctx->nfs_server.addrlen,
ctx->client_address,
ctx->nfs_server.protocol,
@@ -1238,7 +1239,7 @@ struct nfs_server *nfs4_create_referral_server(struct fs_context *fc)
rpc_set_port(&ctx->nfs_server.address, NFS_RDMA_PORT);
error = nfs4_set_client(server,
ctx->nfs_server.hostname,
- &ctx->nfs_server.address,
+ &ctx->nfs_server._address,
ctx->nfs_server.addrlen,
parent_client->cl_ipaddr,
XPRT_TRANSPORT_RDMA,
@@ -1254,7 +1255,7 @@ struct nfs_server *nfs4_create_referral_server(struct fs_context *fc)
rpc_set_port(&ctx->nfs_server.address, NFS_PORT);
error = nfs4_set_client(server,
ctx->nfs_server.hostname,
- &ctx->nfs_server.address,
+ &ctx->nfs_server._address,
ctx->nfs_server.addrlen,
parent_client->cl_ipaddr,
XPRT_TRANSPORT_TCP,
@@ -1303,14 +1304,14 @@ error:
* Returns zero on success, or a negative errno value.
*/
int nfs4_update_server(struct nfs_server *server, const char *hostname,
- struct sockaddr *sap, size_t salen, struct net *net)
+ struct sockaddr_storage *sap, size_t salen, struct net *net)
{
struct nfs_client *clp = server->nfs_client;
struct rpc_clnt *clnt = server->client;
struct xprt_create xargs = {
.ident = clp->cl_proto,
.net = net,
- .dstaddr = sap,
+ .dstaddr = (struct sockaddr *)sap,
.addrlen = salen,
.servername = hostname,
};
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 9eb181287879..2563ed8580f3 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -32,7 +32,6 @@ nfs4_file_open(struct inode *inode, struct file *filp)
struct dentry *parent = NULL;
struct inode *dir;
unsigned openflags = filp->f_flags;
- fmode_t f_mode;
struct iattr attr;
int err;
@@ -51,17 +50,14 @@ nfs4_file_open(struct inode *inode, struct file *filp)
if (err)
return err;
- f_mode = filp->f_mode;
- if ((openflags & O_ACCMODE) == 3)
- f_mode |= flags_to_mode(openflags);
-
/* We can't create new files here */
openflags &= ~(O_CREAT|O_EXCL);
parent = dget_parent(dentry);
dir = d_inode(parent);
- ctx = alloc_nfs_open_context(file_dentry(filp), f_mode, filp);
+ ctx = alloc_nfs_open_context(file_dentry(filp),
+ flags_to_mode(openflags), filp);
err = PTR_ERR(ctx);
if (IS_ERR(ctx))
goto out;
@@ -366,8 +362,8 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
goto out_free_name;
}
- ctx = alloc_nfs_open_context(filep->f_path.dentry, filep->f_mode,
- filep);
+ ctx = alloc_nfs_open_context(filep->f_path.dentry,
+ flags_to_mode(filep->f_flags), filep);
if (IS_ERR(ctx)) {
res = ERR_CAST(ctx);
goto out_filep;
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index ec6afd3c4bca..25a7c771cfd8 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -203,7 +203,7 @@ int nfs_idmap_init(void)
printk(KERN_NOTICE "NFS: Registering the %s key type\n",
key_type_id_resolver.name);
- cred = prepare_kernel_cred(NULL);
+ cred = prepare_kernel_cred(&init_task);
if (!cred)
return -ENOMEM;
@@ -583,7 +583,7 @@ static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux)
struct request_key_auth *rka = get_request_key_auth(authkey);
struct rpc_pipe_msg *msg;
struct idmap_msg *im;
- struct idmap *idmap = (struct idmap *)aux;
+ struct idmap *idmap = aux;
struct key *key = rka->target_key;
int ret = -ENOKEY;
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index f2dbf904c598..9a98595bb160 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -164,16 +164,17 @@ static int nfs4_validate_fspath(struct dentry *dentry,
return 0;
}
-size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa,
+size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr_storage *ss,
size_t salen, struct net *net, int port)
{
+ struct sockaddr *sa = (struct sockaddr *)ss;
ssize_t ret;
ret = rpc_pton(net, string, len, sa, salen);
if (ret == 0) {
ret = rpc_uaddr2sockaddr(net, string, len, sa, salen);
if (ret == 0) {
- ret = nfs_dns_resolve_name(net, string, len, sa, salen);
+ ret = nfs_dns_resolve_name(net, string, len, ss, salen);
if (ret < 0)
ret = 0;
}
@@ -331,7 +332,7 @@ static int try_location(struct fs_context *fc,
ctx->nfs_server.addrlen =
nfs_parse_server_name(buf->data, buf->len,
- &ctx->nfs_server.address,
+ &ctx->nfs_server._address,
sizeof(ctx->nfs_server._address),
fc->net_ns, 0);
if (ctx->nfs_server.addrlen == 0)
@@ -483,14 +484,13 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server,
char *page, char *page2,
const struct nfs4_fs_location *location)
{
- const size_t addr_bufsize = sizeof(struct sockaddr_storage);
struct net *net = rpc_net_ns(server->client);
- struct sockaddr *sap;
+ struct sockaddr_storage *sap;
unsigned int s;
size_t salen;
int error;
- sap = kmalloc(addr_bufsize, GFP_KERNEL);
+ sap = kmalloc(sizeof(*sap), GFP_KERNEL);
if (sap == NULL)
return -ENOMEM;
@@ -506,10 +506,10 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server,
continue;
salen = nfs_parse_server_name(buf->data, buf->len,
- sap, addr_bufsize, net, 0);
+ sap, sizeof(*sap), net, 0);
if (salen == 0)
continue;
- rpc_set_port(sap, NFS_PORT);
+ rpc_set_port((struct sockaddr *)sap, NFS_PORT);
error = -ENOMEM;
hostname = kmemdup_nul(buf->data, buf->len, GFP_KERNEL);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 3ed14a2a84a4..40d749f29ed3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -122,6 +122,11 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0)
return NULL;
+ label->lfs = 0;
+ label->pi = 0;
+ label->len = 0;
+ label->label = NULL;
+
err = security_dentry_init_security(dentry, sattr->ia_mode,
&dentry->d_name, NULL,
(void **)&label->label, &label->len);
@@ -416,8 +421,8 @@ static int nfs4_delay_killable(long *timeout)
{
might_sleep();
- freezable_schedule_timeout_killable_unsafe(
- nfs4_update_delay(timeout));
+ __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
+ schedule_timeout(nfs4_update_delay(timeout));
if (!__fatal_signal_pending(current))
return 0;
return -EINTR;
@@ -427,7 +432,8 @@ static int nfs4_delay_interruptible(long *timeout)
{
might_sleep();
- freezable_schedule_timeout_interruptible_unsafe(nfs4_update_delay(timeout));
+ __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE_UNSAFE);
+ schedule_timeout(nfs4_update_delay(timeout));
if (!signal_pending(current))
return 0;
return __fatal_signal_pending(current) ? -EINTR :-ERESTARTSYS;
@@ -2125,18 +2131,18 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
}
static int nfs4_open_recover_helper(struct nfs4_opendata *opendata,
- fmode_t fmode)
+ fmode_t fmode)
{
struct nfs4_state *newstate;
+ struct nfs_server *server = NFS_SB(opendata->dentry->d_sb);
+ int openflags = opendata->o_arg.open_flags;
int ret;
if (!nfs4_mode_match_open_stateid(opendata->state, fmode))
return 0;
- opendata->o_arg.open_flags = 0;
opendata->o_arg.fmode = fmode;
- opendata->o_arg.share_access = nfs4_map_atomic_open_share(
- NFS_SB(opendata->dentry->d_sb),
- fmode, 0);
+ opendata->o_arg.share_access =
+ nfs4_map_atomic_open_share(server, fmode, openflags);
memset(&opendata->o_res, 0, sizeof(opendata->o_res));
memset(&opendata->c_res, 0, sizeof(opendata->c_res));
nfs4_init_opendata_res(opendata);
@@ -2624,8 +2630,7 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
*/
static int nfs4_opendata_access(const struct cred *cred,
struct nfs4_opendata *opendata,
- struct nfs4_state *state, fmode_t fmode,
- int openflags)
+ struct nfs4_state *state, fmode_t fmode)
{
struct nfs_access_entry cache;
u32 mask, flags;
@@ -2636,11 +2641,7 @@ static int nfs4_opendata_access(const struct cred *cred,
return 0;
mask = 0;
- /*
- * Use openflags to check for exec, because fmode won't
- * always have FMODE_EXEC set when file open for exec.
- */
- if (openflags & __FMODE_EXEC) {
+ if (fmode & FMODE_EXEC) {
/* ONLY check for exec rights */
if (S_ISDIR(state->inode->i_mode))
mask = NFS4_ACCESS_LOOKUP;
@@ -2718,10 +2719,15 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s
struct nfs4_opendata *opendata;
int ret;
- opendata = nfs4_open_recoverdata_alloc(ctx, state,
- NFS4_OPEN_CLAIM_FH);
+ opendata = nfs4_open_recoverdata_alloc(ctx, state, NFS4_OPEN_CLAIM_FH);
if (IS_ERR(opendata))
return PTR_ERR(opendata);
+ /*
+ * We're not recovering a delegation, so ask for no delegation.
+ * Otherwise the recovery thread could deadlock with an outstanding
+ * delegation return.
+ */
+ opendata->o_arg.open_flags = O_DIRECT;
ret = nfs4_open_recover(opendata, state);
if (ret == -ESTALE)
d_drop(ctx->dentry);
@@ -3023,7 +3029,7 @@ static unsigned nfs4_exclusive_attrset(struct nfs4_opendata *opendata,
}
static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
- int flags, struct nfs_open_context *ctx)
+ struct nfs_open_context *ctx)
{
struct nfs4_state_owner *sp = opendata->owner;
struct nfs_server *server = sp->so_server;
@@ -3084,8 +3090,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
/* Parse layoutget results before we check for access */
pnfs_parse_lgopen(state->inode, opendata->lgp, ctx);
- ret = nfs4_opendata_access(sp->so_cred, opendata, state,
- acc_mode, flags);
+ ret = nfs4_opendata_access(sp->so_cred, opendata, state, acc_mode);
if (ret != 0)
goto out;
@@ -3159,7 +3164,7 @@ static int _nfs4_do_open(struct inode *dir,
if (d_really_is_positive(dentry))
opendata->state = nfs4_get_open_state(d_inode(dentry), sp);
- status = _nfs4_open_and_get_state(opendata, flags, ctx);
+ status = _nfs4_open_and_get_state(opendata, ctx);
if (status != 0)
goto err_opendata_put;
state = ctx->state;
@@ -3795,7 +3800,7 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx,
int open_flags, struct iattr *attr, int *opened)
{
struct nfs4_state *state;
- struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL;
+ struct nfs4_label l, *label;
label = nfs4_label_init_security(dir, ctx->dentry, attr, &l);
@@ -3950,7 +3955,7 @@ static void test_fs_location_for_trunking(struct nfs4_fs_location *location,
for (i = 0; i < location->nservers; i++) {
struct nfs4_string *srv_loc = &location->servers[i];
- struct sockaddr addr;
+ struct sockaddr_storage addr;
size_t addrlen;
struct xprt_create xprt_args = {
.ident = 0,
@@ -3973,7 +3978,7 @@ static void test_fs_location_for_trunking(struct nfs4_fs_location *location,
clp->cl_net, server->port);
if (!addrlen)
return;
- xprt_args.dstaddr = &addr;
+ xprt_args.dstaddr = (struct sockaddr *)&addr;
xprt_args.addrlen = addrlen;
servername = kmalloc(srv_loc->len + 1, GFP_KERNEL);
if (!servername)
@@ -4012,7 +4017,7 @@ static int _nfs4_discover_trunking(struct nfs_server *server,
page = alloc_page(GFP_KERNEL);
if (!page)
- return -ENOMEM;
+ goto out_put_cred;
locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
if (!locations)
goto out_free;
@@ -4034,6 +4039,8 @@ out_free_2:
kfree(locations);
out_free:
__free_page(page);
+out_put_cred:
+ put_cred(cred);
return status;
}
@@ -4681,7 +4688,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
int flags)
{
struct nfs_server *server = NFS_SERVER(dir);
- struct nfs4_label l, *ilabel = NULL;
+ struct nfs4_label l, *ilabel;
struct nfs_open_context *ctx;
struct nfs4_state *state;
int status = 0;
@@ -5032,7 +5039,7 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
struct nfs4_exception exception = {
.interruptible = true,
};
- struct nfs4_label l, *label = NULL;
+ struct nfs4_label l, *label;
int err;
label = nfs4_label_init_security(dir, dentry, sattr, &l);
@@ -5073,7 +5080,7 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
struct nfs4_exception exception = {
.interruptible = true,
};
- struct nfs4_label l, *label = NULL;
+ struct nfs4_label l, *label;
int err;
label = nfs4_label_init_security(dir, dentry, sattr, &l);
@@ -5192,7 +5199,7 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
struct nfs4_exception exception = {
.interruptible = true,
};
- struct nfs4_label l, *label = NULL;
+ struct nfs4_label l, *label;
int err;
label = nfs4_label_init_security(dir, dentry, sattr, &l);
@@ -6607,7 +6614,7 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
struct nfs4_delegreturndata *d_data;
struct pnfs_layout_hdr *lo;
- d_data = (struct nfs4_delegreturndata *)data;
+ d_data = data;
if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task)) {
nfs4_sequence_done(task, &d_data->res.seq_res);
@@ -7016,12 +7023,13 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
mutex_unlock(&sp->so_delegreturn_mutex);
goto out;
}
+ lsp = request->fl_u.nfs4_fl.owner;
+ set_bit(NFS_LOCK_UNLOCKING, &lsp->ls_flags);
up_read(&nfsi->rwsem);
mutex_unlock(&sp->so_delegreturn_mutex);
if (status != 0)
goto out;
/* Is this a delegated lock? */
- lsp = request->fl_u.nfs4_fl.owner;
if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0)
goto out;
alloc_seqid = NFS_SERVER(inode)->nfs_client->cl_mvops->alloc_seqid;
@@ -7137,6 +7145,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
{
struct nfs4_lockdata *data = calldata;
struct nfs4_lock_state *lsp = data->lsp;
+ struct nfs_server *server = NFS_SERVER(d_inode(data->ctx->dentry));
if (!nfs4_sequence_done(task, &data->res.seq_res))
return;
@@ -7144,8 +7153,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
data->rpc_status = task->tk_status;
switch (task->tk_status) {
case 0:
- renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)),
- data->timestamp);
+ renew_lease(server, data->timestamp);
if (data->arg.new_lock && !data->cancelled) {
data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0)
@@ -7166,6 +7174,8 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
if (!nfs4_stateid_match(&data->arg.open_stateid,
&lsp->ls_state->open_stateid))
goto out_restart;
+ else if (nfs4_async_handle_error(task, server, lsp->ls_state, NULL) == -EAGAIN)
+ goto out_restart;
} else if (!nfs4_stateid_match(&data->arg.lock_stateid,
&lsp->ls_stateid))
goto out_restart;
@@ -7406,7 +7416,8 @@ nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd,
status = nfs4_proc_setlk(state, cmd, request);
if ((status != -EAGAIN) || IS_SETLK(cmd))
break;
- freezable_schedule_timeout_interruptible(timeout);
+ __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+ schedule_timeout(timeout);
timeout *= 2;
timeout = min_t(unsigned long, NFS4_LOCK_MAXTIMEOUT, timeout);
status = -ERESTARTSYS;
@@ -7474,10 +7485,8 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
break;
status = -ERESTARTSYS;
- freezer_do_not_count();
- wait_woken(&waiter.wait, TASK_INTERRUPTIBLE,
+ wait_woken(&waiter.wait, TASK_INTERRUPTIBLE|TASK_FREEZABLE,
NFS4_LOCK_MAXTIMEOUT);
- freezer_count();
} while (!signalled());
remove_wait_queue(q, &waiter.wait);
@@ -8900,7 +8909,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cred)
void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
void *data)
{
- struct nfs4_add_xprt_data *adata = (struct nfs4_add_xprt_data *)data;
+ struct nfs4_add_xprt_data *adata = data;
struct rpc_task *task;
int status;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 9bab3e9c702a..2a0ca5c7f082 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -497,8 +497,7 @@ nfs4_alloc_state_owner(struct nfs_server *server,
sp = kzalloc(sizeof(*sp), gfp_flags);
if (!sp)
return NULL;
- sp->so_seqid.owner_id = ida_simple_get(&server->openowner_id, 0, 0,
- gfp_flags);
+ sp->so_seqid.owner_id = ida_alloc(&server->openowner_id, gfp_flags);
if (sp->so_seqid.owner_id < 0) {
kfree(sp);
return NULL;
@@ -534,7 +533,7 @@ static void nfs4_free_state_owner(struct nfs4_state_owner *sp)
{
nfs4_destroy_seqid_counter(&sp->so_seqid);
put_cred(sp->so_cred);
- ida_simple_remove(&sp->so_server->openowner_id, sp->so_seqid.owner_id);
+ ida_free(&sp->so_server->openowner_id, sp->so_seqid.owner_id);
kfree(sp);
}
@@ -877,8 +876,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
refcount_set(&lsp->ls_count, 1);
lsp->ls_state = state;
lsp->ls_owner = fl_owner;
- lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id,
- 0, 0, GFP_KERNEL_ACCOUNT);
+ lsp->ls_seqid.owner_id = ida_alloc(&server->lockowner_id, GFP_KERNEL_ACCOUNT);
if (lsp->ls_seqid.owner_id < 0)
goto out_free;
INIT_LIST_HEAD(&lsp->ls_locks);
@@ -890,7 +888,7 @@ out_free:
void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
{
- ida_simple_remove(&server->lockowner_id, lsp->ls_seqid.owner_id);
+ ida_free(&server->lockowner_id, lsp->ls_seqid.owner_id);
nfs4_destroy_seqid_counter(&lsp->ls_seqid);
kfree(lsp);
}
@@ -1232,6 +1230,8 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
if (IS_ERR(task)) {
printk(KERN_ERR "%s: kthread_run: %ld\n",
__func__, PTR_ERR(task));
+ if (!nfs_client_init_is_complete(clp))
+ nfs_mark_client_ready(clp, PTR_ERR(task));
nfs4_clear_state_manager_bit(clp);
clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
nfs_put_client(clp);
@@ -1314,7 +1314,8 @@ int nfs4_wait_clnt_recover(struct nfs_client *clp)
refcount_inc(&clp->cl_count);
res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
- nfs_wait_bit_killable, TASK_KILLABLE);
+ nfs_wait_bit_killable,
+ TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
if (res)
goto out;
if (clp->cl_cons_state < 0)
@@ -1502,7 +1503,7 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
struct file_lock *fl;
struct nfs4_lock_state *lsp;
int status = 0;
- struct file_lock_context *flctx = inode->i_flctx;
+ struct file_lock_context *flctx = locks_inode_context(inode);
struct list_head *list;
if (flctx == NULL)
@@ -1620,7 +1621,8 @@ static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_st
spin_lock(&state->state_lock);
list_for_each_entry(lock, &state->lock_states, ls_locks) {
trace_nfs4_state_lock_reclaim(state, lock);
- if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags))
+ if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags) &&
+ !test_bit(NFS_LOCK_UNLOCKING, &lock->ls_flags))
*lost_locks += 1;
}
spin_unlock(&state->state_lock);
@@ -1787,6 +1789,7 @@ static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp,
static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
{
+ set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
/* Mark all delegations for reclaim */
nfs_delegation_mark_reclaim(clp);
nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot);
@@ -2671,6 +2674,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
if (status < 0)
goto out_error;
nfs4_state_end_reclaim_reboot(clp);
+ continue;
}
/* Detect expired delegations... */
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 6ee6ad3674a2..214bc56f92d2 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -9,10 +9,10 @@
#define _TRACE_NFS4_H
#include <linux/tracepoint.h>
-#include <trace/events/sunrpc_base.h>
+#include <trace/misc/sunrpc.h>
-#include <trace/events/fs.h>
-#include <trace/events/nfs.h>
+#include <trace/misc/fs.h>
+#include <trace/misc/nfs.h>
#define show_nfs_fattr_flags(valid) \
__print_flags((unsigned long)valid, "|", \
@@ -1815,7 +1815,7 @@ TRACE_EVENT(pnfs_update_layout,
__entry->count = count;
__entry->iomode = iomode;
__entry->reason = reason;
- if (lo != NULL) {
+ if (lo != NULL && pnfs_layout_is_valid(lo)) {
__entry->layoutstateid_seq =
be32_to_cpu(lo->plh_stateid.seqid);
__entry->layoutstateid_hash =
@@ -1869,7 +1869,7 @@ DECLARE_EVENT_CLASS(pnfs_layout_event,
__entry->pos = pos;
__entry->count = count;
__entry->iomode = iomode;
- if (lo != NULL) {
+ if (lo != NULL && pnfs_layout_is_valid(lo)) {
__entry->layoutstateid_seq =
be32_to_cpu(lo->plh_stateid.seqid);
__entry->layoutstateid_hash =
@@ -2097,6 +2097,7 @@ TRACE_EVENT(ff_layout_commit_error,
)
);
+#ifdef CONFIG_NFS_V4_2
TRACE_DEFINE_ENUM(NFS4_CONTENT_DATA);
TRACE_DEFINE_ENUM(NFS4_CONTENT_HOLE);
@@ -2105,7 +2106,6 @@ TRACE_DEFINE_ENUM(NFS4_CONTENT_HOLE);
{ NFS4_CONTENT_DATA, "DATA" }, \
{ NFS4_CONTENT_HOLE, "HOLE" })
-#ifdef CONFIG_NFS_V4_2
TRACE_EVENT(nfs4_llseek,
TP_PROTO(
const struct inode *inode,
@@ -2496,6 +2496,54 @@ TRACE_EVENT(nfs4_offload_cancel,
__entry->stateid_seq, __entry->stateid_hash
)
);
+
+DECLARE_EVENT_CLASS(nfs4_xattr_event,
+ TP_PROTO(
+ const struct inode *inode,
+ const char *name,
+ int error
+ ),
+
+ TP_ARGS(inode, name, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __string(name, name)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error < 0 ? -error : 0;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __assign_str(name, name);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "name=%s",
+ -__entry->error, show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __get_str(name)
+ )
+);
+#define DEFINE_NFS4_XATTR_EVENT(name) \
+ DEFINE_EVENT(nfs4_xattr_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ const char *name, \
+ int error \
+ ), \
+ TP_ARGS(inode, name, error))
+DEFINE_NFS4_XATTR_EVENT(nfs4_getxattr);
+DEFINE_NFS4_XATTR_EVENT(nfs4_setxattr);
+DEFINE_NFS4_XATTR_EVENT(nfs4_removexattr);
+
+DEFINE_NFS4_INODE_EVENT(nfs4_listxattr);
#endif /* CONFIG_NFS_V4_2 */
#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index acfe5f4bda48..deec76cf5afe 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4234,19 +4234,17 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
p = xdr_inline_decode(xdr, len);
if (unlikely(!p))
return -EIO;
+ bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
if (len < NFS4_MAXLABELLEN) {
- if (label) {
- if (label->len) {
- if (label->len < len)
- return -ERANGE;
- memcpy(label->label, p, len);
- }
+ if (label && label->len) {
+ if (label->len < len)
+ return -ERANGE;
+ memcpy(label->label, p, len);
label->len = len;
label->pi = pi;
label->lfs = lfs;
status = NFS_ATTR_FATTR_V4_SECURITY_LABEL;
}
- bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
} else
printk(KERN_WARNING "%s: label too long (%u)!\n",
__func__, len);
@@ -4755,12 +4753,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
if (status < 0)
goto xdr_error;
- if (fattr->label) {
- status = decode_attr_security_label(xdr, bitmap, fattr->label);
- if (status < 0)
- goto xdr_error;
- fattr->valid |= status;
- }
+ status = decode_attr_security_label(xdr, bitmap, fattr->label);
+ if (status < 0)
+ goto xdr_error;
+ fattr->valid |= status;
xdr_error:
dprintk("%s: xdr returned %d\n", __func__, -status);
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index fa148308822c..620329b7e6ae 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -139,7 +139,7 @@ static int __init nfs_root_setup(char *line)
ROOT_DEV = Root_NFS;
if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) {
- strlcpy(nfs_root_parms, line, sizeof(nfs_root_parms));
+ strscpy(nfs_root_parms, line, sizeof(nfs_root_parms));
} else {
size_t n = strlen(line) + sizeof(NFS_ROOT) - 1;
if (n >= sizeof(nfs_root_parms))
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 8c6cc58679ff..642f6921852f 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -11,9 +11,9 @@
#include <linux/tracepoint.h>
#include <linux/iversion.h>
-#include <trace/events/fs.h>
-#include <trace/events/nfs.h>
-#include <trace/events/sunrpc_base.h>
+#include <trace/misc/fs.h>
+#include <trace/misc/nfs.h>
+#include <trace/misc/sunrpc.h>
#define nfs_show_cache_validity(v) \
__print_flags(v, "|", \
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 317cedfa52bf..16be6dae524f 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -1055,7 +1055,7 @@ static unsigned int nfs_coalesce_size(struct nfs_page *prev,
if (prev) {
if (!nfs_match_open_context(nfs_req_openctx(req), nfs_req_openctx(prev)))
return 0;
- flctx = d_inode(nfs_req_openctx(req)->dentry)->i_flctx;
+ flctx = locks_inode_context(d_inode(nfs_req_openctx(req)->dentry));
if (flctx != NULL &&
!(list_empty_careful(&flctx->flc_posix) &&
list_empty_careful(&flctx->flc_flock)) &&
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 2613b7e36eb9..a5db5158c634 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -710,6 +710,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
u32 seq)
{
struct pnfs_layout_segment *lseg, *next;
+ struct nfs_server *server = NFS_SERVER(lo->plh_inode);
int remaining = 0;
dprintk("%s:Begin lo %p\n", __func__, lo);
@@ -722,8 +723,10 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
"offset %llu length %llu\n", __func__,
lseg, lseg->pls_range.iomode, lseg->pls_seq,
lseg->pls_range.offset, lseg->pls_range.length);
- if (!mark_lseg_invalid(lseg, tmp_list))
- remaining++;
+ if (mark_lseg_invalid(lseg, tmp_list))
+ continue;
+ remaining++;
+ pnfs_lseg_cancel_io(server, lseg);
}
dprintk("%s:Return %i\n", __func__, remaining);
return remaining;
@@ -1908,7 +1911,7 @@ static int pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
pnfs_layoutcommit_inode(lo->plh_inode, false);
return wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
nfs_wait_bit_killable,
- TASK_KILLABLE);
+ TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
}
static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo)
@@ -2485,6 +2488,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
u32 seq)
{
struct pnfs_layout_segment *lseg, *next;
+ struct nfs_server *server = NFS_SERVER(lo->plh_inode);
int remaining = 0;
dprintk("%s:Begin lo %p\n", __func__, lo);
@@ -2507,6 +2511,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
continue;
remaining++;
set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
+ pnfs_lseg_cancel_io(server, lseg);
}
if (remaining) {
@@ -3192,7 +3197,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
status = wait_on_bit_lock_action(&nfsi->flags,
NFS_INO_LAYOUTCOMMITTING,
nfs_wait_bit_killable,
- TASK_KILLABLE);
+ TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
if (status)
goto out;
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index f331f067691b..e3e6a41f19de 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -169,6 +169,8 @@ struct pnfs_layoutdriver_type {
void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args);
int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args);
+
+ void (*cancel_io)(struct pnfs_layout_segment *lseg);
};
struct pnfs_commit_ops {
@@ -685,6 +687,13 @@ pnfs_lseg_request_intersecting(struct pnfs_layout_segment *lseg, struct nfs_page
req_offset(req), req_last);
}
+static inline void pnfs_lseg_cancel_io(struct nfs_server *server,
+ struct pnfs_layout_segment *lseg)
+{
+ if (server->pnfs_curr_ld->cancel_io)
+ server->pnfs_curr_ld->cancel_io(lseg);
+}
+
extern unsigned int layoutstats_timer;
#ifdef NFS_DEBUG
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 657c242a18ff..5d035dd2d7bf 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -374,12 +374,12 @@ pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets,
return NULL;
}
-/* pnfs_generic_search_commit_reqs - Search lists in @cinfo for the head reqest
+/* pnfs_generic_search_commit_reqs - Search lists in @cinfo for the head request
* for @page
* @cinfo - commit info for current inode
* @page - page to search for matching head request
*
- * Returns a the head request if one is found, otherwise returns NULL.
+ * Return: the head request if one is found, otherwise %NULL.
*/
struct nfs_page *
pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
@@ -821,7 +821,7 @@ static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
static struct nfs_client *(*get_v3_ds_connect)(
struct nfs_server *mds_srv,
- const struct sockaddr *ds_addr,
+ const struct sockaddr_storage *ds_addr,
int ds_addrlen,
int ds_proto,
unsigned int ds_timeo,
@@ -882,7 +882,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
continue;
}
clp = get_v3_ds_connect(mds_srv,
- (struct sockaddr *)&da->da_addr,
+ &da->da_addr,
da->da_addrlen, da->da_transport,
timeo, retrans);
if (IS_ERR(clp))
@@ -951,7 +951,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
put_cred(xprtdata.cred);
} else {
clp = nfs4_set_ds_client(mds_srv,
- (struct sockaddr *)&da->da_addr,
+ &da->da_addr,
da->da_addrlen,
da->da_transport, timeo,
retrans, minor_version);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ee66ffdb985e..05ae23657527 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -822,8 +822,7 @@ static int nfs_request_mount(struct fs_context *fc,
{
struct nfs_fs_context *ctx = nfs_fc2context(fc);
struct nfs_mount_request request = {
- .sap = (struct sockaddr *)
- &ctx->mount_server.address,
+ .sap = &ctx->mount_server._address,
.dirpath = ctx->nfs_server.export_path,
.protocol = ctx->mount_server.protocol,
.fh = root_fh,
@@ -854,7 +853,7 @@ static int nfs_request_mount(struct fs_context *fc,
* Construct the mount server's address.
*/
if (ctx->mount_server.address.sa_family == AF_UNSPEC) {
- memcpy(request.sap, &ctx->nfs_server.address,
+ memcpy(request.sap, &ctx->nfs_server._address,
ctx->nfs_server.addrlen);
ctx->mount_server.addrlen = ctx->nfs_server.addrlen;
}
diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c
index a6f740366963..0cbcd2dfa732 100644
--- a/fs/nfs/sysfs.c
+++ b/fs/nfs/sysfs.c
@@ -26,7 +26,7 @@ static void nfs_netns_object_release(struct kobject *kobj)
}
static const struct kobj_ns_type_operations *nfs_netns_object_child_ns_type(
- struct kobject *kobj)
+ const struct kobject *kobj)
{
return &net_ns_type_operations;
}
@@ -82,7 +82,7 @@ static ssize_t nfs_netns_identifier_show(struct kobject *kobj,
ssize_t ret;
rcu_read_lock();
- ret = scnprintf(buf, PAGE_SIZE, "%s\n", rcu_dereference(c->identifier));
+ ret = sysfs_emit(buf, "%s\n", rcu_dereference(c->identifier));
rcu_read_unlock();
return ret;
}
@@ -130,7 +130,7 @@ static void nfs_netns_client_release(struct kobject *kobj)
kfree(c);
}
-static const void *nfs_netns_client_namespace(struct kobject *kobj)
+static const void *nfs_netns_client_namespace(const struct kobject *kobj)
{
return container_of(kobj, struct nfs_netns_client, kobject)->net;
}
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 9697cd5d2561..150a953a8be9 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -139,6 +139,7 @@ static int nfs_call_unlink(struct dentry *dentry, struct inode *inode, struct nf
*/
spin_lock(&alias->d_lock);
if (d_really_is_positive(alias) &&
+ !nfs_compare_fh(NFS_FH(inode), NFS_FH(d_inode(alias))) &&
!(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
devname_garbage = alias->d_fsdata;
alias->d_fsdata = data;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f41d24b54fd1..80c240e50952 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1185,7 +1185,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
struct nfs_lock_context *l_ctx;
- struct file_lock_context *flctx = file_inode(file)->i_flctx;
+ struct file_lock_context *flctx = locks_inode_context(file_inode(file));
struct nfs_page *req;
int do_flush, status;
/*
@@ -1321,7 +1321,7 @@ static int nfs_can_extend_write(struct file *file, struct page *page,
struct inode *inode, unsigned int pagelen)
{
int ret;
- struct file_lock_context *flctx = inode->i_flctx;
+ struct file_lock_context *flctx = locks_inode_context(inode);
struct file_lock *fl;
if (file->f_flags & O_DSYNC)
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index f6a2fd3015e7..7c441f2bd444 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -8,6 +8,7 @@ config NFSD
select SUNRPC
select EXPORTFS
select NFS_ACL_SUPPORT if NFSD_V2_ACL
+ select NFS_ACL_SUPPORT if NFSD_V3_ACL
depends on MULTIUSER
help
Choose Y here if you want to allow other computers to access
@@ -26,19 +27,29 @@ config NFSD
Below you can choose which versions of the NFS protocol are
available to clients mounting the NFS server on this system.
- Support for NFS version 2 (RFC 1094) is always available when
+ Support for NFS version 3 (RFC 1813) is always available when
CONFIG_NFSD is selected.
If unsure, say N.
-config NFSD_V2_ACL
- bool
+config NFSD_V2
+ bool "NFS server support for NFS version 2 (DEPRECATED)"
depends on NFSD
+ default n
+ help
+ NFSv2 (RFC 1094) was the first publicly-released version of NFS.
+ Unless you are hosting ancient (1990's era) NFS clients, you don't
+ need this.
+
+ If unsure, say N.
+
+config NFSD_V2_ACL
+ bool "NFS server support for the NFSv2 ACL protocol extension"
+ depends on NFSD_V2
config NFSD_V3_ACL
bool "NFS server support for the NFSv3 ACL protocol extension"
depends on NFSD
- select NFSD_V2_ACL
help
Solaris NFS servers support an auxiliary NFSv3 ACL protocol that
never became an official part of the NFS version 3 protocol.
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 805c06d5f1b4..6fffc8f03f74 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -10,9 +10,10 @@ obj-$(CONFIG_NFSD) += nfsd.o
# this one should be compiled first, as the tracing macros can easily blow up
nfsd-y += trace.o
-nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
- export.o auth.o lockd.o nfscache.o nfsxdr.o \
+nfsd-y += nfssvc.o nfsctl.o nfsfh.o vfs.o \
+ export.o auth.o lockd.o nfscache.o \
stats.o filecache.o nfs3proc.o nfs3xdr.o
+nfsd-$(CONFIG_NFSD_V2) += nfsproc.o nfsxdr.o
nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index b6d01d51a746..04697f8dc37d 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -12,6 +12,7 @@
#include "blocklayoutxdr.h"
#include "pnfs.h"
#include "filecache.h"
+#include "vfs.h"
#define NFSDDBG_FACILITY NFSDDBG_PNFS
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 442543304930..8e9c1a0f8d38 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -9,6 +9,7 @@
#include "nfsd.h"
#include "blocklayoutxdr.h"
+#include "vfs.h"
#define NFSDDBG_FACILITY NFSDDBG_PNFS
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index 65c331f75e9c..f21259ead64b 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -84,6 +84,6 @@ int nfsd_reply_cache_init(struct nfsd_net *);
void nfsd_reply_cache_shutdown(struct nfsd_net *);
int nfsd_cache_lookup(struct svc_rqst *);
void nfsd_cache_update(struct svc_rqst *, int, __be32 *);
-int nfsd_reply_cache_stats_open(struct inode *, struct file *);
+int nfsd_reply_cache_stats_show(struct seq_file *m, void *v);
#endif /* NFSCACHE_H */
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index ee0e3aba4a6e..d03f7f6a8642 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -115,7 +115,6 @@ struct svc_export * rqst_find_fsidzero_export(struct svc_rqst *);
int exp_rootfh(struct net *, struct auth_domain *,
char *path, struct knfsd_fh *, int maxsize);
__be32 exp_pseudoroot(struct svc_rqst *, struct svc_fh *);
-__be32 nfserrno(int errno);
static inline void exp_put(struct svc_export *exp)
{
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index eeed4ae5b4ad..45b2c9e3f636 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -1,7 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * Open file cache.
+ * The NFSD open file cache.
*
* (c) 2015 - Jeff Layton <jeff.layton@primarydata.com>
+ *
+ * An nfsd_file object is a per-file collection of open state that binds
+ * together:
+ * - a struct file *
+ * - a user credential
+ * - a network namespace
+ * - a read-ahead context
+ * - monitoring for writeback errors
+ *
+ * nfsd_file objects are reference-counted. Consumers acquire a new
+ * object via the nfsd_file_acquire API. They manage their interest in
+ * the acquired object, and hence the object's reference count, via
+ * nfsd_file_get and nfsd_file_put. There are two varieties of nfsd_file
+ * object:
+ *
+ * * non-garbage-collected: When a consumer wants to precisely control
+ * the lifetime of a file's open state, it acquires a non-garbage-
+ * collected nfsd_file. The final nfsd_file_put releases the open
+ * state immediately.
+ *
+ * * garbage-collected: When a consumer does not control the lifetime
+ * of open state, it acquires a garbage-collected nfsd_file. The
+ * final nfsd_file_put allows the open state to linger for a period
+ * during which it may be re-used.
*/
#include <linux/hash.h>
@@ -33,7 +58,6 @@ static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions);
static DEFINE_PER_CPU(unsigned long, nfsd_file_releases);
static DEFINE_PER_CPU(unsigned long, nfsd_file_total_age);
-static DEFINE_PER_CPU(unsigned long, nfsd_file_pages_flushed);
static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions);
struct nfsd_fcache_disposal {
@@ -63,6 +87,7 @@ struct nfsd_file_lookup_key {
struct net *net;
const struct cred *cred;
unsigned char need;
+ bool gc;
enum nfsd_file_lookup_type type;
};
@@ -162,6 +187,8 @@ static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg,
return 1;
if (!nfsd_match_cred(nf->nf_cred, key->cred))
return 1;
+ if (!!test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc)
+ return 1;
if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0)
return 1;
break;
@@ -184,12 +211,9 @@ static const struct rhashtable_params nfsd_file_rhash_params = {
static void
nfsd_file_schedule_laundrette(void)
{
- if ((atomic_read(&nfsd_file_rhash_tbl.nelems) == 0) ||
- test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 0)
- return;
-
- queue_delayed_work(system_wq, &nfsd_filecache_laundrette,
- NFSD_LAUNDRETTE_DELAY);
+ if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags))
+ queue_delayed_work(system_wq, &nfsd_filecache_laundrette,
+ NFSD_LAUNDRETTE_DELAY);
}
static void
@@ -297,56 +321,28 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
nf->nf_flags = 0;
__set_bit(NFSD_FILE_HASHED, &nf->nf_flags);
__set_bit(NFSD_FILE_PENDING, &nf->nf_flags);
+ if (key->gc)
+ __set_bit(NFSD_FILE_GC, &nf->nf_flags);
nf->nf_inode = key->inode;
- /* nf_ref is pre-incremented for hash table */
- refcount_set(&nf->nf_ref, 2);
+ refcount_set(&nf->nf_ref, 1);
nf->nf_may = key->need;
nf->nf_mark = NULL;
}
return nf;
}
-static bool
-nfsd_file_free(struct nfsd_file *nf)
-{
- s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime));
- bool flush = false;
-
- this_cpu_inc(nfsd_file_releases);
- this_cpu_add(nfsd_file_total_age, age);
-
- trace_nfsd_file_put_final(nf);
- if (nf->nf_mark)
- nfsd_file_mark_put(nf->nf_mark);
- if (nf->nf_file) {
- get_file(nf->nf_file);
- filp_close(nf->nf_file, NULL);
- fput(nf->nf_file);
- flush = true;
- }
-
- /*
- * If this item is still linked via nf_lru, that's a bug.
- * WARN and leak it to preserve system stability.
- */
- if (WARN_ON_ONCE(!list_empty(&nf->nf_lru)))
- return flush;
-
- call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
- return flush;
-}
-
-static bool
-nfsd_file_check_writeback(struct nfsd_file *nf)
+static void
+nfsd_file_fsync(struct nfsd_file *nf)
{
struct file *file = nf->nf_file;
- struct address_space *mapping;
+ int ret;
if (!file || !(file->f_mode & FMODE_WRITE))
- return false;
- mapping = file->f_mapping;
- return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) ||
- mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
+ return;
+ ret = vfs_fsync(file, 1);
+ trace_nfsd_file_fsync(nf, ret);
+ if (ret)
+ nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
}
static int
@@ -360,31 +356,6 @@ nfsd_file_check_write_error(struct nfsd_file *nf)
}
static void
-nfsd_file_flush(struct nfsd_file *nf)
-{
- struct file *file = nf->nf_file;
-
- if (!file || !(file->f_mode & FMODE_WRITE))
- return;
- this_cpu_add(nfsd_file_pages_flushed, file->f_mapping->nrpages);
- if (vfs_fsync(file, 1) != 0)
- nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
-}
-
-static void nfsd_file_lru_add(struct nfsd_file *nf)
-{
- set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
- if (list_lru_add(&nfsd_file_lru, &nf->nf_lru))
- trace_nfsd_file_lru_add(nf);
-}
-
-static void nfsd_file_lru_remove(struct nfsd_file *nf)
-{
- if (list_lru_del(&nfsd_file_lru, &nf->nf_lru))
- trace_nfsd_file_lru_del(nf);
-}
-
-static void
nfsd_file_hash_remove(struct nfsd_file *nf)
{
trace_nfsd_file_unhash(nf);
@@ -405,68 +376,77 @@ nfsd_file_unhash(struct nfsd_file *nf)
return false;
}
-/*
- * Return true if the file was unhashed.
- */
-static bool
-nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
+static void
+nfsd_file_free(struct nfsd_file *nf)
{
- trace_nfsd_file_unhash_and_dispose(nf);
- if (!nfsd_file_unhash(nf))
- return false;
- /* keep final reference for nfsd_file_lru_dispose */
- if (refcount_dec_not_one(&nf->nf_ref))
- return true;
+ s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime));
- nfsd_file_lru_remove(nf);
- list_add(&nf->nf_lru, dispose);
- return true;
-}
+ trace_nfsd_file_free(nf);
-static void
-nfsd_file_put_noref(struct nfsd_file *nf)
-{
- trace_nfsd_file_put(nf);
+ this_cpu_inc(nfsd_file_releases);
+ this_cpu_add(nfsd_file_total_age, age);
- if (refcount_dec_and_test(&nf->nf_ref)) {
- WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
- nfsd_file_lru_remove(nf);
- nfsd_file_free(nf);
+ nfsd_file_unhash(nf);
+
+ /*
+ * We call fsync here in order to catch writeback errors. It's not
+ * strictly required by the protocol, but an nfsd_file could get
+ * evicted from the cache before a COMMIT comes in. If another
+ * task were to open that file in the interim and scrape the error,
+ * then the client may never see it. By calling fsync here, we ensure
+ * that writeback happens before the entry is freed, and that any
+ * errors reported result in the write verifier changing.
+ */
+ nfsd_file_fsync(nf);
+
+ if (nf->nf_mark)
+ nfsd_file_mark_put(nf->nf_mark);
+ if (nf->nf_file) {
+ get_file(nf->nf_file);
+ filp_close(nf->nf_file, NULL);
+ fput(nf->nf_file);
}
+
+ /*
+ * If this item is still linked via nf_lru, that's a bug.
+ * WARN and leak it to preserve system stability.
+ */
+ if (WARN_ON_ONCE(!list_empty(&nf->nf_lru)))
+ return;
+
+ call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
}
-void
-nfsd_file_put(struct nfsd_file *nf)
+static bool
+nfsd_file_check_writeback(struct nfsd_file *nf)
{
- might_sleep();
+ struct file *file = nf->nf_file;
+ struct address_space *mapping;
- nfsd_file_lru_add(nf);
- if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) {
- nfsd_file_flush(nf);
- nfsd_file_put_noref(nf);
- } else if (nf->nf_file) {
- nfsd_file_put_noref(nf);
- nfsd_file_schedule_laundrette();
- } else
- nfsd_file_put_noref(nf);
+ if (!file || !(file->f_mode & FMODE_WRITE))
+ return false;
+ mapping = file->f_mapping;
+ return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) ||
+ mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
}
-/**
- * nfsd_file_close - Close an nfsd_file
- * @nf: nfsd_file to close
- *
- * If this is the final reference for @nf, free it immediately.
- * This reflects an on-the-wire CLOSE or DELEGRETURN into the
- * VFS and exported filesystem.
- */
-void nfsd_file_close(struct nfsd_file *nf)
+static bool nfsd_file_lru_add(struct nfsd_file *nf)
{
- nfsd_file_put(nf);
- if (refcount_dec_if_one(&nf->nf_ref)) {
- nfsd_file_unhash(nf);
- nfsd_file_lru_remove(nf);
- nfsd_file_free(nf);
+ set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
+ if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
+ trace_nfsd_file_lru_add(nf);
+ return true;
+ }
+ return false;
+}
+
+static bool nfsd_file_lru_remove(struct nfsd_file *nf)
+{
+ if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
+ trace_nfsd_file_lru_del(nf);
+ return true;
}
+ return false;
}
struct nfsd_file *
@@ -477,36 +457,60 @@ nfsd_file_get(struct nfsd_file *nf)
return NULL;
}
-static void
-nfsd_file_dispose_list(struct list_head *dispose)
+/**
+ * nfsd_file_put - put the reference to a nfsd_file
+ * @nf: nfsd_file of which to put the reference
+ *
+ * Put a reference to a nfsd_file. In the non-GC case, we just put the
+ * reference immediately. In the GC case, if the reference would be
+ * the last one, the put it on the LRU instead to be cleaned up later.
+ */
+void
+nfsd_file_put(struct nfsd_file *nf)
{
- struct nfsd_file *nf;
+ might_sleep();
+ trace_nfsd_file_put(nf);
- while(!list_empty(dispose)) {
- nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
- list_del_init(&nf->nf_lru);
- nfsd_file_flush(nf);
- nfsd_file_put_noref(nf);
+ if (test_bit(NFSD_FILE_GC, &nf->nf_flags) &&
+ test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+ /*
+ * If this is the last reference (nf_ref == 1), then try to
+ * transfer it to the LRU.
+ */
+ if (refcount_dec_not_one(&nf->nf_ref))
+ return;
+
+ /* Try to add it to the LRU. If that fails, decrement. */
+ if (nfsd_file_lru_add(nf)) {
+ /* If it's still hashed, we're done */
+ if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+ nfsd_file_schedule_laundrette();
+ return;
+ }
+
+ /*
+ * We're racing with unhashing, so try to remove it from
+ * the LRU. If removal fails, then someone else already
+ * has our reference.
+ */
+ if (!nfsd_file_lru_remove(nf))
+ return;
+ }
}
+ if (refcount_dec_and_test(&nf->nf_ref))
+ nfsd_file_free(nf);
}
static void
-nfsd_file_dispose_list_sync(struct list_head *dispose)
+nfsd_file_dispose_list(struct list_head *dispose)
{
- bool flush = false;
struct nfsd_file *nf;
- while(!list_empty(dispose)) {
+ while (!list_empty(dispose)) {
nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
list_del_init(&nf->nf_lru);
- nfsd_file_flush(nf);
- if (!refcount_dec_and_test(&nf->nf_ref))
- continue;
- if (nfsd_file_free(nf))
- flush = true;
+ nfsd_file_free(nf);
}
- if (flush)
- flush_delayed_fput();
}
static void
@@ -562,8 +566,6 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose)
* @lock: LRU list lock (unused)
* @arg: dispose list
*
- * Note this can deadlock with nfsd_file_cache_purge.
- *
* Return values:
* %LRU_REMOVED: @item was removed from the LRU
* %LRU_ROTATE: @item is to be moved to the LRU tail
@@ -578,21 +580,8 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
struct list_head *head = arg;
struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
- /*
- * Do a lockless refcount check. The hashtable holds one reference, so
- * we look to see if anything else has a reference, or if any have
- * been put since the shrinker last ran. Those don't get unhashed and
- * released.
- *
- * Note that in the put path, we set the flag and then decrement the
- * counter. Here we check the counter and then test and clear the flag.
- * That order is deliberate to ensure that we can do this locklessly.
- */
- if (refcount_read(&nf->nf_ref) > 1) {
- list_lru_isolate(lru, &nf->nf_lru);
- trace_nfsd_file_gc_in_use(nf);
- return LRU_REMOVED;
- }
+ /* We should only be dealing with GC entries here */
+ WARN_ON_ONCE(!test_bit(NFSD_FILE_GC, &nf->nf_flags));
/*
* Don't throw out files that are still undergoing I/O or
@@ -603,40 +592,30 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
return LRU_SKIP;
}
+ /* If it was recently added to the list, skip it */
if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) {
trace_nfsd_file_gc_referenced(nf);
return LRU_ROTATE;
}
- if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
- trace_nfsd_file_gc_hashed(nf);
- return LRU_SKIP;
+ /*
+ * Put the reference held on behalf of the LRU. If it wasn't the last
+ * one, then just remove it from the LRU and ignore it.
+ */
+ if (!refcount_dec_and_test(&nf->nf_ref)) {
+ trace_nfsd_file_gc_in_use(nf);
+ list_lru_isolate(lru, &nf->nf_lru);
+ return LRU_REMOVED;
}
+ /* Refcount went to zero. Unhash it and queue it to the dispose list */
+ nfsd_file_unhash(nf);
list_lru_isolate_move(lru, &nf->nf_lru, head);
this_cpu_inc(nfsd_file_evictions);
trace_nfsd_file_gc_disposed(nf);
return LRU_REMOVED;
}
-/*
- * Unhash items on @dispose immediately, then queue them on the
- * disposal workqueue to finish releasing them in the background.
- *
- * cel: Note that between the time list_lru_shrink_walk runs and
- * now, these items are in the hash table but marked unhashed.
- * Why release these outside of lru_cb ? There's no lock ordering
- * problem since lru_cb currently takes no lock.
- */
-static void nfsd_file_gc_dispose_list(struct list_head *dispose)
-{
- struct nfsd_file *nf;
-
- list_for_each_entry(nf, dispose, nf_lru)
- nfsd_file_hash_remove(nf);
- nfsd_file_dispose_list_delayed(dispose);
-}
-
static void
nfsd_file_gc(void)
{
@@ -646,14 +625,15 @@ nfsd_file_gc(void)
ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb,
&dispose, list_lru_count(&nfsd_file_lru));
trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
- nfsd_file_gc_dispose_list(&dispose);
+ nfsd_file_dispose_list_delayed(&dispose);
}
static void
nfsd_file_gc_worker(struct work_struct *work)
{
nfsd_file_gc();
- nfsd_file_schedule_laundrette();
+ if (list_lru_count(&nfsd_file_lru))
+ nfsd_file_schedule_laundrette();
}
static unsigned long
@@ -671,7 +651,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
nfsd_file_lru_cb, &dispose);
trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
- nfsd_file_gc_dispose_list(&dispose);
+ nfsd_file_dispose_list_delayed(&dispose);
return ret;
}
@@ -681,75 +661,112 @@ static struct shrinker nfsd_file_shrinker = {
.seeks = 1,
};
-/*
- * Find all cache items across all net namespaces that match @inode and
- * move them to @dispose. The lookup is atomic wrt nfsd_file_acquire().
+/**
+ * nfsd_file_queue_for_close: try to close out any open nfsd_files for an inode
+ * @inode: inode on which to close out nfsd_files
+ * @dispose: list on which to gather nfsd_files to close out
+ *
+ * An nfsd_file represents a struct file being held open on behalf of nfsd. An
+ * open file however can block other activity (such as leases), or cause
+ * undesirable behavior (e.g. spurious silly-renames when reexporting NFS).
+ *
+ * This function is intended to find open nfsd_files when this sort of
+ * conflicting access occurs and then attempt to close those files out.
+ *
+ * Populates the dispose list with entries that have already had their
+ * refcounts go to zero. The actual free of an nfsd_file can be expensive,
+ * so we leave it up to the caller whether it wants to wait or not.
*/
-static unsigned int
-__nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
+static void
+nfsd_file_queue_for_close(struct inode *inode, struct list_head *dispose)
{
struct nfsd_file_lookup_key key = {
.type = NFSD_FILE_KEY_INODE,
.inode = inode,
};
- unsigned int count = 0;
struct nfsd_file *nf;
rcu_read_lock();
do {
+ int decrement = 1;
+
nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key,
nfsd_file_rhash_params);
if (!nf)
break;
- nfsd_file_unhash_and_dispose(nf, dispose);
- count++;
+
+ /* If we raced with someone else unhashing, ignore it */
+ if (!nfsd_file_unhash(nf))
+ continue;
+
+ /* If we can't get a reference, ignore it */
+ if (!nfsd_file_get(nf))
+ continue;
+
+ /* Extra decrement if we remove from the LRU */
+ if (nfsd_file_lru_remove(nf))
+ ++decrement;
+
+ /* If refcount goes to 0, then put on the dispose list */
+ if (refcount_sub_and_test(decrement, &nf->nf_ref)) {
+ list_add(&nf->nf_lru, dispose);
+ trace_nfsd_file_closing(nf);
+ }
} while (1);
rcu_read_unlock();
- return count;
}
/**
- * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
+ * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
* @inode: inode of the file to attempt to remove
*
- * Unhash and put, then flush and fput all cache items associated with @inode.
+ * Close out any open nfsd_files that can be reaped for @inode. The
+ * actual freeing is deferred to the dispose_list_delayed infrastructure.
+ *
+ * This is used by the fsnotify callbacks and setlease notifier.
*/
-void
-nfsd_file_close_inode_sync(struct inode *inode)
+static void
+nfsd_file_close_inode(struct inode *inode)
{
LIST_HEAD(dispose);
- unsigned int count;
- count = __nfsd_file_close_inode(inode, &dispose);
- trace_nfsd_file_close_inode_sync(inode, count);
- nfsd_file_dispose_list_sync(&dispose);
+ nfsd_file_queue_for_close(inode, &dispose);
+ nfsd_file_dispose_list_delayed(&dispose);
}
/**
- * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
+ * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
* @inode: inode of the file to attempt to remove
*
- * Unhash and put all cache item associated with @inode.
+ * Close out any open nfsd_files that can be reaped for @inode. The
+ * nfsd_files are closed out synchronously.
+ *
+ * This is called from nfsd_rename and nfsd_unlink to avoid silly-renames
+ * when reexporting NFS.
*/
-static void
-nfsd_file_close_inode(struct inode *inode)
+void
+nfsd_file_close_inode_sync(struct inode *inode)
{
+ struct nfsd_file *nf;
LIST_HEAD(dispose);
- unsigned int count;
- count = __nfsd_file_close_inode(inode, &dispose);
- trace_nfsd_file_close_inode(inode, count);
- nfsd_file_dispose_list_delayed(&dispose);
+ trace_nfsd_file_close(inode);
+
+ nfsd_file_queue_for_close(inode, &dispose);
+ while (!list_empty(&dispose)) {
+ nf = list_first_entry(&dispose, struct nfsd_file, nf_lru);
+ list_del_init(&nf->nf_lru);
+ nfsd_file_free(nf);
+ }
+ flush_delayed_fput();
}
/**
* nfsd_file_delayed_close - close unused nfsd_files
* @work: dummy
*
- * Walk the LRU list and close any entries that have not been used since
+ * Walk the LRU list and destroy any entries that have not been used since
* the last scan.
- *
- * Note this can deadlock with nfsd_file_cache_purge.
*/
static void
nfsd_file_delayed_close(struct work_struct *work)
@@ -770,7 +787,7 @@ nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg,
/* Only close files for F_SETLEASE leases */
if (fl->fl_flags & FL_LEASE)
- nfsd_file_close_inode_sync(file_inode(fl->fl_file));
+ nfsd_file_close_inode(file_inode(fl->fl_file));
return 0;
}
@@ -891,8 +908,12 @@ out_err:
goto out;
}
-/*
- * Note this can deadlock with nfsd_file_lru_cb.
+/**
+ * __nfsd_file_cache_purge: clean out the cache for shutdown
+ * @net: net-namespace to shut down the cache (may be NULL)
+ *
+ * Walk the nfsd_file cache and close out any that match @net. If @net is NULL,
+ * then close out everything. Called when an nfsd instance is being shut down.
*/
static void
__nfsd_file_cache_purge(struct net *net)
@@ -900,7 +921,6 @@ __nfsd_file_cache_purge(struct net *net)
struct rhashtable_iter iter;
struct nfsd_file *nf;
LIST_HEAD(dispose);
- bool del;
rhashtable_walk_enter(&nfsd_file_rhash_tbl, &iter);
do {
@@ -908,16 +928,11 @@ __nfsd_file_cache_purge(struct net *net)
nf = rhashtable_walk_next(&iter);
while (!IS_ERR_OR_NULL(nf)) {
- if (net && nf->nf_net != net)
- continue;
- del = nfsd_file_unhash_and_dispose(nf, &dispose);
-
- /*
- * Deadlock detected! Something marked this entry as
- * unhased, but hasn't removed it from the hash list.
- */
- WARN_ON_ONCE(!del);
-
+ if (!net || nf->nf_net == net) {
+ nfsd_file_unhash(nf);
+ nfsd_file_lru_remove(nf);
+ list_add(&nf->nf_lru, &dispose);
+ }
nf = rhashtable_walk_next(&iter);
}
@@ -1023,7 +1038,6 @@ nfsd_file_cache_shutdown(void)
per_cpu(nfsd_file_acquisitions, i) = 0;
per_cpu(nfsd_file_releases, i) = 0;
per_cpu(nfsd_file_total_age, i) = 0;
- per_cpu(nfsd_file_pages_flushed, i) = 0;
per_cpu(nfsd_file_evictions, i) = 0;
}
}
@@ -1057,16 +1071,19 @@ nfsd_file_is_cached(struct inode *inode)
static __be32
nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
- unsigned int may_flags, struct nfsd_file **pnf, bool open)
+ unsigned int may_flags, struct nfsd_file **pnf,
+ bool open, bool want_gc)
{
struct nfsd_file_lookup_key key = {
.type = NFSD_FILE_KEY_FULL,
.need = may_flags & NFSD_FILE_MAY_MASK,
.net = SVC_NET(rqstp),
+ .gc = want_gc,
};
- struct nfsd_file *nf, *new;
- bool retry = true;
+ bool open_retry = true;
+ struct nfsd_file *nf;
__be32 status;
+ int ret;
status = fh_verify(rqstp, fhp, S_IFREG,
may_flags|NFSD_MAY_OWNER_OVERRIDE);
@@ -1076,35 +1093,38 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
key.cred = get_current_cred();
retry:
- /* Avoid allocation if the item is already in cache */
- nf = rhashtable_lookup_fast(&nfsd_file_rhash_tbl, &key,
- nfsd_file_rhash_params);
+ rcu_read_lock();
+ nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key,
+ nfsd_file_rhash_params);
if (nf)
nf = nfsd_file_get(nf);
- if (nf)
+ rcu_read_unlock();
+
+ if (nf) {
+ if (nfsd_file_lru_remove(nf))
+ WARN_ON_ONCE(refcount_dec_and_test(&nf->nf_ref));
goto wait_for_construction;
+ }
- new = nfsd_file_alloc(&key, may_flags);
- if (!new) {
+ nf = nfsd_file_alloc(&key, may_flags);
+ if (!nf) {
status = nfserr_jukebox;
goto out_status;
}
- nf = rhashtable_lookup_get_insert_key(&nfsd_file_rhash_tbl,
- &key, &new->nf_rhash,
- nfsd_file_rhash_params);
- if (!nf) {
- nf = new;
- goto open_file;
- }
- if (IS_ERR(nf))
- goto insert_err;
- nf = nfsd_file_get(nf);
- if (nf == NULL) {
- nf = new;
+ ret = rhashtable_lookup_insert_key(&nfsd_file_rhash_tbl,
+ &key, &nf->nf_rhash,
+ nfsd_file_rhash_params);
+ if (likely(ret == 0))
goto open_file;
- }
- nfsd_file_slab_free(&new->nf_rcu);
+
+ nfsd_file_slab_free(&nf->nf_rcu);
+ nf = NULL;
+ if (ret == -EEXIST)
+ goto retry;
+ trace_nfsd_file_insert_err(rqstp, key.inode, may_flags, ret);
+ status = nfserr_jukebox;
+ goto out_status;
wait_for_construction:
wait_on_bit(&nf->nf_flags, NFSD_FILE_PENDING, TASK_UNINTERRUPTIBLE);
@@ -1112,16 +1132,16 @@ wait_for_construction:
/* Did construction of this file fail? */
if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
trace_nfsd_file_cons_err(rqstp, key.inode, may_flags, nf);
- if (!retry) {
+ if (!open_retry) {
status = nfserr_jukebox;
goto out;
}
- retry = false;
- nfsd_file_put_noref(nf);
+ open_retry = false;
+ if (refcount_dec_and_test(&nf->nf_ref))
+ nfsd_file_free(nf);
goto retry;
}
- nfsd_file_lru_remove(nf);
this_cpu_inc(nfsd_file_cache_hits);
status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags));
@@ -1131,7 +1151,8 @@ out:
this_cpu_inc(nfsd_file_acquisitions);
*pnf = nf;
} else {
- nfsd_file_put(nf);
+ if (refcount_dec_and_test(&nf->nf_ref))
+ nfsd_file_free(nf);
nf = NULL;
}
@@ -1157,20 +1178,36 @@ open_file:
* If construction failed, or we raced with a call to unlink()
* then unhash.
*/
- if (status != nfs_ok || key.inode->i_nlink == 0)
- if (nfsd_file_unhash(nf))
- nfsd_file_put_noref(nf);
+ if (status == nfs_ok && key.inode->i_nlink == 0)
+ status = nfserr_jukebox;
+ if (status != nfs_ok)
+ nfsd_file_unhash(nf);
clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
smp_mb__after_atomic();
wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
goto out;
+}
-insert_err:
- nfsd_file_slab_free(&new->nf_rcu);
- trace_nfsd_file_insert_err(rqstp, key.inode, may_flags, PTR_ERR(nf));
- nf = NULL;
- status = nfserr_jukebox;
- goto out_status;
+/**
+ * nfsd_file_acquire_gc - Get a struct nfsd_file with an open file
+ * @rqstp: the RPC transaction being executed
+ * @fhp: the NFS filehandle of the file to be opened
+ * @may_flags: NFSD_MAY_ settings for the file
+ * @pnf: OUT: new or found "struct nfsd_file" object
+ *
+ * The nfsd_file object returned by this API is reference-counted
+ * and garbage-collected. The object is retained for a few
+ * seconds after the final nfsd_file_put() in case the caller
+ * wants to re-use it.
+ *
+ * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
+ * network byte order is returned.
+ */
+__be32
+nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **pnf)
+{
+ return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true, true);
}
/**
@@ -1180,6 +1217,10 @@ insert_err:
* @may_flags: NFSD_MAY_ settings for the file
* @pnf: OUT: new or found "struct nfsd_file" object
*
+ * The nfsd_file_object returned by this API is reference-counted
+ * but not garbage-collected. The object is unhashed after the
+ * final nfsd_file_put().
+ *
* Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
* network byte order is returned.
*/
@@ -1187,7 +1228,7 @@ __be32
nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **pnf)
{
- return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true);
+ return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true, false);
}
/**
@@ -1197,6 +1238,10 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
* @may_flags: NFSD_MAY_ settings for the file
* @pnf: OUT: new or found "struct nfsd_file" object
*
+ * The nfsd_file_object returned by this API is reference-counted
+ * but not garbage-collected. The object is released immediately
+ * one RCU grace period after the final nfsd_file_put().
+ *
* Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
* network byte order is returned.
*/
@@ -1204,7 +1249,7 @@ __be32
nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **pnf)
{
- return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, false);
+ return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, false, false);
}
/*
@@ -1212,9 +1257,9 @@ nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
* scraping this file for info should test the labels to ensure they're
* getting the correct field.
*/
-static int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
+int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
{
- unsigned long releases = 0, pages_flushed = 0, evictions = 0;
+ unsigned long releases = 0, evictions = 0;
unsigned long hits = 0, acquisitions = 0;
unsigned int i, count = 0, buckets = 0;
unsigned long lru = 0, total_age = 0;
@@ -1242,7 +1287,6 @@ static int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
releases += per_cpu(nfsd_file_releases, i);
total_age += per_cpu(nfsd_file_total_age, i);
evictions += per_cpu(nfsd_file_evictions, i);
- pages_flushed += per_cpu(nfsd_file_pages_flushed, i);
}
seq_printf(m, "total entries: %u\n", count);
@@ -1256,11 +1300,5 @@ static int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
seq_printf(m, "mean age (ms): %ld\n", total_age / releases);
else
seq_printf(m, "mean age (ms): -\n");
- seq_printf(m, "pages flushed: %lu\n", pages_flushed);
return 0;
}
-
-int nfsd_file_cache_stats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, nfsd_file_cache_stats_show, NULL);
-}
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index 8e8c0c47d67d..b7efb2c3ddb1 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -38,6 +38,7 @@ struct nfsd_file {
#define NFSD_FILE_HASHED (0)
#define NFSD_FILE_PENDING (1)
#define NFSD_FILE_REFERENCED (2)
+#define NFSD_FILE_GC (3)
unsigned long nf_flags;
struct inode *nf_inode; /* don't deref */
refcount_t nf_ref;
@@ -52,13 +53,14 @@ void nfsd_file_cache_shutdown(void);
int nfsd_file_cache_start_net(struct net *net);
void nfsd_file_cache_shutdown_net(struct net *net);
void nfsd_file_put(struct nfsd_file *nf);
-void nfsd_file_close(struct nfsd_file *nf);
struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
void nfsd_file_close_inode_sync(struct inode *inode);
bool nfsd_file_is_cached(struct inode *inode);
+__be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **nfp);
__be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **nfp);
__be32 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **nfp);
-int nfsd_file_cache_stats_open(struct inode *, struct file *);
+int nfsd_file_cache_stats_show(struct seq_file *m, void *v);
#endif /* _FS_NFSD_FILECACHE_H */
diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c
index 070f90ed09b6..3ca5304440ff 100644
--- a/fs/nfsd/flexfilelayout.c
+++ b/fs/nfsd/flexfilelayout.c
@@ -15,6 +15,7 @@
#include "flexfilelayoutxdr.h"
#include "pnfs.h"
+#include "vfs.h"
#define NFSDDBG_FACILITY NFSDDBG_PNFS
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index ffe17743cc74..8c854ba3285b 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -192,6 +192,10 @@ struct nfsd_net {
atomic_t nfs4_client_count;
int nfs4_max_clients;
+
+ atomic_t nfsd_courtesy_clients;
+ struct shrinker nfsd_client_shrinker;
+ struct delayed_work nfsd_shrinker_work;
};
/* Simple check to find out if a given net was properly initialized */
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 9edd3c1a30fb..1457f59f447a 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -55,7 +55,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp)
goto out;
if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
- acl = get_acl(inode, ACL_TYPE_ACCESS);
+ acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
if (acl == NULL) {
/* Solaris returns the inode's minimum ACL. */
acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
@@ -69,7 +69,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp)
if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) {
/* Check how Solaris handles requests for the Default ACL
of a non-directory! */
- acl = get_acl(inode, ACL_TYPE_DEFAULT);
+ acl = get_inode_acl(inode, ACL_TYPE_DEFAULT);
if (IS_ERR(acl)) {
resp->status = nfserrno(PTR_ERR(acl));
goto fail;
@@ -113,11 +113,11 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp)
inode_lock(inode);
- error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS,
+ error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_ACCESS,
argp->acl_access);
if (error)
goto out_drop_lock;
- error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT,
+ error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_DEFAULT,
argp->acl_default);
if (error)
goto out_drop_lock;
@@ -246,7 +246,6 @@ nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
struct nfsd3_getaclres *resp = rqstp->rq_resp;
struct dentry *dentry = resp->fh.fh_dentry;
struct inode *inode;
- int w;
if (!svcxdr_encode_stat(xdr, resp->status))
return false;
@@ -260,15 +259,6 @@ nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
if (xdr_stream_encode_u32(xdr, resp->mask) < 0)
return false;
- rqstp->rq_res.page_len = w = nfsacl_size(
- (resp->mask & NFS_ACL) ? resp->acl_access : NULL,
- (resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
- while (w > 0) {
- if (!*(rqstp->rq_next_page++))
- return true;
- w -= PAGE_SIZE;
- }
-
if (!nfs_stream_encode_acl(xdr, inode, resp->acl_access,
resp->mask & NFS_ACL, 0))
return false;
@@ -331,6 +321,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
.pc_decode = nfssvc_decode_voidarg,
.pc_encode = nfssvc_encode_voidres,
.pc_argsize = sizeof(struct nfsd_voidargs),
+ .pc_argzero = sizeof(struct nfsd_voidargs),
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST,
@@ -342,6 +333,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
.pc_encode = nfsaclsvc_encode_getaclres,
.pc_release = nfsaclsvc_release_getacl,
.pc_argsize = sizeof(struct nfsd3_getaclargs),
+ .pc_argzero = sizeof(struct nfsd3_getaclargs),
.pc_ressize = sizeof(struct nfsd3_getaclres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+1+2*(1+ACL),
@@ -353,6 +345,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
.pc_encode = nfssvc_encode_attrstatres,
.pc_release = nfssvc_release_attrstat,
.pc_argsize = sizeof(struct nfsd3_setaclargs),
+ .pc_argzero = sizeof(struct nfsd3_setaclargs),
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+AT,
@@ -364,6 +357,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
.pc_encode = nfssvc_encode_attrstatres,
.pc_release = nfssvc_release_attrstat,
.pc_argsize = sizeof(struct nfsd_fhandle),
+ .pc_argzero = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+AT,
@@ -375,6 +369,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
.pc_encode = nfsaclsvc_encode_accessres,
.pc_release = nfsaclsvc_release_access,
.pc_argsize = sizeof(struct nfsd3_accessargs),
+ .pc_argzero = sizeof(struct nfsd3_accessargs),
.pc_ressize = sizeof(struct nfsd3_accessres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+AT+1,
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 9446c6743664..647108138e8a 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -47,7 +47,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp)
resp->mask = argp->mask;
if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
- acl = get_acl(inode, ACL_TYPE_ACCESS);
+ acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
if (acl == NULL) {
/* Solaris returns the inode's minimum ACL. */
acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
@@ -61,7 +61,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp)
if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) {
/* Check how Solaris handles requests for the Default ACL
of a non-directory! */
- acl = get_acl(inode, ACL_TYPE_DEFAULT);
+ acl = get_inode_acl(inode, ACL_TYPE_DEFAULT);
if (IS_ERR(acl)) {
resp->status = nfserrno(PTR_ERR(acl));
goto fail;
@@ -103,11 +103,11 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp)
inode_lock(inode);
- error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS,
+ error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_ACCESS,
argp->acl_access);
if (error)
goto out_drop_lock;
- error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT,
+ error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_DEFAULT,
argp->acl_default);
out_drop_lock:
@@ -171,11 +171,7 @@ nfs3svc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
struct nfsd3_getaclres *resp = rqstp->rq_resp;
struct dentry *dentry = resp->fh.fh_dentry;
- struct kvec *head = rqstp->rq_res.head;
struct inode *inode;
- unsigned int base;
- int n;
- int w;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
return false;
@@ -187,26 +183,12 @@ nfs3svc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
if (xdr_stream_encode_u32(xdr, resp->mask) < 0)
return false;
- base = (char *)xdr->p - (char *)head->iov_base;
-
- rqstp->rq_res.page_len = w = nfsacl_size(
- (resp->mask & NFS_ACL) ? resp->acl_access : NULL,
- (resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
- while (w > 0) {
- if (!*(rqstp->rq_next_page++))
- return false;
- w -= PAGE_SIZE;
- }
-
- n = nfsacl_encode(&rqstp->rq_res, base, inode,
- resp->acl_access,
- resp->mask & NFS_ACL, 0);
- if (n > 0)
- n = nfsacl_encode(&rqstp->rq_res, base + n, inode,
- resp->acl_default,
- resp->mask & NFS_DFACL,
- NFS_ACL_DEFAULT);
- if (n <= 0)
+ if (!nfs_stream_encode_acl(xdr, inode, resp->acl_access,
+ resp->mask & NFS_ACL, 0))
+ return false;
+ if (!nfs_stream_encode_acl(xdr, inode, resp->acl_default,
+ resp->mask & NFS_DFACL,
+ NFS_ACL_DEFAULT))
return false;
break;
default:
@@ -252,6 +234,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = {
.pc_decode = nfssvc_decode_voidarg,
.pc_encode = nfssvc_encode_voidres,
.pc_argsize = sizeof(struct nfsd_voidargs),
+ .pc_argzero = sizeof(struct nfsd_voidargs),
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST,
@@ -263,6 +246,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = {
.pc_encode = nfs3svc_encode_getaclres,
.pc_release = nfs3svc_release_getacl,
.pc_argsize = sizeof(struct nfsd3_getaclargs),
+ .pc_argzero = sizeof(struct nfsd3_getaclargs),
.pc_ressize = sizeof(struct nfsd3_getaclres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+1+2*(1+ACL),
@@ -274,6 +258,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = {
.pc_encode = nfs3svc_encode_setaclres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd3_setaclargs),
+ .pc_argzero = sizeof(struct nfsd3_setaclargs),
.pc_ressize = sizeof(struct nfsd3_attrstat),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+pAT,
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index a41cca619338..d01b29aba662 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -13,6 +13,7 @@
#include "cache.h"
#include "xdr3.h"
#include "vfs.h"
+#include "filecache.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -150,7 +151,6 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
{
struct nfsd3_readargs *argp = rqstp->rq_argp;
struct nfsd3_readres *resp = rqstp->rq_resp;
- u32 max_blocksize = svc_max_payload(rqstp);
unsigned int len;
int v;
@@ -159,7 +159,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
(unsigned long) argp->count,
(unsigned long long) argp->offset);
- argp->count = min_t(u32, argp->count, max_blocksize);
+ argp->count = min_t(u32, argp->count, svc_max_payload(rqstp));
+ argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen);
if (argp->offset > (u64)OFFSET_MAX)
argp->offset = (u64)OFFSET_MAX;
if (argp->offset + argp->count > (u64)OFFSET_MAX)
@@ -563,25 +564,18 @@ static void nfsd3_init_dirlist_pages(struct svc_rqst *rqstp,
{
struct xdr_buf *buf = &resp->dirlist;
struct xdr_stream *xdr = &resp->xdr;
-
- count = clamp(count, (u32)(XDR_UNIT * 2), svc_max_payload(rqstp));
+ unsigned int sendbuf = min_t(unsigned int, rqstp->rq_res.buflen,
+ svc_max_payload(rqstp));
memset(buf, 0, sizeof(*buf));
/* Reserve room for the NULL ptr & eof flag (-2 words) */
- buf->buflen = count - XDR_UNIT * 2;
+ buf->buflen = clamp(count, (u32)(XDR_UNIT * 2), sendbuf);
+ buf->buflen -= XDR_UNIT * 2;
buf->pages = rqstp->rq_next_page;
rqstp->rq_next_page += (buf->buflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
- /* This is xdr_init_encode(), but it assumes that
- * the head kvec has already been consumed. */
- xdr_set_scratch_buffer(xdr, NULL, 0);
- xdr->buf = buf;
- xdr->page_ptr = buf->pages;
- xdr->iov = NULL;
- xdr->p = page_address(*buf->pages);
- xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE);
- xdr->rqst = NULL;
+ xdr_init_encode_pages(xdr, buf, buf->pages, NULL);
}
/*
@@ -770,6 +764,7 @@ nfsd3_proc_commit(struct svc_rqst *rqstp)
{
struct nfsd3_commitargs *argp = rqstp->rq_argp;
struct nfsd3_commitres *resp = rqstp->rq_resp;
+ struct nfsd_file *nf;
dprintk("nfsd: COMMIT(3) %s %u@%Lu\n",
SVCFH_fmt(&argp->fh),
@@ -777,8 +772,14 @@ nfsd3_proc_commit(struct svc_rqst *rqstp)
(unsigned long long) argp->offset);
fh_copy(&resp->fh, &argp->fh);
- resp->status = nfsd_commit(rqstp, &resp->fh, argp->offset,
+ resp->status = nfsd_file_acquire_gc(rqstp, &resp->fh, NFSD_MAY_WRITE |
+ NFSD_MAY_NOT_BREAK_LEASE, &nf);
+ if (resp->status)
+ goto out;
+ resp->status = nfsd_commit(rqstp, &resp->fh, nf, argp->offset,
argp->count, resp->verf);
+ nfsd_file_put(nf);
+out:
return rpc_success;
}
@@ -808,6 +809,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_decode = nfssvc_decode_voidarg,
.pc_encode = nfssvc_encode_voidres,
.pc_argsize = sizeof(struct nfsd_voidargs),
+ .pc_argzero = sizeof(struct nfsd_voidargs),
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST,
@@ -819,6 +821,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_getattrres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd_fhandle),
+ .pc_argzero = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd3_attrstatres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+AT,
@@ -830,6 +833,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_wccstatres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd3_sattrargs),
+ .pc_argzero = sizeof(struct nfsd3_sattrargs),
.pc_ressize = sizeof(struct nfsd3_wccstatres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+WC,
@@ -841,6 +845,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_lookupres,
.pc_release = nfs3svc_release_fhandle2,
.pc_argsize = sizeof(struct nfsd3_diropargs),
+ .pc_argzero = sizeof(struct nfsd3_diropargs),
.pc_ressize = sizeof(struct nfsd3_diropres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+FH+pAT+pAT,
@@ -852,6 +857,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_accessres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd3_accessargs),
+ .pc_argzero = sizeof(struct nfsd3_accessargs),
.pc_ressize = sizeof(struct nfsd3_accessres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+pAT+1,
@@ -863,6 +869,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_readlinkres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd_fhandle),
+ .pc_argzero = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd3_readlinkres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+pAT+1+NFS3_MAXPATHLEN/4,
@@ -874,6 +881,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_readres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd3_readargs),
+ .pc_argzero = sizeof(struct nfsd3_readargs),
.pc_ressize = sizeof(struct nfsd3_readres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+pAT+4+NFSSVC_MAXBLKSIZE/4,
@@ -885,6 +893,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_writeres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd3_writeargs),
+ .pc_argzero = sizeof(struct nfsd3_writeargs),
.pc_ressize = sizeof(struct nfsd3_writeres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+WC+4,
@@ -896,6 +905,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_createres,
.pc_release = nfs3svc_release_fhandle2,
.pc_argsize = sizeof(struct nfsd3_createargs),
+ .pc_argzero = sizeof(struct nfsd3_createargs),
.pc_ressize = sizeof(struct nfsd3_createres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+(1+FH+pAT)+WC,
@@ -907,6 +917,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_createres,
.pc_release = nfs3svc_release_fhandle2,
.pc_argsize = sizeof(struct nfsd3_mkdirargs),
+ .pc_argzero = sizeof(struct nfsd3_mkdirargs),
.pc_ressize = sizeof(struct nfsd3_createres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+(1+FH+pAT)+WC,
@@ -918,6 +929,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_createres,
.pc_release = nfs3svc_release_fhandle2,
.pc_argsize = sizeof(struct nfsd3_symlinkargs),
+ .pc_argzero = sizeof(struct nfsd3_symlinkargs),
.pc_ressize = sizeof(struct nfsd3_createres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+(1+FH+pAT)+WC,
@@ -929,6 +941,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_createres,
.pc_release = nfs3svc_release_fhandle2,
.pc_argsize = sizeof(struct nfsd3_mknodargs),
+ .pc_argzero = sizeof(struct nfsd3_mknodargs),
.pc_ressize = sizeof(struct nfsd3_createres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+(1+FH+pAT)+WC,
@@ -940,6 +953,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_wccstatres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd3_diropargs),
+ .pc_argzero = sizeof(struct nfsd3_diropargs),
.pc_ressize = sizeof(struct nfsd3_wccstatres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+WC,
@@ -951,6 +965,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_wccstatres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd3_diropargs),
+ .pc_argzero = sizeof(struct nfsd3_diropargs),
.pc_ressize = sizeof(struct nfsd3_wccstatres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+WC,
@@ -962,6 +977,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_renameres,
.pc_release = nfs3svc_release_fhandle2,
.pc_argsize = sizeof(struct nfsd3_renameargs),
+ .pc_argzero = sizeof(struct nfsd3_renameargs),
.pc_ressize = sizeof(struct nfsd3_renameres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+WC+WC,
@@ -973,6 +989,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_linkres,
.pc_release = nfs3svc_release_fhandle2,
.pc_argsize = sizeof(struct nfsd3_linkargs),
+ .pc_argzero = sizeof(struct nfsd3_linkargs),
.pc_ressize = sizeof(struct nfsd3_linkres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+pAT+WC,
@@ -984,6 +1001,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_readdirres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd3_readdirargs),
+ .pc_argzero = sizeof(struct nfsd3_readdirargs),
.pc_ressize = sizeof(struct nfsd3_readdirres),
.pc_cachetype = RC_NOCACHE,
.pc_name = "READDIR",
@@ -994,6 +1012,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_readdirres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd3_readdirplusargs),
+ .pc_argzero = sizeof(struct nfsd3_readdirplusargs),
.pc_ressize = sizeof(struct nfsd3_readdirres),
.pc_cachetype = RC_NOCACHE,
.pc_name = "READDIRPLUS",
@@ -1003,6 +1022,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_decode = nfs3svc_decode_fhandleargs,
.pc_encode = nfs3svc_encode_fsstatres,
.pc_argsize = sizeof(struct nfsd3_fhandleargs),
+ .pc_argzero = sizeof(struct nfsd3_fhandleargs),
.pc_ressize = sizeof(struct nfsd3_fsstatres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+pAT+2*6+1,
@@ -1013,6 +1033,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_decode = nfs3svc_decode_fhandleargs,
.pc_encode = nfs3svc_encode_fsinfores,
.pc_argsize = sizeof(struct nfsd3_fhandleargs),
+ .pc_argzero = sizeof(struct nfsd3_fhandleargs),
.pc_ressize = sizeof(struct nfsd3_fsinfores),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+pAT+12,
@@ -1023,6 +1044,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_decode = nfs3svc_decode_fhandleargs,
.pc_encode = nfs3svc_encode_pathconfres,
.pc_argsize = sizeof(struct nfsd3_fhandleargs),
+ .pc_argzero = sizeof(struct nfsd3_fhandleargs),
.pc_ressize = sizeof(struct nfsd3_pathconfres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+pAT+6,
@@ -1034,6 +1056,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_commitres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd3_commitargs),
+ .pc_argzero = sizeof(struct nfsd3_commitargs),
.pc_ressize = sizeof(struct nfsd3_commitres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+WC+2,
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 0293b8d65f10..3308dd671ef0 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -571,10 +571,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
args->count = max_blocksize;
args->len = max_blocksize;
}
- if (!xdr_stream_subsegment(xdr, &args->payload, args->count))
- return false;
- return true;
+ return xdr_stream_subsegment(xdr, &args->payload, args->count);
}
bool
@@ -616,8 +614,6 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
struct nfsd3_symlinkargs *args = rqstp->rq_argp;
struct kvec *head = rqstp->rq_arg.head;
- struct kvec *tail = rqstp->rq_arg.tail;
- size_t remaining;
if (!svcxdr_decode_diropargs3(xdr, &args->ffh, &args->fname, &args->flen))
return false;
@@ -626,16 +622,10 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
if (xdr_stream_decode_u32(xdr, &args->tlen) < 0)
return false;
- /* request sanity */
- remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len;
- remaining -= xdr_stream_pos(xdr);
- if (remaining < xdr_align_size(args->tlen))
- return false;
-
- args->first.iov_base = xdr->p;
+ /* symlink_data */
args->first.iov_len = head->iov_len - xdr_stream_pos(xdr);
-
- return true;
+ args->first.iov_base = xdr_inline_decode(xdr, args->tlen);
+ return args->first.iov_base != NULL;
}
bool
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index bb8e2f6d7d03..518203821790 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -135,7 +135,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
unsigned int flags = 0;
int size = 0;
- pacl = get_acl(inode, ACL_TYPE_ACCESS);
+ pacl = get_inode_acl(inode, ACL_TYPE_ACCESS);
if (!pacl)
pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
@@ -147,7 +147,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
if (S_ISDIR(inode->i_mode)) {
flags = NFS4_ACL_DIR;
- dpacl = get_acl(inode, ACL_TYPE_DEFAULT);
+ dpacl = get_inode_acl(inode, ACL_TYPE_DEFAULT);
if (IS_ERR(dpacl)) {
error = PTR_ERR(dpacl);
goto rel_pacl;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 4ce328209f61..2a815f5a52c4 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -76,6 +76,17 @@ static __be32 *xdr_encode_empty_array(__be32 *p)
* 1 Protocol"
*/
+static void encode_uint32(struct xdr_stream *xdr, u32 n)
+{
+ WARN_ON_ONCE(xdr_stream_encode_u32(xdr, n) < 0);
+}
+
+static void encode_bitmap4(struct xdr_stream *xdr, const __u32 *bitmap,
+ size_t len)
+{
+ WARN_ON_ONCE(xdr_stream_encode_uint32_array(xdr, bitmap, len) < 0);
+}
+
/*
* nfs_cb_opnum4
*
@@ -329,6 +340,24 @@ static void encode_cb_recall4args(struct xdr_stream *xdr,
}
/*
+ * CB_RECALLANY4args
+ *
+ * struct CB_RECALLANY4args {
+ * uint32_t craa_objects_to_keep;
+ * bitmap4 craa_type_mask;
+ * };
+ */
+static void
+encode_cb_recallany4args(struct xdr_stream *xdr,
+ struct nfs4_cb_compound_hdr *hdr, struct nfsd4_cb_recall_any *ra)
+{
+ encode_nfs_cb_opnum4(xdr, OP_CB_RECALL_ANY);
+ encode_uint32(xdr, ra->ra_keep);
+ encode_bitmap4(xdr, ra->ra_bmval, ARRAY_SIZE(ra->ra_bmval));
+ hdr->nops++;
+}
+
+/*
* CB_SEQUENCE4args
*
* struct CB_SEQUENCE4args {
@@ -482,6 +511,26 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_cb_nops(&hdr);
}
+/*
+ * 20.6. Operation 8: CB_RECALL_ANY - Keep Any N Recallable Objects
+ */
+static void
+nfs4_xdr_enc_cb_recall_any(struct rpc_rqst *req,
+ struct xdr_stream *xdr, const void *data)
+{
+ const struct nfsd4_callback *cb = data;
+ struct nfsd4_cb_recall_any *ra;
+ struct nfs4_cb_compound_hdr hdr = {
+ .ident = cb->cb_clp->cl_cb_ident,
+ .minorversion = cb->cb_clp->cl_minorversion,
+ };
+
+ ra = container_of(cb, struct nfsd4_cb_recall_any, ra_cb);
+ encode_cb_compound4args(xdr, &hdr);
+ encode_cb_sequence4args(xdr, cb, &hdr);
+ encode_cb_recallany4args(xdr, &hdr, ra);
+ encode_cb_nops(&hdr);
+}
/*
* NFSv4.0 and NFSv4.1 XDR decode functions
@@ -520,6 +569,28 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status);
}
+/*
+ * 20.6. Operation 8: CB_RECALL_ANY - Keep Any N Recallable Objects
+ */
+static int
+nfs4_xdr_dec_cb_recall_any(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfsd4_callback *cb = data;
+ struct nfs4_cb_compound_hdr hdr;
+ int status;
+
+ status = decode_cb_compound4res(xdr, &hdr);
+ if (unlikely(status))
+ return status;
+ status = decode_cb_sequence4res(xdr, cb);
+ if (unlikely(status || cb->cb_seq_status))
+ return status;
+ status = decode_cb_op_status(xdr, OP_CB_RECALL_ANY, &cb->cb_status);
+ return status;
+}
+
#ifdef CONFIG_NFSD_PNFS
/*
* CB_LAYOUTRECALL4args
@@ -783,6 +854,7 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = {
#endif
PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock),
PROC(CB_OFFLOAD, COMPOUND, cb_offload, cb_offload),
+ PROC(CB_RECALL_ANY, COMPOUND, cb_recall_any, cb_recall_any),
};
static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)];
@@ -870,7 +942,7 @@ static const struct cred *get_backchannel_cred(struct nfs4_client *clp, struct r
} else {
struct cred *kcred;
- kcred = prepare_kernel_cred(NULL);
+ kcred = prepare_kernel_cred(&init_task);
if (!kcred)
return NULL;
@@ -916,7 +988,6 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
} else {
if (!conn->cb_xprt)
return -EINVAL;
- clp->cl_cb_conn.cb_xprt = conn->cb_xprt;
clp->cl_cb_session = ses;
args.bc_xprt = conn->cb_xprt;
args.prognumber = clp->cl_cb_session->se_cb_prog;
@@ -936,6 +1007,9 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
rpc_shutdown_client(client);
return -ENOMEM;
}
+
+ if (clp->cl_minorversion != 0)
+ clp->cl_cb_conn.cb_xprt = conn->cb_xprt;
clp->cl_cb_client = client;
clp->cl_cb_cred = cred;
rcu_read_lock();
@@ -1371,11 +1445,21 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
cb->cb_holds_slot = false;
}
-void nfsd4_run_cb(struct nfsd4_callback *cb)
+/**
+ * nfsd4_run_cb - queue up a callback job to run
+ * @cb: callback to queue
+ *
+ * Kick off a callback to do its thing. Returns false if it was already
+ * on a queue, true otherwise.
+ */
+bool nfsd4_run_cb(struct nfsd4_callback *cb)
{
struct nfs4_client *clp = cb->cb_clp;
+ bool queued;
nfsd41_cb_inflight_begin(clp);
- if (!nfsd4_queue_cb(cb))
+ queued = nfsd4_queue_cb(cb);
+ if (!queued)
nfsd41_cb_inflight_end(clp);
+ return queued;
}
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index f92161ce1f97..5e9809aff37e 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -41,6 +41,7 @@
#include "idmap.h"
#include "nfsd.h"
#include "netns.h"
+#include "vfs.h"
/*
* Turn off idmapping when using AUTH_SYS.
@@ -82,8 +83,8 @@ ent_init(struct cache_head *cnew, struct cache_head *citm)
new->id = itm->id;
new->type = itm->type;
- strlcpy(new->name, itm->name, sizeof(new->name));
- strlcpy(new->authname, itm->authname, sizeof(new->authname));
+ strscpy(new->name, itm->name, sizeof(new->name));
+ strscpy(new->authname, itm->authname, sizeof(new->authname));
}
static void
@@ -548,7 +549,7 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
return nfserr_badowner;
memcpy(key.name, name, namelen);
key.name[namelen] = '\0';
- strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
+ strscpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
ret = idmap_lookup(rqstp, nametoid_lookup, &key, nn->nametoid_cache, &item);
if (ret == -ENOENT)
return nfserr_badowner;
@@ -584,7 +585,7 @@ static __be32 idmap_id_to_name(struct xdr_stream *xdr,
int ret;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
- strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
+ strscpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item);
if (ret == -ENOENT)
return encode_ascii_id(xdr, id);
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 2c05692a9abf..3564d1c6f610 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -658,7 +658,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
ktime_t now, cutoff;
const struct nfsd4_layout_ops *ops;
-
+ trace_nfsd_cb_layout_done(&ls->ls_stid.sc_stateid, task);
switch (task->tk_status) {
case 0:
case -NFS4ERR_DELAY:
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index a72ab97f77ef..bd880d55f565 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -141,7 +141,6 @@ fh_dup2(struct svc_fh *dst, struct svc_fh *src)
static __be32
do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open, int accmode)
{
- __be32 status;
if (open->op_truncate &&
!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE))
@@ -156,9 +155,7 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
if (open->op_share_deny & NFS4_SHARE_DENY_READ)
accmode |= NFSD_MAY_WRITE;
- status = fh_verify(rqstp, current_fh, S_IFREG, accmode);
-
- return status;
+ return fh_verify(rqstp, current_fh, S_IFREG, accmode);
}
static __be32 nfsd_check_obj_isreg(struct svc_fh *fh)
@@ -454,7 +451,6 @@ static __be32
do_open_fhandle(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open)
{
struct svc_fh *current_fh = &cstate->current_fh;
- __be32 status;
int accmode = 0;
/* We don't know the target directory, and therefore can not
@@ -479,9 +475,7 @@ do_open_fhandle(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, str
if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH)
accmode = NFSD_MAY_OWNER_OVERRIDE;
- status = do_open_permission(rqstp, current_fh, open, accmode);
-
- return status;
+ return do_open_permission(rqstp, current_fh, open, accmode);
}
static void
@@ -668,11 +662,9 @@ static __be32
nfsd4_putrootfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
{
- __be32 status;
-
fh_put(&cstate->current_fh);
- status = exp_pseudoroot(rqstp, &cstate->current_fh);
- return status;
+
+ return exp_pseudoroot(rqstp, &cstate->current_fh);
}
static __be32
@@ -739,10 +731,19 @@ nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
{
struct nfsd4_commit *commit = &u->commit;
+ struct nfsd_file *nf;
+ __be32 status;
- return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
+ status = nfsd_file_acquire(rqstp, &cstate->current_fh, NFSD_MAY_WRITE |
+ NFSD_MAY_NOT_BREAK_LEASE, &nf);
+ if (status != nfs_ok)
+ return status;
+
+ status = nfsd_commit(rqstp, &cstate->current_fh, nf, commit->co_offset,
commit->co_count,
(__be32 *)commit->co_verf.data);
+ nfsd_file_put(nf);
+ return status;
}
static __be32
@@ -942,12 +943,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
&read->rd_stateid, RD_STATE,
&read->rd_nf, NULL);
- if (status) {
- dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
- goto out;
- }
- status = nfs_ok;
-out:
+
read->rd_rqstp = rqstp;
read->rd_fhp = &cstate->current_fh;
return status;
@@ -1116,10 +1112,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfs4_preprocess_stateid_op(rqstp, cstate,
&cstate->current_fh, &setattr->sa_stateid,
WR_STATE, NULL, NULL);
- if (status) {
- dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
+ if (status)
return status;
- }
}
err = fh_want_write(&cstate->current_fh);
if (err)
@@ -1141,6 +1135,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
0, (time64_t)0);
if (!status)
status = nfserrno(attrs.na_labelerr);
+ if (!status)
+ status = nfserrno(attrs.na_aclerr);
out:
nfsd_attrs_free(&attrs);
fh_drop_write(&cstate->current_fh);
@@ -1167,10 +1163,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
write->wr_offset, cnt);
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
stateid, WR_STATE, &nf, NULL);
- if (status) {
- dprintk("NFSD: nfsd4_write: couldn't process stateid!\n");
+ if (status)
return status;
- }
write->wr_how_written = write->wr_stable_how;
@@ -1201,17 +1195,13 @@ nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh,
src_stateid, RD_STATE, src, NULL);
- if (status) {
- dprintk("NFSD: %s: couldn't process src stateid!\n", __func__);
+ if (status)
goto out;
- }
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
dst_stateid, WR_STATE, dst, NULL);
- if (status) {
- dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__);
+ if (status)
goto out_put_src;
- }
/* fix up for NFS-specific error code */
if (!S_ISREG(file_inode((*src)->nf_file)->i_mode) ||
@@ -1343,7 +1333,7 @@ try_again:
return 0;
}
if (work) {
- strlcpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr) - 1);
+ strscpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr) - 1);
refcount_set(&work->nsui_refcnt, 2);
work->nsui_busy = true;
list_add_tail(&work->nsui_list, &nn->nfsd_ssc_mount_list);
@@ -1471,13 +1461,6 @@ out_err:
return status;
}
-static void
-nfsd4_interssc_disconnect(struct vfsmount *ss_mnt)
-{
- nfs_do_sb_deactive(ss_mnt->mnt_sb);
- mntput(ss_mnt);
-}
-
/*
* Verify COPY destination stateid.
*
@@ -1580,11 +1563,6 @@ nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct file *filp,
{
}
-static void
-nfsd4_interssc_disconnect(struct vfsmount *ss_mnt)
-{
-}
-
static struct file *nfs42_ssc_open(struct vfsmount *ss_mnt,
struct nfs_fh *src_fh,
nfs4_stateid *stateid)
@@ -1621,6 +1599,10 @@ static void nfsd4_cb_offload_release(struct nfsd4_callback *cb)
static int nfsd4_cb_offload_done(struct nfsd4_callback *cb,
struct rpc_task *task)
{
+ struct nfsd4_cb_offload *cbo =
+ container_of(cb, struct nfsd4_cb_offload, co_cb);
+
+ trace_nfsd_cb_offload_done(&cbo->co_res.cb_stateid, task);
return 1;
}
@@ -1648,6 +1630,7 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy,
u64 src_pos = copy->cp_src_pos;
u64 dst_pos = copy->cp_dst_pos;
int status;
+ loff_t end;
/* See RFC 7862 p.67: */
if (bytes_total == 0)
@@ -1667,8 +1650,8 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy,
/* for a non-zero asynchronous copy do a commit of data */
if (nfsd4_copy_is_async(copy) && copy->cp_res.wr_bytes_written > 0) {
since = READ_ONCE(dst->f_wb_err);
- status = vfs_fsync_range(dst, copy->cp_dst_pos,
- copy->cp_res.wr_bytes_written, 0);
+ end = copy->cp_dst_pos + copy->cp_res.wr_bytes_written - 1;
+ status = vfs_fsync_range(dst, copy->cp_dst_pos, end, 0);
if (!status)
status = filemap_check_wb_err(dst->f_mapping, since);
if (!status)
@@ -1768,8 +1751,14 @@ static int nfsd4_do_async_copy(void *data)
filp = nfs42_ssc_open(copy->ss_mnt, &copy->c_fh,
&copy->stateid);
if (IS_ERR(filp)) {
- nfserr = nfserr_offload_denied;
- nfsd4_interssc_disconnect(copy->ss_mnt);
+ switch (PTR_ERR(filp)) {
+ case -EBADF:
+ nfserr = nfserr_wrong_type;
+ break;
+ default:
+ nfserr = nfserr_offload_denied;
+ }
+ /* ss_mnt will be unmounted by the laundromat */
goto do_callback;
}
nfserr = nfsd4_do_copy(copy, filp, copy->nf_dst->nf_file,
@@ -1826,7 +1815,7 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (!nfs4_init_copy_state(nn, copy))
goto out_err;
refcount_set(&async_copy->refcount, 1);
- memcpy(&copy->cp_res.cb_stateid, &copy->cp_stateid.stid,
+ memcpy(&copy->cp_res.cb_stateid, &copy->cp_stateid.cs_stid,
sizeof(copy->cp_res.cb_stateid));
dup_copy_fields(copy, async_copy);
async_copy->copy_task = kthread_create(nfsd4_do_async_copy,
@@ -1850,8 +1839,10 @@ out_err:
if (async_copy)
cleanup_async_copy(async_copy);
status = nfserrno(-ENOMEM);
- if (nfsd4_ssc_is_inter(copy))
- nfsd4_interssc_disconnect(copy->ss_mnt);
+ /*
+ * source's vfsmount of inter-copy will be unmounted
+ * by the laundromat
+ */
goto out;
}
@@ -1862,7 +1853,7 @@ find_async_copy(struct nfs4_client *clp, stateid_t *stateid)
spin_lock(&clp->async_lock);
list_for_each_entry(copy, &clp->async_copies, copies) {
- if (memcmp(&copy->cp_stateid.stid, stateid, NFS4_STATEID_SIZE))
+ if (memcmp(&copy->cp_stateid.cs_stid, stateid, NFS4_STATEID_SIZE))
continue;
refcount_inc(&copy->refcount);
spin_unlock(&clp->async_lock);
@@ -1916,7 +1907,7 @@ nfsd4_copy_notify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
cps = nfs4_alloc_init_cpntf_state(nn, stid);
if (!cps)
goto out;
- memcpy(&cn->cpn_cnr_stateid, &cps->cp_stateid.stid, sizeof(stateid_t));
+ memcpy(&cn->cpn_cnr_stateid, &cps->cp_stateid.cs_stid, sizeof(stateid_t));
memcpy(&cps->cp_p_stateid, &stid->sc_stateid, sizeof(stateid_t));
memcpy(&cps->cp_p_clid, &clp->cl_clientid, sizeof(clientid_t));
@@ -1946,10 +1937,8 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
&fallocate->falloc_stateid,
WR_STATE, &nf, NULL);
- if (status != nfs_ok) {
- dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n");
+ if (status != nfs_ok)
return status;
- }
status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, nf->nf_file,
fallocate->falloc_offset,
@@ -2005,10 +1994,8 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
&seek->seek_stateid,
RD_STATE, &nf, NULL);
- if (status) {
- dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n");
+ if (status)
return status;
- }
switch (seek->seek_whence) {
case NFS4_CONTENT_DATA:
@@ -2633,9 +2620,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
status = nfserr_minor_vers_mismatch;
if (nfsd_minorversion(nn, args->minorversion, NFSD_TEST) <= 0)
goto out;
- status = nfserr_resource;
- if (args->opcnt > NFSD_MAX_OPS_PER_COMPOUND)
- goto out;
status = nfs41_check_op_ordering(args);
if (status) {
@@ -2648,10 +2632,20 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
rqstp->rq_lease_breaker = (void **)&cstate->clp;
- trace_nfsd_compound(rqstp, args->opcnt);
+ trace_nfsd_compound(rqstp, args->tag, args->taglen, args->client_opcnt);
while (!status && resp->opcnt < args->opcnt) {
op = &args->ops[resp->opcnt++];
+ if (unlikely(resp->opcnt == NFSD_MAX_OPS_PER_COMPOUND)) {
+ /* If there are still more operations to process,
+ * stop here and report NFS4ERR_RESOURCE. */
+ if (cstate->minorversion == 0 &&
+ args->client_opcnt > resp->opcnt) {
+ op->status = nfserr_resource;
+ goto encode_op;
+ }
+ }
+
/*
* The XDR decode routines may have pre-set op->status;
* for example, if there is a miscellaneous XDR error
@@ -2727,8 +2721,8 @@ encode_op:
status = op->status;
}
- trace_nfsd_compound_status(args->opcnt, resp->opcnt, status,
- nfsd4_op_name(op->opnum));
+ trace_nfsd_compound_status(args->client_opcnt, resp->opcnt,
+ status, nfsd4_op_name(op->opnum));
nfsd4_cstate_clear_replay(cstate);
nfsd4_increment_op_stats(op->opnum);
@@ -2762,28 +2756,49 @@ out:
#define op_encode_channel_attrs_maxsz (6 + 1 + 1)
-static inline u32 nfsd4_only_status_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+/*
+ * The _rsize() helpers are invoked by the NFSv4 COMPOUND decoder, which
+ * is called before sunrpc sets rq_res.buflen. Thus we have to compute
+ * the maximum payload size here, based on transport limits and the size
+ * of the remaining space in the rq_pages array.
+ */
+static u32 nfsd4_max_payload(const struct svc_rqst *rqstp)
+{
+ u32 buflen;
+
+ buflen = (rqstp->rq_page_end - rqstp->rq_next_page) * PAGE_SIZE;
+ buflen -= rqstp->rq_auth_slack;
+ buflen -= rqstp->rq_res.head[0].iov_len;
+ return min_t(u32, buflen, svc_max_payload(rqstp));
+}
+
+static u32 nfsd4_only_status_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size) * sizeof(__be32);
}
-static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_status_stateid_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32);
}
-static inline u32 nfsd4_access_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_access_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
/* ac_supported, ac_resp_access */
return (op_encode_hdr_size + 2)* sizeof(__be32);
}
-static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_commit_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32);
}
-static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_create_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_change_info_maxsz
+ nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
@@ -2794,17 +2809,17 @@ static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op
* the op prematurely if the estimate is too large. We may turn off splice
* reads unnecessarily.
*/
-static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp,
- struct nfsd4_op *op)
+static u32 nfsd4_getattr_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
- u32 *bmap = op->u.getattr.ga_bmval;
+ const u32 *bmap = op->u.getattr.ga_bmval;
u32 bmap0 = bmap[0], bmap1 = bmap[1], bmap2 = bmap[2];
u32 ret = 0;
if (bmap0 & FATTR4_WORD0_ACL)
- return svc_max_payload(rqstp);
+ return nfsd4_max_payload(rqstp);
if (bmap0 & FATTR4_WORD0_FS_LOCATIONS)
- return svc_max_payload(rqstp);
+ return nfsd4_max_payload(rqstp);
if (bmap1 & FATTR4_WORD1_OWNER) {
ret += IDMAP_NAMESZ + 4;
@@ -2832,24 +2847,28 @@ static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp,
return ret;
}
-static inline u32 nfsd4_getfh_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_getfh_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + 1) * sizeof(__be32) + NFS4_FHSIZE;
}
-static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_link_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_change_info_maxsz)
* sizeof(__be32);
}
-static inline u32 nfsd4_lock_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_lock_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_lock_denied_maxsz)
* sizeof(__be32);
}
-static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_open_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_stateid_maxsz
+ op_encode_change_info_maxsz + 1
@@ -2857,20 +2876,18 @@ static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+ op_encode_delegation_maxsz) * sizeof(__be32);
}
-static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_read_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
- u32 maxcount = 0, rlen = 0;
-
- maxcount = svc_max_payload(rqstp);
- rlen = min(op->u.read.rd_length, maxcount);
+ u32 rlen = min(op->u.read.rd_length, nfsd4_max_payload(rqstp));
return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32);
}
-static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_read_plus_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
- u32 maxcount = svc_max_payload(rqstp);
- u32 rlen = min(op->u.read.rd_length, maxcount);
+ u32 rlen = min(op->u.read.rd_length, nfsd4_max_payload(rqstp));
/*
* If we detect that the file changed during hole encoding, then we
* recover by encoding the remaining reply as data. This means we need
@@ -2881,70 +2898,77 @@ static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op
return (op_encode_hdr_size + 2 + seg_len + XDR_QUADLEN(rlen)) * sizeof(__be32);
}
-static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_readdir_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
- u32 maxcount = 0, rlen = 0;
-
- maxcount = svc_max_payload(rqstp);
- rlen = min(op->u.readdir.rd_maxcount, maxcount);
+ u32 rlen = min(op->u.readdir.rd_maxcount, nfsd4_max_payload(rqstp));
return (op_encode_hdr_size + op_encode_verifier_maxsz +
XDR_QUADLEN(rlen)) * sizeof(__be32);
}
-static inline u32 nfsd4_readlink_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_readlink_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + 1) * sizeof(__be32) + PAGE_SIZE;
}
-static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_remove_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_change_info_maxsz)
* sizeof(__be32);
}
-static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_rename_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_change_info_maxsz
+ op_encode_change_info_maxsz) * sizeof(__be32);
}
-static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp,
- struct nfsd4_op *op)
+static u32 nfsd4_sequence_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) * sizeof(__be32);
}
-static inline u32 nfsd4_test_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_test_stateid_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + 1 + op->u.test_stateid.ts_num_ids)
* sizeof(__be32);
}
-static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_setattr_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
}
-static inline u32 nfsd4_secinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_secinfo_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + RPC_AUTH_MAXFLAVOR *
(4 + XDR_QUADLEN(GSS_OID_MAX_LEN))) * sizeof(__be32);
}
-static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_setclientid_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) *
sizeof(__be32);
}
-static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_write_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + 2 + op_encode_verifier_maxsz) * sizeof(__be32);
}
-static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_exchange_id_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\
1 + 1 + /* eir_flags, spr_how */\
@@ -2958,14 +2982,16 @@ static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_o
0 /* ignored eir_server_impl_id contents */) * sizeof(__be32);
}
-static inline u32 nfsd4_bind_conn_to_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_bind_conn_to_session_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + \
XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* bctsr_sessid */\
2 /* bctsr_dir, use_conn_in_rdma_mode */) * sizeof(__be32);
}
-static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_create_session_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + \
XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* sessionid */\
@@ -2974,7 +3000,8 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd
op_encode_channel_attrs_maxsz) * sizeof(__be32);
}
-static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_copy_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size +
1 /* wr_callback */ +
@@ -2986,16 +3013,16 @@ static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1 /* cr_synchronous */) * sizeof(__be32);
}
-static inline u32 nfsd4_offload_status_rsize(struct svc_rqst *rqstp,
- struct nfsd4_op *op)
+static u32 nfsd4_offload_status_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size +
2 /* osr_count */ +
1 /* osr_complete<1> optional 0 for now */) * sizeof(__be32);
}
-static inline u32 nfsd4_copy_notify_rsize(struct svc_rqst *rqstp,
- struct nfsd4_op *op)
+static u32 nfsd4_copy_notify_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size +
3 /* cnr_lease_time */ +
@@ -3010,12 +3037,10 @@ static inline u32 nfsd4_copy_notify_rsize(struct svc_rqst *rqstp,
}
#ifdef CONFIG_NFSD_PNFS
-static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_getdeviceinfo_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
- u32 maxcount = 0, rlen = 0;
-
- maxcount = svc_max_payload(rqstp);
- rlen = min(op->u.getdeviceinfo.gd_maxcount, maxcount);
+ u32 rlen = min(op->u.getdeviceinfo.gd_maxcount, nfsd4_max_payload(rqstp));
return (op_encode_hdr_size +
1 /* gd_layout_type*/ +
@@ -3028,7 +3053,8 @@ static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4
* so we need to define an arbitrary upper bound here.
*/
#define MAX_LAYOUT_SIZE 128
-static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_layoutget_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size +
1 /* logr_return_on_close */ +
@@ -3037,14 +3063,16 @@ static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op
MAX_LAYOUT_SIZE) * sizeof(__be32);
}
-static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_layoutcommit_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size +
1 /* locr_newsize */ +
2 /* ns_size */) * sizeof(__be32);
}
-static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_layoutreturn_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size +
1 /* lrs_stateid */ +
@@ -3053,41 +3081,36 @@ static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_
#endif /* CONFIG_NFSD_PNFS */
-static inline u32 nfsd4_seek_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_seek_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + 3) * sizeof(__be32);
}
-static inline u32 nfsd4_getxattr_rsize(struct svc_rqst *rqstp,
- struct nfsd4_op *op)
+static u32 nfsd4_getxattr_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
- u32 maxcount, rlen;
-
- maxcount = svc_max_payload(rqstp);
- rlen = min_t(u32, XATTR_SIZE_MAX, maxcount);
+ u32 rlen = min_t(u32, XATTR_SIZE_MAX, nfsd4_max_payload(rqstp));
return (op_encode_hdr_size + 1 + XDR_QUADLEN(rlen)) * sizeof(__be32);
}
-static inline u32 nfsd4_setxattr_rsize(struct svc_rqst *rqstp,
- struct nfsd4_op *op)
+static u32 nfsd4_setxattr_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_change_info_maxsz)
* sizeof(__be32);
}
-static inline u32 nfsd4_listxattrs_rsize(struct svc_rqst *rqstp,
- struct nfsd4_op *op)
+static u32 nfsd4_listxattrs_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
- u32 maxcount, rlen;
-
- maxcount = svc_max_payload(rqstp);
- rlen = min(op->u.listxattrs.lsxa_maxcount, maxcount);
+ u32 rlen = min(op->u.listxattrs.lsxa_maxcount, nfsd4_max_payload(rqstp));
return (op_encode_hdr_size + 4 + XDR_QUADLEN(rlen)) * sizeof(__be32);
}
-static inline u32 nfsd4_removexattr_rsize(struct svc_rqst *rqstp,
- struct nfsd4_op *op)
+static u32 nfsd4_removexattr_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_change_info_maxsz)
* sizeof(__be32);
@@ -3576,6 +3599,7 @@ static const struct svc_procedure nfsd_procedures4[2] = {
.pc_decode = nfssvc_decode_voidarg,
.pc_encode = nfssvc_encode_voidres,
.pc_argsize = sizeof(struct nfsd_voidargs),
+ .pc_argzero = sizeof(struct nfsd_voidargs),
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = 1,
@@ -3586,6 +3610,7 @@ static const struct svc_procedure nfsd_procedures4[2] = {
.pc_decode = nfs4svc_decode_compoundargs,
.pc_encode = nfs4svc_encode_compoundres,
.pc_argsize = sizeof(struct nfsd4_compoundargs),
+ .pc_argzero = offsetof(struct nfsd4_compoundargs, iops),
.pc_ressize = sizeof(struct nfsd4_compoundres),
.pc_release = nfsd4_release_compoundargs,
.pc_cachetype = RC_NOCACHE,
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index c634483d85d2..78b8cd9651d5 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -266,7 +266,7 @@ struct nfs4_dir_ctx {
struct list_head names;
};
-static int
+static bool
nfsd4_build_namelist(struct dir_context *__ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
@@ -275,14 +275,14 @@ nfsd4_build_namelist(struct dir_context *__ctx, const char *name, int namlen,
struct name_list *entry;
if (namlen != HEXDIR_LEN - 1)
- return 0;
+ return true;
entry = kmalloc(sizeof(struct name_list), GFP_KERNEL);
if (entry == NULL)
- return -ENOMEM;
+ return false;
memcpy(entry->name, name, HEXDIR_LEN - 1);
entry->name[HEXDIR_LEN - 1] = '\0';
list_add(&entry->list, &ctx->names);
- return 0;
+ return true;
}
static int
@@ -807,16 +807,18 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg,
if (get_user(namelen, &ci->cc_name.cn_len))
return -EFAULT;
name.data = memdup_user(&ci->cc_name.cn_id, namelen);
- if (IS_ERR_OR_NULL(name.data))
- return -EFAULT;
+ if (IS_ERR(name.data))
+ return PTR_ERR(name.data);
name.len = namelen;
get_user(princhashlen, &ci->cc_princhash.cp_len);
if (princhashlen > 0) {
princhash.data = memdup_user(
&ci->cc_princhash.cp_data,
princhashlen);
- if (IS_ERR_OR_NULL(princhash.data))
- return -EFAULT;
+ if (IS_ERR(princhash.data)) {
+ kfree(name.data);
+ return PTR_ERR(princhash.data);
+ }
princhash.len = princhashlen;
} else
princhash.len = 0;
@@ -827,8 +829,8 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg,
if (get_user(namelen, &cnm->cn_len))
return -EFAULT;
name.data = memdup_user(&cnm->cn_id, namelen);
- if (IS_ERR_OR_NULL(name.data))
- return -EFAULT;
+ if (IS_ERR(name.data))
+ return PTR_ERR(name.data);
name.len = namelen;
}
if (name.len > 5 && memcmp(name.data, "hash:", 5) == 0) {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c5d199d7e6b4..7b2ee535ade8 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -44,7 +44,9 @@
#include <linux/jhash.h>
#include <linux/string_helpers.h>
#include <linux/fsnotify.h>
+#include <linux/rhashtable.h>
#include <linux/nfs_ssc.h>
+
#include "xdr4.h"
#include "xdr4cb.h"
#include "vfs.h"
@@ -84,6 +86,7 @@ static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
static void nfs4_free_ol_stateid(struct nfs4_stid *stid);
void nfsd4_end_grace(struct nfsd_net *nn);
static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps);
+static void nfsd4_file_hash_remove(struct nfs4_file *fi);
/* Locking: */
@@ -160,6 +163,13 @@ static bool is_client_expired(struct nfs4_client *clp)
return clp->cl_time == 0;
}
+static void nfsd4_dec_courtesy_client_count(struct nfsd_net *nn,
+ struct nfs4_client *clp)
+{
+ if (clp->cl_state != NFSD4_ACTIVE)
+ atomic_add_unless(&nn->nfsd_courtesy_clients, -1, 0);
+}
+
static __be32 get_client_locked(struct nfs4_client *clp)
{
struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
@@ -169,6 +179,7 @@ static __be32 get_client_locked(struct nfs4_client *clp)
if (is_client_expired(clp))
return nfserr_expired;
atomic_inc(&clp->cl_rpc_users);
+ nfsd4_dec_courtesy_client_count(nn, clp);
clp->cl_state = NFSD4_ACTIVE;
return nfs_ok;
}
@@ -190,6 +201,7 @@ renew_client_locked(struct nfs4_client *clp)
list_move_tail(&clp->cl_lru, &nn->client_lru);
clp->cl_time = ktime_get_boottime_seconds();
+ nfsd4_dec_courtesy_client_count(nn, clp);
clp->cl_state = NFSD4_ACTIVE;
}
@@ -357,6 +369,8 @@ nfsd4_cb_notify_lock_prepare(struct nfsd4_callback *cb)
static int
nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task)
{
+ trace_nfsd_cb_notify_lock_done(&zero_stateid, task);
+
/*
* Since this is just an optimization, we don't try very hard if it
* turns out not to succeed. We'll requeue it on NFS4ERR_DELAY, and
@@ -577,11 +591,8 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu)
void
put_nfs4_file(struct nfs4_file *fi)
{
- might_lock(&state_lock);
-
- if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) {
- hlist_del_rcu(&fi->fi_hash);
- spin_unlock(&state_lock);
+ if (refcount_dec_and_test(&fi->fi_ref)) {
+ nfsd4_file_hash_remove(fi);
WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate));
WARN_ON_ONCE(!list_empty(&fi->fi_delegations));
call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu);
@@ -664,15 +675,26 @@ find_any_file(struct nfs4_file *f)
return ret;
}
-static struct nfsd_file *find_deleg_file(struct nfs4_file *f)
+static struct nfsd_file *find_any_file_locked(struct nfs4_file *f)
+{
+ lockdep_assert_held(&f->fi_lock);
+
+ if (f->fi_fds[O_RDWR])
+ return f->fi_fds[O_RDWR];
+ if (f->fi_fds[O_WRONLY])
+ return f->fi_fds[O_WRONLY];
+ if (f->fi_fds[O_RDONLY])
+ return f->fi_fds[O_RDONLY];
+ return NULL;
+}
+
+static struct nfsd_file *find_deleg_file_locked(struct nfs4_file *f)
{
- struct nfsd_file *ret = NULL;
+ lockdep_assert_held(&f->fi_lock);
- spin_lock(&f->fi_lock);
if (f->fi_deleg_file)
- ret = nfsd_file_get(f->fi_deleg_file);
- spin_unlock(&f->fi_lock);
- return ret;
+ return f->fi_deleg_file;
+ return NULL;
}
static atomic_long_t num_delegations;
@@ -695,19 +717,20 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername)
return ret & OWNER_HASH_MASK;
}
-/* hash table for nfs4_file */
-#define FILE_HASH_BITS 8
-#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
+static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp;
-static unsigned int file_hashval(struct svc_fh *fh)
-{
- struct inode *inode = d_inode(fh->fh_dentry);
+static const struct rhashtable_params nfs4_file_rhash_params = {
+ .key_len = sizeof_field(struct nfs4_file, fi_inode),
+ .key_offset = offsetof(struct nfs4_file, fi_inode),
+ .head_offset = offsetof(struct nfs4_file, fi_rlist),
- /* XXX: why not (here & in file cache) use inode? */
- return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS);
-}
-
-static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
+ /*
+ * Start with a single page hash table to reduce resizing churn
+ * on light workloads.
+ */
+ .min_size = 256,
+ .automatic_shrinking = true,
+};
/*
* Check if courtesy clients have conflicting access and resolve it if possible
@@ -820,9 +843,9 @@ static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
swap(f2, fp->fi_fds[O_RDWR]);
spin_unlock(&fp->fi_lock);
if (f1)
- nfsd_file_close(f1);
+ nfsd_file_put(f1);
if (f2)
- nfsd_file_close(f2);
+ nfsd_file_put(f2);
}
}
@@ -963,19 +986,19 @@ out_free:
* Create a unique stateid_t to represent each COPY.
*/
static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid,
- unsigned char sc_type)
+ unsigned char cs_type)
{
int new_id;
- stid->stid.si_opaque.so_clid.cl_boot = (u32)nn->boot_time;
- stid->stid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id;
- stid->sc_type = sc_type;
+ stid->cs_stid.si_opaque.so_clid.cl_boot = (u32)nn->boot_time;
+ stid->cs_stid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id;
+ stid->cs_type = cs_type;
idr_preload(GFP_KERNEL);
spin_lock(&nn->s2s_cp_lock);
new_id = idr_alloc_cyclic(&nn->s2s_cp_stateids, stid, 0, 0, GFP_NOWAIT);
- stid->stid.si_opaque.so_id = new_id;
- stid->stid.si_generation = 1;
+ stid->cs_stid.si_opaque.so_id = new_id;
+ stid->cs_stid.si_generation = 1;
spin_unlock(&nn->s2s_cp_lock);
idr_preload_end();
if (new_id < 0)
@@ -997,7 +1020,7 @@ struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn,
if (!cps)
return NULL;
cps->cpntf_time = ktime_get_boottime_seconds();
- refcount_set(&cps->cp_stateid.sc_count, 1);
+ refcount_set(&cps->cp_stateid.cs_count, 1);
if (!nfs4_init_cp_state(nn, &cps->cp_stateid, NFS4_COPYNOTIFY_STID))
goto out_free;
spin_lock(&nn->s2s_cp_lock);
@@ -1013,11 +1036,11 @@ void nfs4_free_copy_state(struct nfsd4_copy *copy)
{
struct nfsd_net *nn;
- WARN_ON_ONCE(copy->cp_stateid.sc_type != NFS4_COPY_STID);
+ WARN_ON_ONCE(copy->cp_stateid.cs_type != NFS4_COPY_STID);
nn = net_generic(copy->cp_clp->net, nfsd_net_id);
spin_lock(&nn->s2s_cp_lock);
idr_remove(&nn->s2s_cp_stateids,
- copy->cp_stateid.stid.si_opaque.so_id);
+ copy->cp_stateid.cs_stid.si_opaque.so_id);
spin_unlock(&nn->s2s_cp_lock);
}
@@ -1049,6 +1072,12 @@ static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp)
static void nfs4_free_deleg(struct nfs4_stid *stid)
{
+ struct nfs4_delegation *dp = delegstateid(stid);
+
+ WARN_ON_ONCE(!list_empty(&stid->sc_cp_list));
+ WARN_ON_ONCE(!list_empty(&dp->dl_perfile));
+ WARN_ON_ONCE(!list_empty(&dp->dl_perclnt));
+ WARN_ON_ONCE(!list_empty(&dp->dl_recall_lru));
kmem_cache_free(deleg_slab, stid);
atomic_long_dec(&num_delegations);
}
@@ -1338,6 +1367,8 @@ static void revoke_delegation(struct nfs4_delegation *dp)
WARN_ON(!list_empty(&dp->dl_recall_lru));
+ trace_nfsd_stid_revoke(&dp->dl_stid);
+
if (clp->cl_minorversion) {
dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID;
refcount_inc(&dp->dl_stid.sc_count);
@@ -1462,6 +1493,7 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid)
release_all_access(stp);
if (stp->st_stateowner)
nfs4_put_stateowner(stp->st_stateowner);
+ WARN_ON(!list_empty(&stid->sc_cp_list));
kmem_cache_free(stateid_slab, stid);
}
@@ -1801,13 +1833,12 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
int numslots = fattrs->maxreqs;
int slotsize = slot_bytes(fattrs);
struct nfsd4_session *new;
- int mem, i;
+ int i;
- BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *)
- + sizeof(struct nfsd4_session) > PAGE_SIZE);
- mem = numslots * sizeof(struct nfsd4_slot *);
+ BUILD_BUG_ON(struct_size(new, se_slots, NFSD_MAX_SLOTS_PER_SESSION)
+ > PAGE_SIZE);
- new = kzalloc(sizeof(*new) + mem, GFP_KERNEL);
+ new = kzalloc(struct_size(new, se_slots, numslots), GFP_KERNEL);
if (!new)
return NULL;
/* allocate each struct nfsd4_slot and data cache in one piece */
@@ -2113,6 +2144,7 @@ static void __free_client(struct kref *k)
kfree(clp->cl_nii_domain.data);
kfree(clp->cl_nii_name.data);
idr_destroy(&clp->cl_stateids);
+ kfree(clp->cl_ra);
kmem_cache_free(client_slab, clp);
}
@@ -2233,6 +2265,7 @@ __destroy_client(struct nfs4_client *clp)
if (clp->cl_cb_conn.cb_xprt)
svc_xprt_put(clp->cl_cb_conn.cb_xprt);
atomic_add_unless(&nn->nfs4_client_count, -1, 0);
+ nfsd4_dec_courtesy_client_count(nn, clp);
free_client(clp);
wake_up_all(&expiry_wq);
}
@@ -2478,7 +2511,7 @@ static const char *cb_state2str(int state)
static int client_info_show(struct seq_file *m, void *v)
{
- struct inode *inode = m->private;
+ struct inode *inode = file_inode(m->file);
struct nfs4_client *clp;
u64 clid;
@@ -2518,17 +2551,7 @@ static int client_info_show(struct seq_file *m, void *v)
return 0;
}
-static int client_info_open(struct inode *inode, struct file *file)
-{
- return single_open(file, client_info_show, inode);
-}
-
-static const struct file_operations client_info_fops = {
- .open = client_info_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(client_info);
static void *states_start(struct seq_file *s, loff_t *pos)
__acquires(&clp->cl_lock)
@@ -2604,9 +2627,11 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
ols = openlockstateid(st);
oo = ols->st_stateowner;
nf = st->sc_file;
- file = find_any_file(nf);
+
+ spin_lock(&nf->fi_lock);
+ file = find_any_file_locked(nf);
if (!file)
- return 0;
+ goto out;
seq_printf(s, "- ");
nfs4_show_stateid(s, &st->sc_stateid);
@@ -2628,8 +2653,8 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
seq_printf(s, ", ");
nfs4_show_owner(s, oo);
seq_printf(s, " }\n");
- nfsd_file_put(file);
-
+out:
+ spin_unlock(&nf->fi_lock);
return 0;
}
@@ -2643,9 +2668,10 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st)
ols = openlockstateid(st);
oo = ols->st_stateowner;
nf = st->sc_file;
- file = find_any_file(nf);
+ spin_lock(&nf->fi_lock);
+ file = find_any_file_locked(nf);
if (!file)
- return 0;
+ goto out;
seq_printf(s, "- ");
nfs4_show_stateid(s, &st->sc_stateid);
@@ -2665,8 +2691,8 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st)
seq_printf(s, ", ");
nfs4_show_owner(s, oo);
seq_printf(s, " }\n");
- nfsd_file_put(file);
-
+out:
+ spin_unlock(&nf->fi_lock);
return 0;
}
@@ -2678,9 +2704,10 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
ds = delegstateid(st);
nf = st->sc_file;
- file = find_deleg_file(nf);
+ spin_lock(&nf->fi_lock);
+ file = find_deleg_file_locked(nf);
if (!file)
- return 0;
+ goto out;
seq_printf(s, "- ");
nfs4_show_stateid(s, &st->sc_stateid);
@@ -2696,8 +2723,8 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
seq_printf(s, ", ");
nfs4_show_fname(s, file);
seq_printf(s, " }\n");
- nfsd_file_put(file);
-
+out:
+ spin_unlock(&nf->fi_lock);
return 0;
}
@@ -2845,6 +2872,37 @@ static const struct tree_descr client_files[] = {
[3] = {""},
};
+static int
+nfsd4_cb_recall_any_done(struct nfsd4_callback *cb,
+ struct rpc_task *task)
+{
+ trace_nfsd_cb_recall_any_done(cb, task);
+ switch (task->tk_status) {
+ case -NFS4ERR_DELAY:
+ rpc_delay(task, 2 * HZ);
+ return 0;
+ default:
+ return 1;
+ }
+}
+
+static void
+nfsd4_cb_recall_any_release(struct nfsd4_callback *cb)
+{
+ struct nfs4_client *clp = cb->cb_clp;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+ spin_lock(&nn->client_lock);
+ clear_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags);
+ put_client_renew_locked(clp);
+ spin_unlock(&nn->client_lock);
+}
+
+static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = {
+ .done = nfsd4_cb_recall_any_done,
+ .release = nfsd4_cb_recall_any_release,
+};
+
static struct nfs4_client *create_client(struct xdr_netobj name,
struct svc_rqst *rqstp, nfs4_verifier *verf)
{
@@ -2882,6 +2940,14 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
free_client(clp);
return NULL;
}
+ clp->cl_ra = kzalloc(sizeof(*clp->cl_ra), GFP_KERNEL);
+ if (!clp->cl_ra) {
+ free_client(clp);
+ return NULL;
+ }
+ clp->cl_ra_time = 0;
+ nfsd4_init_cb(&clp->cl_ra->ra_cb, clp, &nfsd4_cb_recall_any_ops,
+ NFSPROC4_CLNT_CB_RECALL_ANY);
return clp;
}
@@ -4251,11 +4317,9 @@ static struct nfs4_file *nfsd4_alloc_file(void)
}
/* OPEN Share state helper functions */
-static void nfsd4_init_file(struct svc_fh *fh, unsigned int hashval,
- struct nfs4_file *fp)
-{
- lockdep_assert_held(&state_lock);
+static void nfsd4_file_init(const struct svc_fh *fh, struct nfs4_file *fp)
+{
refcount_set(&fp->fi_ref, 1);
spin_lock_init(&fp->fi_lock);
INIT_LIST_HEAD(&fp->fi_stateids);
@@ -4273,7 +4337,6 @@ static void nfsd4_init_file(struct svc_fh *fh, unsigned int hashval,
INIT_LIST_HEAD(&fp->fi_lo_states);
atomic_set(&fp->fi_lo_recalls, 0);
#endif
- hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]);
}
void
@@ -4337,7 +4400,29 @@ out:
return -ENOMEM;
}
-void nfsd4_init_leases_net(struct nfsd_net *nn)
+static unsigned long
+nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int count;
+ struct nfsd_net *nn = container_of(shrink,
+ struct nfsd_net, nfsd_client_shrinker);
+
+ count = atomic_read(&nn->nfsd_courtesy_clients);
+ if (!count)
+ count = atomic_long_read(&num_delegations);
+ if (count)
+ mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
+ return (unsigned long)count;
+}
+
+static unsigned long
+nfsd4_state_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ return SHRINK_STOP;
+}
+
+int
+nfsd4_init_leases_net(struct nfsd_net *nn)
{
struct sysinfo si;
u64 max_clients;
@@ -4346,8 +4431,8 @@ void nfsd4_init_leases_net(struct nfsd_net *nn)
nn->nfsd4_grace = 90;
nn->somebody_reclaimed = false;
nn->track_reclaim_completes = false;
- nn->clverifier_counter = prandom_u32();
- nn->clientid_base = prandom_u32();
+ nn->clverifier_counter = get_random_u32();
+ nn->clientid_base = get_random_u32();
nn->clientid_counter = nn->clientid_base + 1;
nn->s2s_cp_cl_id = nn->clientid_counter++;
@@ -4356,6 +4441,18 @@ void nfsd4_init_leases_net(struct nfsd_net *nn)
max_clients = (u64)si.totalram * si.mem_unit / (1024 * 1024 * 1024);
max_clients *= NFS4_CLIENTS_PER_GB;
nn->nfs4_max_clients = max_t(int, max_clients, NFS4_CLIENTS_PER_GB);
+
+ atomic_set(&nn->nfsd_courtesy_clients, 0);
+ nn->nfsd_client_shrinker.scan_objects = nfsd4_state_shrinker_scan;
+ nn->nfsd_client_shrinker.count_objects = nfsd4_state_shrinker_count;
+ nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS;
+ return register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client");
+}
+
+void
+nfsd4_leases_net_shutdown(struct nfsd_net *nn)
+{
+ unregister_shrinker(&nn->nfsd_client_shrinker);
}
static void init_nfs4_replay(struct nfs4_replay *rp)
@@ -4626,71 +4723,80 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
nfs4_put_stid(&last->st_stid);
}
-/* search file_hashtbl[] for file */
-static struct nfs4_file *
-find_file_locked(struct svc_fh *fh, unsigned int hashval)
+static noinline_for_stack struct nfs4_file *
+nfsd4_file_hash_lookup(const struct svc_fh *fhp)
{
- struct nfs4_file *fp;
+ struct inode *inode = d_inode(fhp->fh_dentry);
+ struct rhlist_head *tmp, *list;
+ struct nfs4_file *fi;
- hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash,
- lockdep_is_held(&state_lock)) {
- if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) {
- if (refcount_inc_not_zero(&fp->fi_ref))
- return fp;
+ rcu_read_lock();
+ list = rhltable_lookup(&nfs4_file_rhltable, &inode,
+ nfs4_file_rhash_params);
+ rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
+ if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
+ if (refcount_inc_not_zero(&fi->fi_ref)) {
+ rcu_read_unlock();
+ return fi;
+ }
}
}
+ rcu_read_unlock();
return NULL;
}
-static struct nfs4_file *insert_file(struct nfs4_file *new, struct svc_fh *fh,
- unsigned int hashval)
+/*
+ * On hash insertion, identify entries with the same inode but
+ * distinct filehandles. They will all be on the list returned
+ * by rhltable_lookup().
+ *
+ * inode->i_lock prevents racing insertions from adding an entry
+ * for the same inode/fhp pair twice.
+ */
+static noinline_for_stack struct nfs4_file *
+nfsd4_file_hash_insert(struct nfs4_file *new, const struct svc_fh *fhp)
{
- struct nfs4_file *fp;
+ struct inode *inode = d_inode(fhp->fh_dentry);
+ struct rhlist_head *tmp, *list;
struct nfs4_file *ret = NULL;
bool alias_found = false;
+ struct nfs4_file *fi;
+ int err;
- spin_lock(&state_lock);
- hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash,
- lockdep_is_held(&state_lock)) {
- if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) {
- if (refcount_inc_not_zero(&fp->fi_ref))
- ret = fp;
- } else if (d_inode(fh->fh_dentry) == fp->fi_inode)
- fp->fi_aliased = alias_found = true;
- }
- if (likely(ret == NULL)) {
- nfsd4_init_file(fh, hashval, new);
- new->fi_aliased = alias_found;
- ret = new;
+ rcu_read_lock();
+ spin_lock(&inode->i_lock);
+
+ list = rhltable_lookup(&nfs4_file_rhltable, &inode,
+ nfs4_file_rhash_params);
+ rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
+ if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
+ if (refcount_inc_not_zero(&fi->fi_ref))
+ ret = fi;
+ } else
+ fi->fi_aliased = alias_found = true;
}
- spin_unlock(&state_lock);
- return ret;
-}
+ if (ret)
+ goto out_unlock;
-static struct nfs4_file * find_file(struct svc_fh *fh)
-{
- struct nfs4_file *fp;
- unsigned int hashval = file_hashval(fh);
+ nfsd4_file_init(fhp, new);
+ err = rhltable_insert(&nfs4_file_rhltable, &new->fi_rlist,
+ nfs4_file_rhash_params);
+ if (err)
+ goto out_unlock;
- rcu_read_lock();
- fp = find_file_locked(fh, hashval);
+ new->fi_aliased = alias_found;
+ ret = new;
+
+out_unlock:
+ spin_unlock(&inode->i_lock);
rcu_read_unlock();
- return fp;
+ return ret;
}
-static struct nfs4_file *
-find_or_add_file(struct nfs4_file *new, struct svc_fh *fh)
+static noinline_for_stack void nfsd4_file_hash_remove(struct nfs4_file *fi)
{
- struct nfs4_file *fp;
- unsigned int hashval = file_hashval(fh);
-
- rcu_read_lock();
- fp = find_file_locked(fh, hashval);
- rcu_read_unlock();
- if (fp)
- return fp;
-
- return insert_file(new, fh, hashval);
+ rhltable_remove(&nfs4_file_rhltable, &fi->fi_rlist,
+ nfs4_file_rhash_params);
}
/*
@@ -4703,9 +4809,10 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
struct nfs4_file *fp;
__be32 ret = nfs_ok;
- fp = find_file(current_fh);
+ fp = nfsd4_file_hash_lookup(current_fh);
if (!fp)
return ret;
+
/* Check for conflicting share reservations */
spin_lock(&fp->fi_lock);
if (fp->fi_share_deny & deny_type)
@@ -4715,6 +4822,35 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
return ret;
}
+static bool nfsd4_deleg_present(const struct inode *inode)
+{
+ struct file_lock_context *ctx = locks_inode_context(inode);
+
+ return ctx && !list_empty_careful(&ctx->flc_lease);
+}
+
+/**
+ * nfsd_wait_for_delegreturn - wait for delegations to be returned
+ * @rqstp: the RPC transaction being executed
+ * @inode: in-core inode of the file being waited for
+ *
+ * The timeout prevents deadlock if all nfsd threads happen to be
+ * tied up waiting for returning delegations.
+ *
+ * Return values:
+ * %true: delegation was returned
+ * %false: timed out waiting for delegreturn
+ */
+bool nfsd_wait_for_delegreturn(struct svc_rqst *rqstp, struct inode *inode)
+{
+ long __maybe_unused timeo;
+
+ timeo = wait_var_event_timeout(inode, !nfsd4_deleg_present(inode),
+ NFSD_DELEGRETURN_TIMEOUT);
+ trace_nfsd_delegret_wakeup(rqstp, inode, timeo);
+ return timeo > 0;
+}
+
static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb)
{
struct nfs4_delegation *dp = cb_to_delegation(cb);
@@ -4743,6 +4879,8 @@ static int nfsd4_cb_recall_done(struct nfsd4_callback *cb,
{
struct nfs4_delegation *dp = cb_to_delegation(cb);
+ trace_nfsd_cb_recall_done(&dp->dl_stid.sc_stateid, task);
+
if (dp->dl_stid.sc_type == NFS4_CLOSED_DELEG_STID ||
dp->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID)
return 1;
@@ -4788,18 +4926,17 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
* We're assuming the state code never drops its reference
* without first removing the lease. Since we're in this lease
* callback (and since the lease code is serialized by the
- * i_lock) we know the server hasn't removed the lease yet, and
+ * flc_lock) we know the server hasn't removed the lease yet, and
* we know it's safe to take a reference.
*/
refcount_inc(&dp->dl_stid.sc_count);
- nfsd4_run_cb(&dp->dl_recall);
+ WARN_ON_ONCE(!nfsd4_run_cb(&dp->dl_recall));
}
-/* Called from break_lease() with i_lock held. */
+/* Called from break_lease() with flc_lock held. */
static bool
nfsd_break_deleg_cb(struct file_lock *fl)
{
- bool ret = false;
struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
struct nfs4_file *fp = dp->dl_stid.sc_file;
struct nfs4_client *clp = dp->dl_stid.sc_client;
@@ -4825,7 +4962,7 @@ nfsd_break_deleg_cb(struct file_lock *fl)
fp->fi_had_conflict = true;
nfsd_break_one_deleg(dp);
spin_unlock(&fp->fi_lock);
- return ret;
+ return false;
}
/**
@@ -5311,6 +5448,7 @@ nfsd4_verify_deleg_dentry(struct nfsd4_open *open, struct nfs4_file *fp,
if (err)
return -EAGAIN;
+ exp_put(exp);
dput(child);
if (child != file_dentry(fp->fi_deleg_file->nf_file))
return -EAGAIN;
@@ -5548,7 +5686,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
* and check for delegations in the process of being recalled.
* If not found, create the nfs4_file struct
*/
- fp = find_or_add_file(open->op_file, current_fh);
+ fp = nfsd4_file_hash_insert(open->op_file, current_fh);
+ if (unlikely(!fp))
+ return nfserr_jukebox;
if (fp != open->op_file) {
status = nfs4_check_deleg(cl, open, &dp);
if (status)
@@ -5825,7 +5965,7 @@ nfs4_lockowner_has_blockers(struct nfs4_lockowner *lo)
list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) {
nf = stp->st_stid.sc_file;
- ctx = nf->fi_inode->i_flctx;
+ ctx = locks_inode_context(nf->fi_inode);
if (!ctx)
continue;
if (locks_owner_has_blockers(ctx, lo))
@@ -5878,8 +6018,11 @@ nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist,
goto exp_client;
if (!state_expired(lt, clp->cl_time))
break;
- if (!atomic_read(&clp->cl_rpc_users))
+ if (!atomic_read(&clp->cl_rpc_users)) {
+ if (clp->cl_state == NFSD4_ACTIVE)
+ atomic_inc(&nn->nfsd_courtesy_clients);
clp->cl_state = NFSD4_COURTESY;
+ }
if (!client_has_state(clp))
goto exp_client;
if (!nfs4_anylock_blockers(clp))
@@ -5894,10 +6037,49 @@ exp_client:
spin_unlock(&nn->client_lock);
}
+static void
+nfs4_get_courtesy_client_reaplist(struct nfsd_net *nn,
+ struct list_head *reaplist)
+{
+ unsigned int maxreap = 0, reapcnt = 0;
+ struct list_head *pos, *next;
+ struct nfs4_client *clp;
+
+ maxreap = NFSD_CLIENT_MAX_TRIM_PER_RUN;
+ INIT_LIST_HEAD(reaplist);
+
+ spin_lock(&nn->client_lock);
+ list_for_each_safe(pos, next, &nn->client_lru) {
+ clp = list_entry(pos, struct nfs4_client, cl_lru);
+ if (clp->cl_state == NFSD4_ACTIVE)
+ break;
+ if (reapcnt >= maxreap)
+ break;
+ if (!mark_client_expired_locked(clp)) {
+ list_add(&clp->cl_lru, reaplist);
+ reapcnt++;
+ }
+ }
+ spin_unlock(&nn->client_lock);
+}
+
+static void
+nfs4_process_client_reaplist(struct list_head *reaplist)
+{
+ struct list_head *pos, *next;
+ struct nfs4_client *clp;
+
+ list_for_each_safe(pos, next, reaplist) {
+ clp = list_entry(pos, struct nfs4_client, cl_lru);
+ trace_nfsd_clid_purged(&clp->cl_clientid);
+ list_del_init(&clp->cl_lru);
+ expire_client(clp);
+ }
+}
+
static time64_t
nfs4_laundromat(struct nfsd_net *nn)
{
- struct nfs4_client *clp;
struct nfs4_openowner *oo;
struct nfs4_delegation *dp;
struct nfs4_ol_stateid *stp;
@@ -5920,18 +6102,14 @@ nfs4_laundromat(struct nfsd_net *nn)
spin_lock(&nn->s2s_cp_lock);
idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) {
cps = container_of(cps_t, struct nfs4_cpntf_state, cp_stateid);
- if (cps->cp_stateid.sc_type == NFS4_COPYNOTIFY_STID &&
+ if (cps->cp_stateid.cs_type == NFS4_COPYNOTIFY_STID &&
state_expired(&lt, cps->cpntf_time))
_free_cpntf_state_locked(nn, cps);
}
spin_unlock(&nn->s2s_cp_lock);
nfs4_get_client_reaplist(nn, &reaplist, &lt);
- list_for_each_safe(pos, next, &reaplist) {
- clp = list_entry(pos, struct nfs4_client, cl_lru);
- trace_nfsd_clid_purged(&clp->cl_clientid);
- list_del_init(&clp->cl_lru);
- expire_client(clp);
- }
+ nfs4_process_client_reaplist(&reaplist);
+
spin_lock(&state_lock);
list_for_each_safe(pos, next, &nn->del_recall_lru) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
@@ -6014,6 +6192,65 @@ laundromat_main(struct work_struct *laundry)
queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ);
}
+static void
+courtesy_client_reaper(struct nfsd_net *nn)
+{
+ struct list_head reaplist;
+
+ nfs4_get_courtesy_client_reaplist(nn, &reaplist);
+ nfs4_process_client_reaplist(&reaplist);
+}
+
+static void
+deleg_reaper(struct nfsd_net *nn)
+{
+ struct list_head *pos, *next;
+ struct nfs4_client *clp;
+ struct list_head cblist;
+
+ INIT_LIST_HEAD(&cblist);
+ spin_lock(&nn->client_lock);
+ list_for_each_safe(pos, next, &nn->client_lru) {
+ clp = list_entry(pos, struct nfs4_client, cl_lru);
+ if (clp->cl_state != NFSD4_ACTIVE ||
+ list_empty(&clp->cl_delegations) ||
+ atomic_read(&clp->cl_delegs_in_recall) ||
+ test_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags) ||
+ (ktime_get_boottime_seconds() -
+ clp->cl_ra_time < 5)) {
+ continue;
+ }
+ list_add(&clp->cl_ra_cblist, &cblist);
+
+ /* release in nfsd4_cb_recall_any_release */
+ atomic_inc(&clp->cl_rpc_users);
+ set_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags);
+ clp->cl_ra_time = ktime_get_boottime_seconds();
+ }
+ spin_unlock(&nn->client_lock);
+
+ while (!list_empty(&cblist)) {
+ clp = list_first_entry(&cblist, struct nfs4_client,
+ cl_ra_cblist);
+ list_del_init(&clp->cl_ra_cblist);
+ clp->cl_ra->ra_keep = 0;
+ clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG);
+ trace_nfsd_cb_recall_any(clp->cl_ra);
+ nfsd4_run_cb(&clp->cl_ra->ra_cb);
+ }
+}
+
+static void
+nfsd4_state_shrinker_worker(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct nfsd_net *nn = container_of(dwork, struct nfsd_net,
+ nfsd_shrinker_work);
+
+ courtesy_client_reaper(nn);
+ deleg_reaper(nn);
+}
+
static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)
{
if (!fh_match(&fhp->fh_handle, &stp->sc_file->fi_fhandle))
@@ -6149,6 +6386,7 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
struct nfs4_stid **s, struct nfsd_net *nn)
{
__be32 status;
+ struct nfs4_stid *stid;
bool return_revoked = false;
/*
@@ -6171,15 +6409,16 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
}
if (status)
return status;
- *s = find_stateid_by_type(cstate->clp, stateid, typemask);
- if (!*s)
+ stid = find_stateid_by_type(cstate->clp, stateid, typemask);
+ if (!stid)
return nfserr_bad_stateid;
- if (((*s)->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) {
- nfs4_put_stid(*s);
+ if ((stid->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) {
+ nfs4_put_stid(stid);
if (cstate->minorversion)
return nfserr_deleg_revoked;
return nfserr_bad_stateid;
}
+ *s = stid;
return nfs_ok;
}
@@ -6244,12 +6483,12 @@ out:
static void
_free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps)
{
- WARN_ON_ONCE(cps->cp_stateid.sc_type != NFS4_COPYNOTIFY_STID);
- if (!refcount_dec_and_test(&cps->cp_stateid.sc_count))
+ WARN_ON_ONCE(cps->cp_stateid.cs_type != NFS4_COPYNOTIFY_STID);
+ if (!refcount_dec_and_test(&cps->cp_stateid.cs_count))
return;
list_del(&cps->cp_list);
idr_remove(&nn->s2s_cp_stateids,
- cps->cp_stateid.stid.si_opaque.so_id);
+ cps->cp_stateid.cs_stid.si_opaque.so_id);
kfree(cps);
}
/*
@@ -6271,12 +6510,12 @@ __be32 manage_cpntf_state(struct nfsd_net *nn, stateid_t *st,
if (cps_t) {
state = container_of(cps_t, struct nfs4_cpntf_state,
cp_stateid);
- if (state->cp_stateid.sc_type != NFS4_COPYNOTIFY_STID) {
+ if (state->cp_stateid.cs_type != NFS4_COPYNOTIFY_STID) {
state = NULL;
goto unlock;
}
if (!clp)
- refcount_inc(&state->cp_stateid.sc_count);
+ refcount_inc(&state->cp_stateid.cs_count);
else
_free_cpntf_state_locked(nn, state);
}
@@ -6684,6 +6923,7 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
struct nfs4_client *clp = s->st_stid.sc_client;
bool unhashed;
LIST_HEAD(reaplist);
+ struct nfs4_ol_stateid *stp;
spin_lock(&clp->cl_lock);
unhashed = unhash_open_stateid(s, &reaplist);
@@ -6692,6 +6932,8 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
if (unhashed)
put_ol_stateid_locked(s, &reaplist);
spin_unlock(&clp->cl_lock);
+ list_for_each_entry(stp, &reaplist, st_locks)
+ nfs4_free_cpntf_statelist(clp->net, &stp->st_stid);
free_ol_stateid_reaplist(&reaplist);
} else {
spin_unlock(&clp->cl_lock);
@@ -6775,6 +7017,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
goto put_stateid;
+ trace_nfsd_deleg_return(stateid);
+ wake_up_var(d_inode(cstate->current_fh.fh_dentry));
destroy_delegation(dp);
put_stateid:
nfs4_put_stid(&dp->dl_stid);
@@ -7585,7 +7829,7 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
}
inode = locks_inode(nf->nf_file);
- flctx = inode->i_flctx;
+ flctx = locks_inode_context(inode);
if (flctx && !list_empty_careful(&flctx->flc_posix)) {
spin_lock(&flctx->flc_lock);
@@ -7830,6 +8074,7 @@ static int nfs4_state_create_net(struct net *net)
INIT_LIST_HEAD(&nn->blocked_locks_lru);
INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main);
+ INIT_DELAYED_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker);
get_net(net);
return 0;
@@ -7905,10 +8150,16 @@ nfs4_state_start(void)
{
int ret;
- ret = nfsd4_create_callback_queue();
+ ret = rhltable_init(&nfs4_file_rhltable, &nfs4_file_rhash_params);
if (ret)
return ret;
+ ret = nfsd4_create_callback_queue();
+ if (ret) {
+ rhltable_destroy(&nfs4_file_rhltable);
+ return ret;
+ }
+
set_max_delegations();
return 0;
}
@@ -7939,6 +8190,7 @@ nfs4_state_shutdown_net(struct net *net)
nfsd4_client_tracking_exit(net);
nfs4_state_destroy_net(net);
+ rhltable_destroy(&nfs4_file_rhltable);
#ifdef CONFIG_NFSD_V4_2_INTER_SSC
nfsd4_ssc_shutdown_umount(nn);
#endif
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 1e9690a061ec..2b4ae858c89b 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -42,6 +42,8 @@
#include <linux/sunrpc/svcauth_gss.h>
#include <linux/sunrpc/addr.h>
#include <linux/xattr.h>
+#include <linux/vmalloc.h>
+
#include <uapi/linux/xattr.h>
#include "idmap.h"
@@ -768,16 +770,18 @@ nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs)
static __be32
nfsd4_decode_access(struct nfsd4_compoundargs *argp,
- struct nfsd4_access *access)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_access *access = &u->access;
if (xdr_stream_decode_u32(argp->xdr, &access->ac_req_access) < 0)
return nfserr_bad_xdr;
return nfs_ok;
}
static __be32
-nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
+nfsd4_decode_close(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_close *close = &u->close;
if (xdr_stream_decode_u32(argp->xdr, &close->cl_seqid) < 0)
return nfserr_bad_xdr;
return nfsd4_decode_stateid4(argp, &close->cl_stateid);
@@ -785,20 +789,24 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
static __be32
-nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit)
+nfsd4_decode_commit(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_commit *commit = &u->commit;
if (xdr_stream_decode_u64(argp->xdr, &commit->co_offset) < 0)
return nfserr_bad_xdr;
if (xdr_stream_decode_u32(argp->xdr, &commit->co_count) < 0)
return nfserr_bad_xdr;
+ memset(&commit->co_verf, 0, sizeof(commit->co_verf));
return nfs_ok;
}
static __be32
-nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create)
+nfsd4_decode_create(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_create *create = &u->create;
__be32 *p, status;
+ memset(create, 0, sizeof(*create));
if (xdr_stream_decode_u32(argp->xdr, &create->cr_type) < 0)
return nfserr_bad_xdr;
switch (create->cr_type) {
@@ -840,21 +848,26 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
}
static inline __be32
-nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr)
+nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_delegreturn *dr = &u->delegreturn;
return nfsd4_decode_stateid4(argp, &dr->dr_stateid);
}
static inline __be32
-nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr)
+nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_getattr *getattr = &u->getattr;
+ memset(getattr, 0, sizeof(*getattr));
return nfsd4_decode_bitmap4(argp, getattr->ga_bmval,
ARRAY_SIZE(getattr->ga_bmval));
}
static __be32
-nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link)
+nfsd4_decode_link(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_link *link = &u->link;
+ memset(link, 0, sizeof(*link));
return nfsd4_decode_component4(argp, &link->li_name, &link->li_namelen);
}
@@ -901,8 +914,10 @@ nfsd4_decode_locker4(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
}
static __be32
-nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
+nfsd4_decode_lock(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_lock *lock = &u->lock;
+ memset(lock, 0, sizeof(*lock));
if (xdr_stream_decode_u32(argp->xdr, &lock->lk_type) < 0)
return nfserr_bad_xdr;
if ((lock->lk_type < NFS4_READ_LT) || (lock->lk_type > NFS4_WRITEW_LT))
@@ -917,8 +932,10 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
}
static __be32
-nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
+nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_lockt *lockt = &u->lockt;
+ memset(lockt, 0, sizeof(*lockt));
if (xdr_stream_decode_u32(argp->xdr, &lockt->lt_type) < 0)
return nfserr_bad_xdr;
if ((lockt->lt_type < NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT))
@@ -932,8 +949,9 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
}
static __be32
-nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
+nfsd4_decode_locku(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_locku *locku = &u->locku;
__be32 status;
if (xdr_stream_decode_u32(argp->xdr, &locku->lu_type) < 0)
@@ -954,8 +972,9 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
}
static __be32
-nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup)
+nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_lookup *lookup = &u->lookup;
return nfsd4_decode_component4(argp, &lookup->lo_name, &lookup->lo_len);
}
@@ -1135,16 +1154,14 @@ nfsd4_decode_open_claim4(struct nfsd4_compoundargs *argp,
}
static __be32
-nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
+nfsd4_decode_open(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_open *open = &u->open;
__be32 status;
u32 dummy;
- memset(open->op_bmval, 0, sizeof(open->op_bmval));
- open->op_iattr.ia_valid = 0;
- open->op_openowner = NULL;
+ memset(open, 0, sizeof(*open));
- open->op_xdr_error = 0;
if (xdr_stream_decode_u32(argp->xdr, &open->op_seqid) < 0)
return nfserr_bad_xdr;
/* deleg_want is ignored */
@@ -1166,8 +1183,10 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
}
static __be32
-nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf)
+nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_open_confirm *open_conf = &u->open_confirm;
__be32 status;
if (argp->minorversion >= 1)
@@ -1179,14 +1198,19 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con
if (xdr_stream_decode_u32(argp->xdr, &open_conf->oc_seqid) < 0)
return nfserr_bad_xdr;
+ memset(&open_conf->oc_resp_stateid, 0,
+ sizeof(open_conf->oc_resp_stateid));
return nfs_ok;
}
static __be32
-nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_downgrade *open_down)
+nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_open_downgrade *open_down = &u->open_downgrade;
__be32 status;
+ memset(open_down, 0, sizeof(*open_down));
status = nfsd4_decode_stateid4(argp, &open_down->od_stateid);
if (status)
return status;
@@ -1201,8 +1225,9 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
}
static __be32
-nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh)
+nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_putfh *putfh = &u->putfh;
__be32 *p;
if (xdr_stream_decode_u32(argp->xdr, &putfh->pf_fhlen) < 0)
@@ -1216,11 +1241,12 @@ nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh)
if (!putfh->pf_fhval)
return nfserr_jukebox;
+ putfh->no_verify = false;
return nfs_ok;
}
static __be32
-nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, void *p)
+nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, union nfsd4_op_u *p)
{
if (argp->minorversion == 0)
return nfs_ok;
@@ -1228,10 +1254,12 @@ nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, void *p)
}
static __be32
-nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
+nfsd4_decode_read(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_read *read = &u->read;
__be32 status;
+ memset(read, 0, sizeof(*read));
status = nfsd4_decode_stateid4(argp, &read->rd_stateid);
if (status)
return status;
@@ -1244,10 +1272,12 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
}
static __be32
-nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *readdir)
+nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_readdir *readdir = &u->readdir;
__be32 status;
+ memset(readdir, 0, sizeof(*readdir));
if (xdr_stream_decode_u64(argp->xdr, &readdir->rd_cookie) < 0)
return nfserr_bad_xdr;
status = nfsd4_decode_verifier4(argp, &readdir->rd_verf);
@@ -1265,16 +1295,20 @@ nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *read
}
static __be32
-nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove)
+nfsd4_decode_remove(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_remove *remove = &u->remove;
+ memset(&remove->rm_cinfo, 0, sizeof(remove->rm_cinfo));
return nfsd4_decode_component4(argp, &remove->rm_name, &remove->rm_namelen);
}
static __be32
-nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename)
+nfsd4_decode_rename(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_rename *rename = &u->rename;
__be32 status;
+ memset(rename, 0, sizeof(*rename));
status = nfsd4_decode_component4(argp, &rename->rn_sname, &rename->rn_snamelen);
if (status)
return status;
@@ -1282,23 +1316,28 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename
}
static __be32
-nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid)
+nfsd4_decode_renew(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ clientid_t *clientid = &u->renew;
return nfsd4_decode_clientid4(argp, clientid);
}
static __be32
nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
- struct nfsd4_secinfo *secinfo)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_secinfo *secinfo = &u->secinfo;
+ secinfo->si_exp = NULL;
return nfsd4_decode_component4(argp, &secinfo->si_name, &secinfo->si_namelen);
}
static __be32
-nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
+nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_setattr *setattr = &u->setattr;
__be32 status;
+ memset(setattr, 0, sizeof(*setattr));
status = nfsd4_decode_stateid4(argp, &setattr->sa_stateid);
if (status)
return status;
@@ -1309,10 +1348,13 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
}
static __be32
-nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid *setclientid)
+nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_setclientid *setclientid = &u->setclientid;
__be32 *p, status;
+ memset(setclientid, 0, sizeof(*setclientid));
+
if (argp->minorversion >= 1)
return nfserr_notsupp;
@@ -1350,8 +1392,10 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient
}
static __be32
-nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid_confirm *scd_c)
+nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_setclientid_confirm *scd_c = &u->setclientid_confirm;
__be32 status;
if (argp->minorversion >= 1)
@@ -1365,10 +1409,13 @@ nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_s
/* Also used for NVERIFY */
static __be32
-nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify)
+nfsd4_decode_verify(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_verify *verify = &u->verify;
__be32 *p, status;
+ memset(verify, 0, sizeof(*verify));
+
status = nfsd4_decode_bitmap4(argp, verify->ve_bmval,
ARRAY_SIZE(verify->ve_bmval));
if (status)
@@ -1390,8 +1437,9 @@ nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify
}
static __be32
-nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
+nfsd4_decode_write(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_write *write = &u->write;
__be32 status;
status = nfsd4_decode_stateid4(argp, &write->wr_stateid);
@@ -1408,12 +1456,17 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
if (!xdr_stream_subsegment(argp->xdr, &write->wr_payload, write->wr_buflen))
return nfserr_bad_xdr;
+ write->wr_bytes_written = 0;
+ write->wr_how_written = 0;
+ memset(&write->wr_verifier, 0, sizeof(write->wr_verifier));
return nfs_ok;
}
static __be32
-nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_release_lockowner *rlockowner)
+nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_release_lockowner *rlockowner = &u->release_lockowner;
__be32 status;
if (argp->minorversion >= 1)
@@ -1430,18 +1483,24 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
return nfs_ok;
}
-static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc)
+static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_backchannel_ctl *bc = &u->backchannel_ctl;
+ memset(bc, 0, sizeof(*bc));
if (xdr_stream_decode_u32(argp->xdr, &bc->bc_cb_program) < 0)
return nfserr_bad_xdr;
return nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec);
}
-static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
+static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session;
u32 use_conn_in_rdma_mode;
__be32 status;
+ memset(bcts, 0, sizeof(*bcts));
status = nfsd4_decode_sessionid4(argp, &bcts->sessionid);
if (status)
return status;
@@ -1579,10 +1638,12 @@ nfsd4_decode_nfs_impl_id4(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
- struct nfsd4_exchange_id *exid)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_exchange_id *exid = &u->exchange_id;
__be32 status;
+ memset(exid, 0, sizeof(*exid));
status = nfsd4_decode_verifier4(argp, &exid->verifier);
if (status)
return status;
@@ -1631,10 +1692,12 @@ nfsd4_decode_channel_attrs4(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
- struct nfsd4_create_session *sess)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_create_session *sess = &u->create_session;
__be32 status;
+ memset(sess, 0, sizeof(*sess));
status = nfsd4_decode_clientid4(argp, &sess->clientid);
if (status)
return status;
@@ -1650,34 +1713,34 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
return status;
if (xdr_stream_decode_u32(argp->xdr, &sess->callback_prog) < 0)
return nfserr_bad_xdr;
- status = nfsd4_decode_cb_sec(argp, &sess->cb_sec);
- if (status)
- return status;
-
- return nfs_ok;
+ return nfsd4_decode_cb_sec(argp, &sess->cb_sec);
}
static __be32
nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp,
- struct nfsd4_destroy_session *destroy_session)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_destroy_session *destroy_session = &u->destroy_session;
return nfsd4_decode_sessionid4(argp, &destroy_session->sessionid);
}
static __be32
nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp,
- struct nfsd4_free_stateid *free_stateid)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_free_stateid *free_stateid = &u->free_stateid;
return nfsd4_decode_stateid4(argp, &free_stateid->fr_stateid);
}
#ifdef CONFIG_NFSD_PNFS
static __be32
nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp,
- struct nfsd4_getdeviceinfo *gdev)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo;
__be32 status;
+ memset(gdev, 0, sizeof(*gdev));
status = nfsd4_decode_deviceid4(argp, &gdev->gd_devid);
if (status)
return status;
@@ -1694,10 +1757,12 @@ nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
- struct nfsd4_layoutcommit *lcp)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_layoutcommit *lcp = &u->layoutcommit;
__be32 *p, status;
+ memset(lcp, 0, sizeof(*lcp));
if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_seg.offset) < 0)
return nfserr_bad_xdr;
if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_seg.length) < 0)
@@ -1729,10 +1794,12 @@ nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
- struct nfsd4_layoutget *lgp)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_layoutget *lgp = &u->layoutget;
__be32 status;
+ memset(lgp, 0, sizeof(*lgp));
if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_signal) < 0)
return nfserr_bad_xdr;
if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_layout_type) < 0)
@@ -1756,8 +1823,10 @@ nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
- struct nfsd4_layoutreturn *lrp)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_layoutreturn *lrp = &u->layoutreturn;
+ memset(lrp, 0, sizeof(*lrp));
if (xdr_stream_decode_bool(argp->xdr, &lrp->lr_reclaim) < 0)
return nfserr_bad_xdr;
if (xdr_stream_decode_u32(argp->xdr, &lrp->lr_layout_type) < 0)
@@ -1769,17 +1838,21 @@ nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
#endif /* CONFIG_NFSD_PNFS */
static __be32 nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp,
- struct nfsd4_secinfo_no_name *sin)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_secinfo_no_name *sin = &u->secinfo_no_name;
if (xdr_stream_decode_u32(argp->xdr, &sin->sin_style) < 0)
return nfserr_bad_xdr;
+
+ sin->sin_exp = NULL;
return nfs_ok;
}
static __be32
nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
- struct nfsd4_sequence *seq)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_sequence *seq = &u->sequence;
__be32 *p, status;
status = nfsd4_decode_sessionid4(argp, &seq->sessionid);
@@ -1793,16 +1866,20 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
seq->maxslots = be32_to_cpup(p++);
seq->cachethis = be32_to_cpup(p);
+ seq->status_flags = 0;
return nfs_ok;
}
static __be32
-nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_stateid *test_stateid)
+nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_test_stateid *test_stateid = &u->test_stateid;
struct nfsd4_test_stateid_id *stateid;
__be32 status;
u32 i;
+ memset(test_stateid, 0, sizeof(*test_stateid));
if (xdr_stream_decode_u32(argp->xdr, &test_stateid->ts_num_ids) < 0)
return nfserr_bad_xdr;
@@ -1822,14 +1899,16 @@ nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_sta
}
static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp,
- struct nfsd4_destroy_clientid *dc)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_destroy_clientid *dc = &u->destroy_clientid;
return nfsd4_decode_clientid4(argp, &dc->clientid);
}
static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp,
- struct nfsd4_reclaim_complete *rc)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_reclaim_complete *rc = &u->reclaim_complete;
if (xdr_stream_decode_bool(argp->xdr, &rc->rca_one_fs) < 0)
return nfserr_bad_xdr;
return nfs_ok;
@@ -1837,8 +1916,9 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp,
- struct nfsd4_fallocate *fallocate)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_fallocate *fallocate = &u->allocate;
__be32 status;
status = nfsd4_decode_stateid4(argp, &fallocate->falloc_stateid);
@@ -1894,12 +1974,14 @@ static __be32 nfsd4_decode_nl4_server(struct nfsd4_compoundargs *argp,
}
static __be32
-nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
+nfsd4_decode_copy(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_copy *copy = &u->copy;
u32 consecutive, i, count, sync;
struct nl4_server *ns_dummy;
__be32 status;
+ memset(copy, 0, sizeof(*copy));
status = nfsd4_decode_stateid4(argp, &copy->cp_src_stateid);
if (status)
return status;
@@ -1951,10 +2033,12 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
static __be32
nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp,
- struct nfsd4_copy_notify *cn)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_copy_notify *cn = &u->copy_notify;
__be32 status;
+ memset(cn, 0, sizeof(*cn));
cn->cpn_src = svcxdr_tmpalloc(argp, sizeof(*cn->cpn_src));
if (cn->cpn_src == NULL)
return nfserr_jukebox;
@@ -1970,14 +2054,18 @@ nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_offload_status(struct nfsd4_compoundargs *argp,
- struct nfsd4_offload_status *os)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_offload_status *os = &u->offload_status;
+ os->count = 0;
+ os->status = 0;
return nfsd4_decode_stateid4(argp, &os->stateid);
}
static __be32
-nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
+nfsd4_decode_seek(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_seek *seek = &u->seek;
__be32 status;
status = nfsd4_decode_stateid4(argp, &seek->seek_stateid);
@@ -1988,12 +2076,15 @@ nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
if (xdr_stream_decode_u32(argp->xdr, &seek->seek_whence) < 0)
return nfserr_bad_xdr;
+ seek->seek_eof = 0;
+ seek->seek_pos = 0;
return nfs_ok;
}
static __be32
-nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone)
+nfsd4_decode_clone(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_clone *clone = &u->clone;
__be32 status;
status = nfsd4_decode_stateid4(argp, &clone->cl_src_stateid);
@@ -2118,11 +2209,13 @@ nfsd4_decode_xattr_name(struct nfsd4_compoundargs *argp, char **namep)
*/
static __be32
nfsd4_decode_getxattr(struct nfsd4_compoundargs *argp,
- struct nfsd4_getxattr *getxattr)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_getxattr *getxattr = &u->getxattr;
__be32 status;
u32 maxcount;
+ memset(getxattr, 0, sizeof(*getxattr));
status = nfsd4_decode_xattr_name(argp, &getxattr->getxa_name);
if (status)
return status;
@@ -2131,17 +2224,19 @@ nfsd4_decode_getxattr(struct nfsd4_compoundargs *argp,
maxcount = min_t(u32, XATTR_SIZE_MAX, maxcount);
getxattr->getxa_len = maxcount;
-
- return status;
+ return nfs_ok;
}
static __be32
nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp,
- struct nfsd4_setxattr *setxattr)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_setxattr *setxattr = &u->setxattr;
u32 flags, maxcount, size;
__be32 status;
+ memset(setxattr, 0, sizeof(*setxattr));
+
if (xdr_stream_decode_u32(argp->xdr, &flags) < 0)
return nfserr_bad_xdr;
@@ -2176,10 +2271,13 @@ nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp,
- struct nfsd4_listxattrs *listxattrs)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_listxattrs *listxattrs = &u->listxattrs;
u32 maxcount;
+ memset(listxattrs, 0, sizeof(*listxattrs));
+
if (xdr_stream_decode_u64(argp->xdr, &listxattrs->lsxa_cookie) < 0)
return nfserr_bad_xdr;
@@ -2205,112 +2303,114 @@ nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_removexattr(struct nfsd4_compoundargs *argp,
- struct nfsd4_removexattr *removexattr)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_removexattr *removexattr = &u->removexattr;
+ memset(removexattr, 0, sizeof(*removexattr));
return nfsd4_decode_xattr_name(argp, &removexattr->rmxa_name);
}
static __be32
-nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
+nfsd4_decode_noop(struct nfsd4_compoundargs *argp, union nfsd4_op_u *p)
{
return nfs_ok;
}
static __be32
-nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p)
+nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, union nfsd4_op_u *p)
{
return nfserr_notsupp;
}
-typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *);
+typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u);
static const nfsd4_dec nfsd4_dec_ops[] = {
- [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access,
- [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close,
- [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit,
- [OP_CREATE] = (nfsd4_dec)nfsd4_decode_create,
- [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_DELEGRETURN] = (nfsd4_dec)nfsd4_decode_delegreturn,
- [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr,
- [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop,
- [OP_LINK] = (nfsd4_dec)nfsd4_decode_link,
- [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock,
- [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt,
- [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku,
- [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup,
- [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop,
- [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify,
- [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open,
- [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm,
- [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade,
- [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh,
- [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_putpubfh,
- [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop,
- [OP_READ] = (nfsd4_dec)nfsd4_decode_read,
- [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir,
- [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop,
- [OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove,
- [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename,
- [OP_RENEW] = (nfsd4_dec)nfsd4_decode_renew,
- [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop,
- [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop,
- [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo,
- [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr,
- [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_setclientid,
- [OP_SETCLIENTID_CONFIRM] = (nfsd4_dec)nfsd4_decode_setclientid_confirm,
- [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify,
- [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write,
- [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner,
+ [OP_ACCESS] = nfsd4_decode_access,
+ [OP_CLOSE] = nfsd4_decode_close,
+ [OP_COMMIT] = nfsd4_decode_commit,
+ [OP_CREATE] = nfsd4_decode_create,
+ [OP_DELEGPURGE] = nfsd4_decode_notsupp,
+ [OP_DELEGRETURN] = nfsd4_decode_delegreturn,
+ [OP_GETATTR] = nfsd4_decode_getattr,
+ [OP_GETFH] = nfsd4_decode_noop,
+ [OP_LINK] = nfsd4_decode_link,
+ [OP_LOCK] = nfsd4_decode_lock,
+ [OP_LOCKT] = nfsd4_decode_lockt,
+ [OP_LOCKU] = nfsd4_decode_locku,
+ [OP_LOOKUP] = nfsd4_decode_lookup,
+ [OP_LOOKUPP] = nfsd4_decode_noop,
+ [OP_NVERIFY] = nfsd4_decode_verify,
+ [OP_OPEN] = nfsd4_decode_open,
+ [OP_OPENATTR] = nfsd4_decode_notsupp,
+ [OP_OPEN_CONFIRM] = nfsd4_decode_open_confirm,
+ [OP_OPEN_DOWNGRADE] = nfsd4_decode_open_downgrade,
+ [OP_PUTFH] = nfsd4_decode_putfh,
+ [OP_PUTPUBFH] = nfsd4_decode_putpubfh,
+ [OP_PUTROOTFH] = nfsd4_decode_noop,
+ [OP_READ] = nfsd4_decode_read,
+ [OP_READDIR] = nfsd4_decode_readdir,
+ [OP_READLINK] = nfsd4_decode_noop,
+ [OP_REMOVE] = nfsd4_decode_remove,
+ [OP_RENAME] = nfsd4_decode_rename,
+ [OP_RENEW] = nfsd4_decode_renew,
+ [OP_RESTOREFH] = nfsd4_decode_noop,
+ [OP_SAVEFH] = nfsd4_decode_noop,
+ [OP_SECINFO] = nfsd4_decode_secinfo,
+ [OP_SETATTR] = nfsd4_decode_setattr,
+ [OP_SETCLIENTID] = nfsd4_decode_setclientid,
+ [OP_SETCLIENTID_CONFIRM] = nfsd4_decode_setclientid_confirm,
+ [OP_VERIFY] = nfsd4_decode_verify,
+ [OP_WRITE] = nfsd4_decode_write,
+ [OP_RELEASE_LOCKOWNER] = nfsd4_decode_release_lockowner,
/* new operations for NFSv4.1 */
- [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_backchannel_ctl,
- [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session,
- [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id,
- [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session,
- [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
- [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid,
- [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_BACKCHANNEL_CTL] = nfsd4_decode_backchannel_ctl,
+ [OP_BIND_CONN_TO_SESSION] = nfsd4_decode_bind_conn_to_session,
+ [OP_EXCHANGE_ID] = nfsd4_decode_exchange_id,
+ [OP_CREATE_SESSION] = nfsd4_decode_create_session,
+ [OP_DESTROY_SESSION] = nfsd4_decode_destroy_session,
+ [OP_FREE_STATEID] = nfsd4_decode_free_stateid,
+ [OP_GET_DIR_DELEGATION] = nfsd4_decode_notsupp,
#ifdef CONFIG_NFSD_PNFS
- [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdeviceinfo,
- [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit,
- [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget,
- [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn,
+ [OP_GETDEVICEINFO] = nfsd4_decode_getdeviceinfo,
+ [OP_GETDEVICELIST] = nfsd4_decode_notsupp,
+ [OP_LAYOUTCOMMIT] = nfsd4_decode_layoutcommit,
+ [OP_LAYOUTGET] = nfsd4_decode_layoutget,
+ [OP_LAYOUTRETURN] = nfsd4_decode_layoutreturn,
#else
- [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_GETDEVICEINFO] = nfsd4_decode_notsupp,
+ [OP_GETDEVICELIST] = nfsd4_decode_notsupp,
+ [OP_LAYOUTCOMMIT] = nfsd4_decode_notsupp,
+ [OP_LAYOUTGET] = nfsd4_decode_notsupp,
+ [OP_LAYOUTRETURN] = nfsd4_decode_notsupp,
#endif
- [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name,
- [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
- [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid,
- [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid,
- [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete,
+ [OP_SECINFO_NO_NAME] = nfsd4_decode_secinfo_no_name,
+ [OP_SEQUENCE] = nfsd4_decode_sequence,
+ [OP_SET_SSV] = nfsd4_decode_notsupp,
+ [OP_TEST_STATEID] = nfsd4_decode_test_stateid,
+ [OP_WANT_DELEGATION] = nfsd4_decode_notsupp,
+ [OP_DESTROY_CLIENTID] = nfsd4_decode_destroy_clientid,
+ [OP_RECLAIM_COMPLETE] = nfsd4_decode_reclaim_complete,
/* new operations for NFSv4.2 */
- [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate,
- [OP_COPY] = (nfsd4_dec)nfsd4_decode_copy,
- [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_copy_notify,
- [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate,
- [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_offload_status,
- [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_offload_status,
- [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_read,
- [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek,
- [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_CLONE] = (nfsd4_dec)nfsd4_decode_clone,
+ [OP_ALLOCATE] = nfsd4_decode_fallocate,
+ [OP_COPY] = nfsd4_decode_copy,
+ [OP_COPY_NOTIFY] = nfsd4_decode_copy_notify,
+ [OP_DEALLOCATE] = nfsd4_decode_fallocate,
+ [OP_IO_ADVISE] = nfsd4_decode_notsupp,
+ [OP_LAYOUTERROR] = nfsd4_decode_notsupp,
+ [OP_LAYOUTSTATS] = nfsd4_decode_notsupp,
+ [OP_OFFLOAD_CANCEL] = nfsd4_decode_offload_status,
+ [OP_OFFLOAD_STATUS] = nfsd4_decode_offload_status,
+ [OP_READ_PLUS] = nfsd4_decode_read,
+ [OP_SEEK] = nfsd4_decode_seek,
+ [OP_WRITE_SAME] = nfsd4_decode_notsupp,
+ [OP_CLONE] = nfsd4_decode_clone,
/* RFC 8276 extended atributes operations */
- [OP_GETXATTR] = (nfsd4_dec)nfsd4_decode_getxattr,
- [OP_SETXATTR] = (nfsd4_dec)nfsd4_decode_setxattr,
- [OP_LISTXATTRS] = (nfsd4_dec)nfsd4_decode_listxattrs,
- [OP_REMOVEXATTR] = (nfsd4_dec)nfsd4_decode_removexattr,
+ [OP_GETXATTR] = nfsd4_decode_getxattr,
+ [OP_SETXATTR] = nfsd4_decode_setxattr,
+ [OP_LISTXATTRS] = nfsd4_decode_listxattrs,
+ [OP_REMOVEXATTR] = nfsd4_decode_removexattr,
};
static inline bool
@@ -2357,22 +2457,15 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
if (xdr_stream_decode_u32(argp->xdr, &argp->minorversion) < 0)
return false;
- if (xdr_stream_decode_u32(argp->xdr, &argp->opcnt) < 0)
+ if (xdr_stream_decode_u32(argp->xdr, &argp->client_opcnt) < 0)
return false;
-
- /*
- * NFS4ERR_RESOURCE is a more helpful error than GARBAGE_ARGS
- * here, so we return success at the xdr level so that
- * nfsd4_proc can handle this is an NFS-level error.
- */
- if (argp->opcnt > NFSD_MAX_OPS_PER_COMPOUND)
- return true;
+ argp->opcnt = min_t(u32, argp->client_opcnt,
+ NFSD_MAX_OPS_PER_COMPOUND);
if (argp->opcnt > ARRAY_SIZE(argp->iops)) {
- argp->ops = kzalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL);
+ argp->ops = vcalloc(argp->opcnt, sizeof(*argp->ops));
if (!argp->ops) {
argp->ops = argp->iops;
- dprintk("nfsd: couldn't allocate room for COMPOUND\n");
return false;
}
}
@@ -2774,9 +2867,10 @@ static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *bmval2, u32
}
-static int get_parent_attributes(struct svc_export *exp, struct kstat *stat)
+static int nfsd4_get_mounted_on_ino(struct svc_export *exp, u64 *pino)
{
struct path path = exp->ex_path;
+ struct kstat stat;
int err;
path_get(&path);
@@ -2784,8 +2878,10 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat)
if (path.dentry != path.mnt->mnt_root)
break;
}
- err = vfs_getattr(&path, stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
+ err = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT);
path_put(&path);
+ if (!err)
+ *pino = stat.ino;
return err;
}
@@ -3282,22 +3378,21 @@ out_acl:
*p++ = cpu_to_be32(stat.btime.tv_nsec);
}
if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
- struct kstat parent_stat;
u64 ino = stat.ino;
p = xdr_reserve_space(xdr, 8);
if (!p)
goto out_resource;
/*
- * Get parent's attributes if not ignoring crossmount
- * and this is the root of a cross-mounted filesystem.
+ * Get ino of mountpoint in parent filesystem, if not ignoring
+ * crossmount and this is the root of a cross-mounted
+ * filesystem.
*/
if (ignore_crossmnt == 0 &&
dentry == exp->ex_path.mnt->mnt_root) {
- err = get_parent_attributes(exp, &parent_stat);
+ err = nfsd4_get_mounted_on_ino(exp, &ino);
if (err)
goto out_nfserr;
- ino = parent_stat.ino;
}
p = xdr_encode_hyper(p, ino);
}
@@ -3594,8 +3689,10 @@ nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
}
static __be32
-nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
+nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_access *access = &u->access;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -3607,8 +3704,10 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
return 0;
}
-static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts)
+static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -3624,8 +3723,10 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp,
}
static __be32
-nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
+nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_close *close = &u->close;
struct xdr_stream *xdr = resp->xdr;
return nfsd4_encode_stateid(xdr, &close->cl_stateid);
@@ -3633,8 +3734,10 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c
static __be32
-nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit)
+nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_commit *commit = &u->commit;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -3647,8 +3750,10 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
}
static __be32
-nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create)
+nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_create *create = &u->create;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -3661,8 +3766,10 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
}
static __be32
-nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr)
+nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_getattr *getattr = &u->getattr;
struct svc_fh *fhp = getattr->ga_fhp;
struct xdr_stream *xdr = resp->xdr;
@@ -3671,8 +3778,10 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
}
static __be32
-nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp)
+nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct svc_fh **fhpp = &u->getfh;
struct xdr_stream *xdr = resp->xdr;
struct svc_fh *fhp = *fhpp;
unsigned int len;
@@ -3726,8 +3835,10 @@ again:
}
static __be32
-nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock)
+nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_lock *lock = &u->lock;
struct xdr_stream *xdr = resp->xdr;
if (!nfserr)
@@ -3739,8 +3850,10 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo
}
static __be32
-nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt)
+nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_lockt *lockt = &u->lockt;
struct xdr_stream *xdr = resp->xdr;
if (nfserr == nfserr_denied)
@@ -3749,8 +3862,10 @@ nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
}
static __be32
-nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku)
+nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_locku *locku = &u->locku;
struct xdr_stream *xdr = resp->xdr;
return nfsd4_encode_stateid(xdr, &locku->lu_stateid);
@@ -3758,8 +3873,10 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
static __be32
-nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link)
+nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_link *link = &u->link;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -3772,8 +3889,10 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li
static __be32
-nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
+nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_open *open = &u->open;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -3866,16 +3985,20 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
}
static __be32
-nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
+nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_open_confirm *oc = &u->open_confirm;
struct xdr_stream *xdr = resp->xdr;
return nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid);
}
static __be32
-nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
+nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_open_downgrade *od = &u->open_downgrade;
struct xdr_stream *xdr = resp->xdr;
return nfsd4_encode_stateid(xdr, &od->od_stateid);
@@ -3974,8 +4097,9 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
static __be32
nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_read *read)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_read *read = &u->read;
bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);
unsigned long maxcount;
struct xdr_stream *xdr = resp->xdr;
@@ -3994,7 +4118,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
}
if (resp->xdr->buf->page_len && splice_ok) {
WARN_ON_ONCE(1);
- return nfserr_resource;
+ return nfserr_serverfault;
}
xdr_commit_encode(xdr);
@@ -4016,8 +4140,10 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
}
static __be32
-nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink)
+nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_readlink *readlink = &u->readlink;
__be32 *p, *maxcount_p, zero = xdr_zero;
struct xdr_stream *xdr = resp->xdr;
int length_offset = xdr->buf->len;
@@ -4061,8 +4187,10 @@ out_err:
}
static __be32
-nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readdir *readdir)
+nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_readdir *readdir = &u->readdir;
int maxcount;
int bytes_left;
loff_t offset;
@@ -4152,8 +4280,10 @@ err_no_verf:
}
static __be32
-nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove)
+nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_remove *remove = &u->remove;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -4165,8 +4295,10 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
}
static __be32
-nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename)
+nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_rename *rename = &u->rename;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -4248,8 +4380,9 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp)
static __be32
nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_secinfo *secinfo)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_secinfo *secinfo = &u->secinfo;
struct xdr_stream *xdr = resp->xdr;
return nfsd4_do_encode_secinfo(xdr, secinfo->si_exp);
@@ -4257,8 +4390,9 @@ nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
static __be32
nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_secinfo_no_name *secinfo)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_secinfo_no_name *secinfo = &u->secinfo_no_name;
struct xdr_stream *xdr = resp->xdr;
return nfsd4_do_encode_secinfo(xdr, secinfo->sin_exp);
@@ -4269,8 +4403,10 @@ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
* regardless of the error status.
*/
static __be32
-nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr)
+nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_setattr *setattr = &u->setattr;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -4293,8 +4429,10 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
}
static __be32
-nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd)
+nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_setclientid *scd = &u->setclientid;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -4317,8 +4455,10 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n
}
static __be32
-nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write)
+nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_write *write = &u->write;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -4334,8 +4474,9 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
static __be32
nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_exchange_id *exid)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_exchange_id *exid = &u->exchange_id;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
char *major_id;
@@ -4412,8 +4553,9 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
static __be32
nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_create_session *sess)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_create_session *sess = &u->create_session;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -4465,8 +4607,9 @@ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr,
static __be32
nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_sequence *seq)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_sequence *seq = &u->sequence;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -4488,8 +4631,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr,
static __be32
nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_test_stateid *test_stateid)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_test_stateid *test_stateid = &u->test_stateid;
struct xdr_stream *xdr = resp->xdr;
struct nfsd4_test_stateid_id *stateid, *next;
__be32 *p;
@@ -4509,8 +4653,9 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
#ifdef CONFIG_NFSD_PNFS
static __be32
nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_getdeviceinfo *gdev)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo;
struct xdr_stream *xdr = resp->xdr;
const struct nfsd4_layout_ops *ops;
u32 starting_len = xdr->buf->len, needed_len;
@@ -4565,8 +4710,9 @@ toosmall:
static __be32
nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_layoutget *lgp)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_layoutget *lgp = &u->layoutget;
struct xdr_stream *xdr = resp->xdr;
const struct nfsd4_layout_ops *ops;
__be32 *p;
@@ -4592,8 +4738,9 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
static __be32
nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_layoutcommit *lcp)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_layoutcommit *lcp = &u->layoutcommit;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -4613,8 +4760,9 @@ nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
static __be32
nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_layoutreturn *lrp)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_layoutreturn *lrp = &u->layoutreturn;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -4699,8 +4847,9 @@ nfsd42_encode_nl4_server(struct nfsd4_compoundres *resp, struct nl4_server *ns)
static __be32
nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_copy *copy)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_copy *copy = &u->copy;
__be32 *p;
nfserr = nfsd42_encode_write_res(resp, &copy->cp_res,
@@ -4716,8 +4865,9 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
static __be32
nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_offload_status *os)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_offload_status *os = &u->offload_status;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -4731,156 +4881,83 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
static __be32
nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
- struct nfsd4_read *read,
- unsigned long *maxcount, u32 *eof,
- loff_t *pos)
+ struct nfsd4_read *read)
{
- struct xdr_stream *xdr = resp->xdr;
+ bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);
struct file *file = read->rd_nf->nf_file;
- int starting_len = xdr->buf->len;
- loff_t hole_pos;
- __be32 nfserr;
- __be32 *p, tmp;
- __be64 tmp64;
-
- hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE);
- if (hole_pos > read->rd_offset)
- *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset);
- *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len));
+ struct xdr_stream *xdr = resp->xdr;
+ unsigned long maxcount;
+ __be32 nfserr, *p;
/* Content type, offset, byte count */
p = xdr_reserve_space(xdr, 4 + 8 + 4);
if (!p)
- return nfserr_resource;
+ return nfserr_io;
+ if (resp->xdr->buf->page_len && splice_ok) {
+ WARN_ON_ONCE(splice_ok);
+ return nfserr_serverfault;
+ }
- read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount);
- if (read->rd_vlen < 0)
- return nfserr_resource;
+ maxcount = min_t(unsigned long, read->rd_length,
+ (xdr->buf->buflen - xdr->buf->len));
- nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
- resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof);
+ if (file->f_op->splice_read && splice_ok)
+ nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
+ else
+ nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
if (nfserr)
return nfserr;
- xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount));
-
- tmp = htonl(NFS4_CONTENT_DATA);
- write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4);
- tmp64 = cpu_to_be64(read->rd_offset);
- write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp64, 8);
- tmp = htonl(*maxcount);
- write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp, 4);
-
- tmp = xdr_zero;
- write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp,
- xdr_pad_size(*maxcount));
- return nfs_ok;
-}
-
-static __be32
-nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
- struct nfsd4_read *read,
- unsigned long *maxcount, u32 *eof)
-{
- struct file *file = read->rd_nf->nf_file;
- loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
- loff_t f_size = i_size_read(file_inode(file));
- unsigned long count;
- __be32 *p;
-
- if (data_pos == -ENXIO)
- data_pos = f_size;
- else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE))
- return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size);
- count = data_pos - read->rd_offset;
-
- /* Content type, offset, byte count */
- p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
- if (!p)
- return nfserr_resource;
- *p++ = htonl(NFS4_CONTENT_HOLE);
+ *p++ = cpu_to_be32(NFS4_CONTENT_DATA);
p = xdr_encode_hyper(p, read->rd_offset);
- p = xdr_encode_hyper(p, count);
+ *p = cpu_to_be32(read->rd_length);
- *eof = (read->rd_offset + count) >= f_size;
- *maxcount = min_t(unsigned long, count, *maxcount);
return nfs_ok;
}
static __be32
nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_read *read)
+ union nfsd4_op_u *u)
{
- unsigned long maxcount, count;
+ struct nfsd4_read *read = &u->read;
+ struct file *file = read->rd_nf->nf_file;
struct xdr_stream *xdr = resp->xdr;
- struct file *file;
int starting_len = xdr->buf->len;
- int last_segment = xdr->buf->len;
- int segments = 0;
- __be32 *p, tmp;
- bool is_data;
- loff_t pos;
- u32 eof;
+ u32 segments = 0;
+ __be32 *p;
if (nfserr)
return nfserr;
- file = read->rd_nf->nf_file;
/* eof flag, segment count */
p = xdr_reserve_space(xdr, 4 + 4);
if (!p)
- return nfserr_resource;
+ return nfserr_io;
xdr_commit_encode(xdr);
- maxcount = min_t(unsigned long, read->rd_length,
- (xdr->buf->buflen - xdr->buf->len));
- count = maxcount;
-
- eof = read->rd_offset >= i_size_read(file_inode(file));
- if (eof)
+ read->rd_eof = read->rd_offset >= i_size_read(file_inode(file));
+ if (read->rd_eof)
goto out;
- pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
- is_data = pos > read->rd_offset;
-
- while (count > 0 && !eof) {
- maxcount = count;
- if (is_data)
- nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof,
- segments == 0 ? &pos : NULL);
- else
- nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof);
- if (nfserr)
- goto out;
- count -= maxcount;
- read->rd_offset += maxcount;
- is_data = !is_data;
- last_segment = xdr->buf->len;
- segments++;
- }
-
-out:
- if (nfserr && segments == 0)
+ nfserr = nfsd4_encode_read_plus_data(resp, read);
+ if (nfserr) {
xdr_truncate_encode(xdr, starting_len);
- else {
- if (nfserr) {
- xdr_truncate_encode(xdr, last_segment);
- nfserr = nfs_ok;
- eof = 0;
- }
- tmp = htonl(eof);
- write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4);
- tmp = htonl(segments);
- write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
+ return nfserr;
}
+ segments++;
+
+out:
+ p = xdr_encode_bool(p, read->rd_eof);
+ *p = cpu_to_be32(segments);
return nfserr;
}
static __be32
nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_copy_notify *cn)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_copy_notify *cn = &u->copy_notify;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -4914,8 +4991,9 @@ nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr,
static __be32
nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_seek *seek)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_seek *seek = &u->seek;
__be32 *p;
p = xdr_reserve_space(resp->xdr, 4 + 8);
@@ -4926,7 +5004,8 @@ nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
}
static __be32
-nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
+nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *p)
{
return nfserr;
}
@@ -4977,8 +5056,9 @@ nfsd4_vbuf_to_stream(struct xdr_stream *xdr, char *buf, u32 buflen)
static __be32
nfsd4_encode_getxattr(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_getxattr *getxattr)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_getxattr *getxattr = &u->getxattr;
struct xdr_stream *xdr = resp->xdr;
__be32 *p, err;
@@ -5001,8 +5081,9 @@ nfsd4_encode_getxattr(struct nfsd4_compoundres *resp, __be32 nfserr,
static __be32
nfsd4_encode_setxattr(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_setxattr *setxattr)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_setxattr *setxattr = &u->setxattr;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -5042,8 +5123,9 @@ nfsd4_listxattr_validate_cookie(struct nfsd4_listxattrs *listxattrs,
static __be32
nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_listxattrs *listxattrs)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_listxattrs *listxattrs = &u->listxattrs;
struct xdr_stream *xdr = resp->xdr;
u32 cookie_offset, count_offset, eof;
u32 left, xdrleft, slen, count;
@@ -5153,8 +5235,9 @@ out:
static __be32
nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_removexattr *removexattr)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_removexattr *removexattr = &u->removexattr;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -5166,7 +5249,7 @@ nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr,
return 0;
}
-typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *);
+typedef __be32(*nfsd4_enc)(struct nfsd4_compoundres *, __be32, union nfsd4_op_u *u);
/*
* Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1
@@ -5174,93 +5257,93 @@ typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *);
* done in the decoding phase.
*/
static const nfsd4_enc nfsd4_enc_ops[] = {
- [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access,
- [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close,
- [OP_COMMIT] = (nfsd4_enc)nfsd4_encode_commit,
- [OP_CREATE] = (nfsd4_enc)nfsd4_encode_create,
- [OP_DELEGPURGE] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_DELEGRETURN] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_GETATTR] = (nfsd4_enc)nfsd4_encode_getattr,
- [OP_GETFH] = (nfsd4_enc)nfsd4_encode_getfh,
- [OP_LINK] = (nfsd4_enc)nfsd4_encode_link,
- [OP_LOCK] = (nfsd4_enc)nfsd4_encode_lock,
- [OP_LOCKT] = (nfsd4_enc)nfsd4_encode_lockt,
- [OP_LOCKU] = (nfsd4_enc)nfsd4_encode_locku,
- [OP_LOOKUP] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_OPEN] = (nfsd4_enc)nfsd4_encode_open,
- [OP_OPENATTR] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm,
- [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade,
- [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_PUTPUBFH] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_PUTROOTFH] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_READ] = (nfsd4_enc)nfsd4_encode_read,
- [OP_READDIR] = (nfsd4_enc)nfsd4_encode_readdir,
- [OP_READLINK] = (nfsd4_enc)nfsd4_encode_readlink,
- [OP_REMOVE] = (nfsd4_enc)nfsd4_encode_remove,
- [OP_RENAME] = (nfsd4_enc)nfsd4_encode_rename,
- [OP_RENEW] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_RESTOREFH] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_SAVEFH] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_SECINFO] = (nfsd4_enc)nfsd4_encode_secinfo,
- [OP_SETATTR] = (nfsd4_enc)nfsd4_encode_setattr,
- [OP_SETCLIENTID] = (nfsd4_enc)nfsd4_encode_setclientid,
- [OP_SETCLIENTID_CONFIRM] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write,
- [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_ACCESS] = nfsd4_encode_access,
+ [OP_CLOSE] = nfsd4_encode_close,
+ [OP_COMMIT] = nfsd4_encode_commit,
+ [OP_CREATE] = nfsd4_encode_create,
+ [OP_DELEGPURGE] = nfsd4_encode_noop,
+ [OP_DELEGRETURN] = nfsd4_encode_noop,
+ [OP_GETATTR] = nfsd4_encode_getattr,
+ [OP_GETFH] = nfsd4_encode_getfh,
+ [OP_LINK] = nfsd4_encode_link,
+ [OP_LOCK] = nfsd4_encode_lock,
+ [OP_LOCKT] = nfsd4_encode_lockt,
+ [OP_LOCKU] = nfsd4_encode_locku,
+ [OP_LOOKUP] = nfsd4_encode_noop,
+ [OP_LOOKUPP] = nfsd4_encode_noop,
+ [OP_NVERIFY] = nfsd4_encode_noop,
+ [OP_OPEN] = nfsd4_encode_open,
+ [OP_OPENATTR] = nfsd4_encode_noop,
+ [OP_OPEN_CONFIRM] = nfsd4_encode_open_confirm,
+ [OP_OPEN_DOWNGRADE] = nfsd4_encode_open_downgrade,
+ [OP_PUTFH] = nfsd4_encode_noop,
+ [OP_PUTPUBFH] = nfsd4_encode_noop,
+ [OP_PUTROOTFH] = nfsd4_encode_noop,
+ [OP_READ] = nfsd4_encode_read,
+ [OP_READDIR] = nfsd4_encode_readdir,
+ [OP_READLINK] = nfsd4_encode_readlink,
+ [OP_REMOVE] = nfsd4_encode_remove,
+ [OP_RENAME] = nfsd4_encode_rename,
+ [OP_RENEW] = nfsd4_encode_noop,
+ [OP_RESTOREFH] = nfsd4_encode_noop,
+ [OP_SAVEFH] = nfsd4_encode_noop,
+ [OP_SECINFO] = nfsd4_encode_secinfo,
+ [OP_SETATTR] = nfsd4_encode_setattr,
+ [OP_SETCLIENTID] = nfsd4_encode_setclientid,
+ [OP_SETCLIENTID_CONFIRM] = nfsd4_encode_noop,
+ [OP_VERIFY] = nfsd4_encode_noop,
+ [OP_WRITE] = nfsd4_encode_write,
+ [OP_RELEASE_LOCKOWNER] = nfsd4_encode_noop,
/* NFSv4.1 operations */
- [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session,
- [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id,
- [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session,
- [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_BACKCHANNEL_CTL] = nfsd4_encode_noop,
+ [OP_BIND_CONN_TO_SESSION] = nfsd4_encode_bind_conn_to_session,
+ [OP_EXCHANGE_ID] = nfsd4_encode_exchange_id,
+ [OP_CREATE_SESSION] = nfsd4_encode_create_session,
+ [OP_DESTROY_SESSION] = nfsd4_encode_noop,
+ [OP_FREE_STATEID] = nfsd4_encode_noop,
+ [OP_GET_DIR_DELEGATION] = nfsd4_encode_noop,
#ifdef CONFIG_NFSD_PNFS
- [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdeviceinfo,
- [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit,
- [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget,
- [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn,
+ [OP_GETDEVICEINFO] = nfsd4_encode_getdeviceinfo,
+ [OP_GETDEVICELIST] = nfsd4_encode_noop,
+ [OP_LAYOUTCOMMIT] = nfsd4_encode_layoutcommit,
+ [OP_LAYOUTGET] = nfsd4_encode_layoutget,
+ [OP_LAYOUTRETURN] = nfsd4_encode_layoutreturn,
#else
- [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_GETDEVICEINFO] = nfsd4_encode_noop,
+ [OP_GETDEVICELIST] = nfsd4_encode_noop,
+ [OP_LAYOUTCOMMIT] = nfsd4_encode_noop,
+ [OP_LAYOUTGET] = nfsd4_encode_noop,
+ [OP_LAYOUTRETURN] = nfsd4_encode_noop,
#endif
- [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name,
- [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
- [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_test_stateid,
- [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_SECINFO_NO_NAME] = nfsd4_encode_secinfo_no_name,
+ [OP_SEQUENCE] = nfsd4_encode_sequence,
+ [OP_SET_SSV] = nfsd4_encode_noop,
+ [OP_TEST_STATEID] = nfsd4_encode_test_stateid,
+ [OP_WANT_DELEGATION] = nfsd4_encode_noop,
+ [OP_DESTROY_CLIENTID] = nfsd4_encode_noop,
+ [OP_RECLAIM_COMPLETE] = nfsd4_encode_noop,
/* NFSv4.2 operations */
- [OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_COPY] = (nfsd4_enc)nfsd4_encode_copy,
- [OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_copy_notify,
- [OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_LAYOUTERROR] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_offload_status,
- [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_read_plus,
- [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek,
- [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_CLONE] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_ALLOCATE] = nfsd4_encode_noop,
+ [OP_COPY] = nfsd4_encode_copy,
+ [OP_COPY_NOTIFY] = nfsd4_encode_copy_notify,
+ [OP_DEALLOCATE] = nfsd4_encode_noop,
+ [OP_IO_ADVISE] = nfsd4_encode_noop,
+ [OP_LAYOUTERROR] = nfsd4_encode_noop,
+ [OP_LAYOUTSTATS] = nfsd4_encode_noop,
+ [OP_OFFLOAD_CANCEL] = nfsd4_encode_noop,
+ [OP_OFFLOAD_STATUS] = nfsd4_encode_offload_status,
+ [OP_READ_PLUS] = nfsd4_encode_read_plus,
+ [OP_SEEK] = nfsd4_encode_seek,
+ [OP_WRITE_SAME] = nfsd4_encode_noop,
+ [OP_CLONE] = nfsd4_encode_noop,
/* RFC 8276 extended atributes operations */
- [OP_GETXATTR] = (nfsd4_enc)nfsd4_encode_getxattr,
- [OP_SETXATTR] = (nfsd4_enc)nfsd4_encode_setxattr,
- [OP_LISTXATTRS] = (nfsd4_enc)nfsd4_encode_listxattrs,
- [OP_REMOVEXATTR] = (nfsd4_enc)nfsd4_encode_removexattr,
+ [OP_GETXATTR] = nfsd4_encode_getxattr,
+ [OP_SETXATTR] = nfsd4_encode_setxattr,
+ [OP_LISTXATTRS] = nfsd4_encode_listxattrs,
+ [OP_REMOVEXATTR] = nfsd4_encode_removexattr,
};
/*
@@ -5394,7 +5477,7 @@ void nfsd4_release_compoundargs(struct svc_rqst *rqstp)
struct nfsd4_compoundargs *args = rqstp->rq_argp;
if (args->ops != args->iops) {
- kfree(args->ops);
+ vfree(args->ops);
args->ops = args->iops;
}
while (args->to_free) {
@@ -5423,12 +5506,8 @@ bool
nfs4svc_encode_compoundres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
struct nfsd4_compoundres *resp = rqstp->rq_resp;
- struct xdr_buf *buf = xdr->buf;
__be32 *p;
- WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len +
- buf->tail[0].iov_len);
-
/*
* Send buffer space for the following items is reserved
* at the top of nfsd4_proc_compound().
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 9b31e1103e7b..3e64a3d50a1c 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -604,9 +604,10 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
* scraping this file for info should test the labels to ensure they're
* getting the correct field.
*/
-static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
+int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
{
- struct nfsd_net *nn = m->private;
+ struct nfsd_net *nn = net_generic(file_inode(m->file)->i_sb->s_fs_info,
+ nfsd_net_id);
seq_printf(m, "max entries: %u\n", nn->max_drc_entries);
seq_printf(m, "num entries: %u\n",
@@ -626,11 +627,3 @@ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
seq_printf(m, "cachesize at longest: %u\n", nn->longest_chain_cachesize);
return 0;
}
-
-int nfsd_reply_cache_stats_open(struct inode *inode, struct file *file)
-{
- struct nfsd_net *nn = net_generic(file_inode(file)->i_sb->s_fs_info,
- nfsd_net_id);
-
- return single_open(file, nfsd_reply_cache_stats_show, nn);
-}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 917fa1892fd2..d1e581a60480 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -185,17 +185,7 @@ static int export_features_show(struct seq_file *m, void *v)
return 0;
}
-static int export_features_open(struct inode *inode, struct file *file)
-{
- return single_open(file, export_features_show, NULL);
-}
-
-static const struct file_operations export_features_operations = {
- .open = export_features_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(export_features);
#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE)
static int supported_enctypes_show(struct seq_file *m, void *v)
@@ -204,17 +194,7 @@ static int supported_enctypes_show(struct seq_file *m, void *v)
return 0;
}
-static int supported_enctypes_open(struct inode *inode, struct file *file)
-{
- return single_open(file, supported_enctypes_show, NULL);
-}
-
-static const struct file_operations supported_enctypes_ops = {
- .open = supported_enctypes_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(supported_enctypes);
#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
static const struct file_operations pool_stats_operations = {
@@ -224,19 +204,9 @@ static const struct file_operations pool_stats_operations = {
.release = nfsd_pool_stats_release,
};
-static const struct file_operations reply_cache_stats_operations = {
- .open = nfsd_reply_cache_stats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(nfsd_reply_cache_stats);
-static const struct file_operations filecache_ops = {
- .open = nfsd_file_cache_stats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(nfsd_file_cache_stats);
/*----------------------------------------------------------------------------*/
/*
@@ -611,7 +581,9 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
cmd = sign == '-' ? NFSD_CLEAR : NFSD_SET;
switch(num) {
+#ifdef CONFIG_NFSD_V2
case 2:
+#endif
case 3:
nfsd_vers(nn, num, cmd);
break;
@@ -631,7 +603,9 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
}
break;
default:
- return -EINVAL;
+ /* Ignore requests to disable non-existent versions */
+ if (cmd == NFSD_SET)
+ return -EINVAL;
}
vers += len + 1;
} while ((len = qword_get(&mesg, vers, size)) > 0);
@@ -1365,7 +1339,7 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
/* Per-export io stats use same ops as exports file */
[NFSD_Export_Stats] = {"export_stats", &exports_nfsd_operations, S_IRUGO},
[NFSD_Export_features] = {"export_features",
- &export_features_operations, S_IRUGO},
+ &export_features_fops, S_IRUGO},
[NFSD_FO_UnlockIP] = {"unlock_ip",
&transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_FO_UnlockFS] = {"unlock_filesystem",
@@ -1374,14 +1348,16 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
[NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO},
- [NFSD_Reply_Cache_Stats] = {"reply_cache_stats", &reply_cache_stats_operations, S_IRUGO},
+ [NFSD_Reply_Cache_Stats] = {"reply_cache_stats",
+ &nfsd_reply_cache_stats_fops, S_IRUGO},
[NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
[NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO},
- [NFSD_Filecache] = {"filecache", &filecache_ops, S_IRUGO},
+ [NFSD_Filecache] = {"filecache", &nfsd_file_cache_stats_fops, S_IRUGO},
#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE)
- [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO},
+ [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes",
+ &supported_enctypes_fops, S_IRUGO},
#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
#ifdef CONFIG_NFSD_V4
[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
@@ -1481,16 +1457,19 @@ static __net_init int nfsd_init_net(struct net *net)
goto out_idmap_error;
nn->nfsd_versions = NULL;
nn->nfsd4_minorversions = NULL;
- retval = nfsd_reply_cache_init(nn);
+ retval = nfsd4_init_leases_net(nn);
if (retval)
goto out_drc_error;
- nfsd4_init_leases_net(nn);
-
+ retval = nfsd_reply_cache_init(nn);
+ if (retval)
+ goto out_cache_error;
get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key));
seqlock_init(&nn->writeverf_lock);
return 0;
+out_cache_error:
+ nfsd4_leases_net_shutdown(nn);
out_drc_error:
nfsd_idmap_shutdown(net);
out_idmap_error:
@@ -1507,6 +1486,7 @@ static __net_exit void nfsd_exit_net(struct net *net)
nfsd_idmap_shutdown(net);
nfsd_export_shutdown(net);
nfsd_netns_free_versions(net_generic(net, nfsd_net_id));
+ nfsd4_leases_net_shutdown(nn);
}
static struct pernet_operations nfsd_net_ops = {
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 57a468ed85c3..93b42ef9ed91 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -64,8 +64,7 @@ struct readdir_cd {
extern struct svc_program nfsd_program;
-extern const struct svc_version nfsd_version2, nfsd_version3,
- nfsd_version4;
+extern const struct svc_version nfsd_version2, nfsd_version3, nfsd_version4;
extern struct mutex nfsd_mutex;
extern spinlock_t nfsd_drc_lock;
extern unsigned long nfsd_drc_max_mem;
@@ -164,6 +163,7 @@ char * nfs4_recoverydir(void);
bool nfsd4_spo_must_allow(struct svc_rqst *rqstp);
int nfsd4_create_laundry_wq(void);
void nfsd4_destroy_laundry_wq(void);
+bool nfsd_wait_for_delegreturn(struct svc_rqst *rqstp, struct inode *inode);
#else
static inline int nfsd4_init_slabs(void) { return 0; }
static inline void nfsd4_free_slabs(void) { }
@@ -179,6 +179,11 @@ static inline bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
}
static inline int nfsd4_create_laundry_wq(void) { return 0; };
static inline void nfsd4_destroy_laundry_wq(void) {};
+static inline bool nfsd_wait_for_delegreturn(struct svc_rqst *rqstp,
+ struct inode *inode)
+{
+ return false;
+}
#endif
/*
@@ -343,6 +348,7 @@ void nfsd_lockd_shutdown(void);
#define NFSD_COURTESY_CLIENT_TIMEOUT (24 * 60 * 60) /* seconds */
#define NFSD_CLIENT_MAX_TRIM_PER_RUN 128
#define NFS4_CLIENTS_PER_GB 1024
+#define NFSD_DELEGRETURN_TIMEOUT (HZ / 34) /* 30ms */
/*
* The following attributes are currently not supported by the NFSv4 server:
@@ -498,7 +504,8 @@ extern void unregister_cld_notifier(void);
extern void nfsd4_ssc_init_umount_work(struct nfsd_net *nn);
#endif
-extern void nfsd4_init_leases_net(struct nfsd_net *nn);
+extern int nfsd4_init_leases_net(struct nfsd_net *nn);
+extern void nfsd4_leases_net_shutdown(struct nfsd_net *nn);
#else /* CONFIG_NFSD_V4 */
static inline int nfsd4_is_junction(struct dentry *dentry)
@@ -506,7 +513,8 @@ static inline int nfsd4_is_junction(struct dentry *dentry)
return 0;
}
-static inline void nfsd4_init_leases_net(struct nfsd_net *nn) {};
+static inline int nfsd4_init_leases_net(struct nfsd_net *nn) { return 0; };
+static inline void nfsd4_leases_net_shutdown(struct nfsd_net *nn) {};
#define register_cld_notifier() 0
#define unregister_cld_notifier() do { } while(0)
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index a5b71526cee0..8c52b6c9d31a 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -392,14 +392,8 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
skip_pseudoflavor_check:
/* Finally, check access permissions. */
error = nfsd_permission(rqstp, exp, dentry, access);
-
- if (error) {
- dprintk("fh_verify: %pd2 permission failure, "
- "acc=%x, error=%d\n",
- dentry,
- access, ntohl(error));
- }
out:
+ trace_nfsd_fh_verify_err(rqstp, fhp, type, access, error);
if (error == nfserr_stale)
nfsd_stats_fh_stale_inc(exp);
return error;
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index c3ae6414fc5c..513e028b0bbe 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -220,7 +220,7 @@ __be32 fh_update(struct svc_fh *);
void fh_put(struct svc_fh *);
static __inline__ struct svc_fh *
-fh_copy(struct svc_fh *dst, struct svc_fh *src)
+fh_copy(struct svc_fh *dst, const struct svc_fh *src)
{
WARN_ON(src->fh_dentry);
@@ -229,7 +229,7 @@ fh_copy(struct svc_fh *dst, struct svc_fh *src)
}
static inline void
-fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src)
+fh_copy_shallow(struct knfsd_fh *dst, const struct knfsd_fh *src)
{
dst->fh_size = src->fh_size;
memcpy(&dst->fh_raw, &src->fh_raw, src->fh_size);
@@ -243,7 +243,8 @@ fh_init(struct svc_fh *fhp, int maxsize)
return fhp;
}
-static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
+static inline bool fh_match(const struct knfsd_fh *fh1,
+ const struct knfsd_fh *fh2)
{
if (fh1->fh_size != fh2->fh_size)
return false;
@@ -252,7 +253,8 @@ static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
return true;
}
-static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
+static inline bool fh_fsid_match(const struct knfsd_fh *fh1,
+ const struct knfsd_fh *fh2)
{
if (fh1->fh_fsid_type != fh2->fh_fsid_type)
return false;
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 7381972f1677..a5570cf75f3f 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -185,6 +185,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
argp->count, argp->offset);
argp->count = min_t(u32, argp->count, NFSSVC_MAXBLKSIZE_V2);
+ argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen);
v = 0;
len = argp->count;
@@ -210,7 +211,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
if (resp->status == nfs_ok)
resp->status = fh_getattr(&resp->fh, &resp->stat);
else if (resp->status == nfserr_jukebox)
- return rpc_drop_reply;
+ __set_bit(RQ_DROPME, &rqstp->rq_flags);
return rpc_success;
}
@@ -245,7 +246,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
if (resp->status == nfs_ok)
resp->status = fh_getattr(&resp->fh, &resp->stat);
else if (resp->status == nfserr_jukebox)
- return rpc_drop_reply;
+ __set_bit(RQ_DROPME, &rqstp->rq_flags);
return rpc_success;
}
@@ -390,9 +391,8 @@ nfsd_proc_create(struct svc_rqst *rqstp)
resp->status = nfs_ok;
if (!inode) {
/* File doesn't exist. Create it and set attrs */
- resp->status = nfsd_create_locked(rqstp, dirfhp, argp->name,
- argp->len, &attrs, type, rdev,
- newfhp);
+ resp->status = nfsd_create_locked(rqstp, dirfhp, &attrs, type,
+ rdev, newfhp);
} else if (type == S_IFREG) {
dprintk("nfsd: existing %s, valid=%x, size=%ld\n",
argp->name, attr->ia_valid, (long) attr->ia_size);
@@ -567,24 +567,15 @@ static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp,
struct xdr_buf *buf = &resp->dirlist;
struct xdr_stream *xdr = &resp->xdr;
- count = clamp(count, (u32)(XDR_UNIT * 2), svc_max_payload(rqstp));
-
memset(buf, 0, sizeof(*buf));
/* Reserve room for the NULL ptr & eof flag (-2 words) */
- buf->buflen = count - XDR_UNIT * 2;
+ buf->buflen = clamp(count, (u32)(XDR_UNIT * 2), (u32)PAGE_SIZE);
+ buf->buflen -= XDR_UNIT * 2;
buf->pages = rqstp->rq_next_page;
rqstp->rq_next_page++;
- /* This is xdr_init_encode(), but it assumes that
- * the head kvec has already been consumed. */
- xdr_set_scratch_buffer(xdr, NULL, 0);
- xdr->buf = buf;
- xdr->page_ptr = buf->pages;
- xdr->iov = NULL;
- xdr->p = page_address(*buf->pages);
- xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE);
- xdr->rqst = NULL;
+ xdr_init_encode_pages(xdr, buf, buf->pages, NULL);
}
/*
@@ -646,6 +637,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_decode = nfssvc_decode_voidarg,
.pc_encode = nfssvc_encode_voidres,
.pc_argsize = sizeof(struct nfsd_voidargs),
+ .pc_argzero = sizeof(struct nfsd_voidargs),
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = 0,
@@ -657,6 +649,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_encode = nfssvc_encode_attrstatres,
.pc_release = nfssvc_release_attrstat,
.pc_argsize = sizeof(struct nfsd_fhandle),
+ .pc_argzero = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+AT,
@@ -668,6 +661,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_encode = nfssvc_encode_attrstatres,
.pc_release = nfssvc_release_attrstat,
.pc_argsize = sizeof(struct nfsd_sattrargs),
+ .pc_argzero = sizeof(struct nfsd_sattrargs),
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+AT,
@@ -678,6 +672,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_decode = nfssvc_decode_voidarg,
.pc_encode = nfssvc_encode_voidres,
.pc_argsize = sizeof(struct nfsd_voidargs),
+ .pc_argzero = sizeof(struct nfsd_voidargs),
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = 0,
@@ -689,6 +684,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_encode = nfssvc_encode_diropres,
.pc_release = nfssvc_release_diropres,
.pc_argsize = sizeof(struct nfsd_diropargs),
+ .pc_argzero = sizeof(struct nfsd_diropargs),
.pc_ressize = sizeof(struct nfsd_diropres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+FH+AT,
@@ -699,6 +695,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_decode = nfssvc_decode_fhandleargs,
.pc_encode = nfssvc_encode_readlinkres,
.pc_argsize = sizeof(struct nfsd_fhandle),
+ .pc_argzero = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd_readlinkres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+1+NFS_MAXPATHLEN/4,
@@ -710,6 +707,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_encode = nfssvc_encode_readres,
.pc_release = nfssvc_release_readres,
.pc_argsize = sizeof(struct nfsd_readargs),
+ .pc_argzero = sizeof(struct nfsd_readargs),
.pc_ressize = sizeof(struct nfsd_readres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4,
@@ -720,6 +718,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_decode = nfssvc_decode_voidarg,
.pc_encode = nfssvc_encode_voidres,
.pc_argsize = sizeof(struct nfsd_voidargs),
+ .pc_argzero = sizeof(struct nfsd_voidargs),
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = 0,
@@ -731,6 +730,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_encode = nfssvc_encode_attrstatres,
.pc_release = nfssvc_release_attrstat,
.pc_argsize = sizeof(struct nfsd_writeargs),
+ .pc_argzero = sizeof(struct nfsd_writeargs),
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+AT,
@@ -742,6 +742,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_encode = nfssvc_encode_diropres,
.pc_release = nfssvc_release_diropres,
.pc_argsize = sizeof(struct nfsd_createargs),
+ .pc_argzero = sizeof(struct nfsd_createargs),
.pc_ressize = sizeof(struct nfsd_diropres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+FH+AT,
@@ -752,6 +753,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_decode = nfssvc_decode_diropargs,
.pc_encode = nfssvc_encode_statres,
.pc_argsize = sizeof(struct nfsd_diropargs),
+ .pc_argzero = sizeof(struct nfsd_diropargs),
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
@@ -762,6 +764,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_decode = nfssvc_decode_renameargs,
.pc_encode = nfssvc_encode_statres,
.pc_argsize = sizeof(struct nfsd_renameargs),
+ .pc_argzero = sizeof(struct nfsd_renameargs),
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
@@ -772,6 +775,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_decode = nfssvc_decode_linkargs,
.pc_encode = nfssvc_encode_statres,
.pc_argsize = sizeof(struct nfsd_linkargs),
+ .pc_argzero = sizeof(struct nfsd_linkargs),
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
@@ -782,6 +786,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_decode = nfssvc_decode_symlinkargs,
.pc_encode = nfssvc_encode_statres,
.pc_argsize = sizeof(struct nfsd_symlinkargs),
+ .pc_argzero = sizeof(struct nfsd_symlinkargs),
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
@@ -793,6 +798,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_encode = nfssvc_encode_diropres,
.pc_release = nfssvc_release_diropres,
.pc_argsize = sizeof(struct nfsd_createargs),
+ .pc_argzero = sizeof(struct nfsd_createargs),
.pc_ressize = sizeof(struct nfsd_diropres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+FH+AT,
@@ -803,6 +809,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_decode = nfssvc_decode_diropargs,
.pc_encode = nfssvc_encode_statres,
.pc_argsize = sizeof(struct nfsd_diropargs),
+ .pc_argzero = sizeof(struct nfsd_diropargs),
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
@@ -813,6 +820,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_decode = nfssvc_decode_readdirargs,
.pc_encode = nfssvc_encode_readdirres,
.pc_argsize = sizeof(struct nfsd_readdirargs),
+ .pc_argzero = sizeof(struct nfsd_readdirargs),
.pc_ressize = sizeof(struct nfsd_readdirres),
.pc_cachetype = RC_NOCACHE,
.pc_name = "READDIR",
@@ -822,6 +830,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_decode = nfssvc_decode_fhandleargs,
.pc_encode = nfssvc_encode_statfsres,
.pc_argsize = sizeof(struct nfsd_fhandle),
+ .pc_argzero = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd_statfsres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+5,
@@ -839,65 +848,3 @@ const struct svc_version nfsd_version2 = {
.vs_dispatch = nfsd_dispatch,
.vs_xdrsize = NFS2_SVC_XDRSIZE,
};
-
-/*
- * Map errnos to NFS errnos.
- */
-__be32
-nfserrno (int errno)
-{
- static struct {
- __be32 nfserr;
- int syserr;
- } nfs_errtbl[] = {
- { nfs_ok, 0 },
- { nfserr_perm, -EPERM },
- { nfserr_noent, -ENOENT },
- { nfserr_io, -EIO },
- { nfserr_nxio, -ENXIO },
- { nfserr_fbig, -E2BIG },
- { nfserr_stale, -EBADF },
- { nfserr_acces, -EACCES },
- { nfserr_exist, -EEXIST },
- { nfserr_xdev, -EXDEV },
- { nfserr_mlink, -EMLINK },
- { nfserr_nodev, -ENODEV },
- { nfserr_notdir, -ENOTDIR },
- { nfserr_isdir, -EISDIR },
- { nfserr_inval, -EINVAL },
- { nfserr_fbig, -EFBIG },
- { nfserr_nospc, -ENOSPC },
- { nfserr_rofs, -EROFS },
- { nfserr_mlink, -EMLINK },
- { nfserr_nametoolong, -ENAMETOOLONG },
- { nfserr_notempty, -ENOTEMPTY },
-#ifdef EDQUOT
- { nfserr_dquot, -EDQUOT },
-#endif
- { nfserr_stale, -ESTALE },
- { nfserr_jukebox, -ETIMEDOUT },
- { nfserr_jukebox, -ERESTARTSYS },
- { nfserr_jukebox, -EAGAIN },
- { nfserr_jukebox, -EWOULDBLOCK },
- { nfserr_jukebox, -ENOMEM },
- { nfserr_io, -ETXTBSY },
- { nfserr_notsupp, -EOPNOTSUPP },
- { nfserr_toosmall, -ETOOSMALL },
- { nfserr_serverfault, -ESERVERFAULT },
- { nfserr_serverfault, -ENFILE },
- { nfserr_io, -EREMOTEIO },
- { nfserr_stale, -EOPENSTALE },
- { nfserr_io, -EUCLEAN },
- { nfserr_perm, -ENOKEY },
- { nfserr_no_grace, -ENOGRACE},
- };
- int i;
-
- for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) {
- if (nfs_errtbl[i].syserr == errno)
- return nfs_errtbl[i].nfserr;
- }
- WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno);
- return nfserr_io;
-}
-
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 4bb5baa17040..56fba1cba3af 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -91,8 +91,12 @@ unsigned long nfsd_drc_mem_used;
#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
static struct svc_stat nfsd_acl_svcstats;
static const struct svc_version *nfsd_acl_version[] = {
+# if defined(CONFIG_NFSD_V2_ACL)
[2] = &nfsd_acl_version2,
+# endif
+# if defined(CONFIG_NFSD_V3_ACL)
[3] = &nfsd_acl_version3,
+# endif
};
#define NFSD_ACL_MINVERS 2
@@ -116,7 +120,9 @@ static struct svc_stat nfsd_acl_svcstats = {
#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
static const struct svc_version *nfsd_version[] = {
+#if defined(CONFIG_NFSD_V2)
[2] = &nfsd_version2,
+#endif
[3] = &nfsd_version3,
#if defined(CONFIG_NFSD_V4)
[4] = &nfsd_version4,
@@ -799,7 +805,7 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
if (nrservs == 0 && nn->nfsd_serv == NULL)
goto out;
- strlcpy(nn->nfsd_name, utsname()->nodename,
+ strscpy(nn->nfsd_name, utsname()->nodename,
sizeof(nn->nfsd_name));
error = nfsd_create_serv(net);
@@ -1054,7 +1060,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
svcxdr_init_encode(rqstp);
*statp = proc->pc_func(rqstp);
- if (*statp == rpc_drop_reply || test_bit(RQ_DROPME, &rqstp->rq_flags))
+ if (test_bit(RQ_DROPME, &rqstp->rq_flags))
goto out_update_drop;
if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream))
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index aba8520b4b8b..caf6355b18fa 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -338,10 +338,8 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
return false;
if (args->len > NFSSVC_MAXBLKSIZE_V2)
return false;
- if (!xdr_stream_subsegment(xdr, &args->payload, args->len))
- return false;
- return true;
+ return xdr_stream_subsegment(xdr, &args->payload, args->len);
}
bool
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index ae596dbf8667..e94634d30591 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -57,11 +57,11 @@ typedef struct {
} stateid_t;
typedef struct {
- stateid_t stid;
+ stateid_t cs_stid;
#define NFS4_COPY_STID 1
#define NFS4_COPYNOTIFY_STID 2
- unsigned char sc_type;
- refcount_t sc_count;
+ unsigned char cs_type;
+ refcount_t cs_count;
} copy_stateid_t;
struct nfsd4_callback {
@@ -175,7 +175,7 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
/* Maximum number of slots per session. 160 is useful for long haul TCP */
#define NFSD_MAX_SLOTS_PER_SESSION 160
/* Maximum number of operations per session compound */
-#define NFSD_MAX_OPS_PER_COMPOUND 16
+#define NFSD_MAX_OPS_PER_COMPOUND 50
/* Maximum session per slot cache size */
#define NFSD_SLOT_CACHE_SIZE 2048
/* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */
@@ -368,6 +368,7 @@ struct nfs4_client {
#define NFSD4_CLIENT_UPCALL_LOCK (5) /* upcall serialization */
#define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \
1 << NFSD4_CLIENT_CB_KILL)
+#define NFSD4_CLIENT_CB_RECALL_ANY (6)
unsigned long cl_flags;
const struct cred *cl_cb_cred;
struct rpc_clnt *cl_cb_client;
@@ -411,6 +412,10 @@ struct nfs4_client {
unsigned int cl_state;
atomic_t cl_delegs_in_recall;
+
+ struct nfsd4_cb_recall_any *cl_ra;
+ time64_t cl_ra_time;
+ struct list_head cl_ra_cblist;
};
/* struct nfs4_client_reset
@@ -536,16 +541,13 @@ struct nfs4_clnt_odstate {
* inode can have multiple filehandles associated with it, so there is
* (potentially) a many to one relationship between this struct and struct
* inode.
- *
- * These are hashed by filehandle in the file_hashtbl, which is protected by
- * the global state_lock spinlock.
*/
struct nfs4_file {
refcount_t fi_ref;
struct inode * fi_inode;
bool fi_aliased;
spinlock_t fi_lock;
- struct hlist_node fi_hash; /* hash on fi_fhandle */
+ struct rhlist_head fi_rlist;
struct list_head fi_stateids;
union {
struct list_head fi_delegations;
@@ -639,6 +641,7 @@ enum nfsd4_cb_op {
NFSPROC4_CLNT_CB_OFFLOAD,
NFSPROC4_CLNT_CB_SEQUENCE,
NFSPROC4_CLNT_CB_NOTIFY_LOCK,
+ NFSPROC4_CLNT_CB_RECALL_ANY,
};
/* Returns true iff a is later than b: */
@@ -692,12 +695,11 @@ extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
-extern void nfsd4_run_cb(struct nfsd4_callback *cb);
+extern bool nfsd4_run_cb(struct nfsd4_callback *cb);
extern int nfsd4_create_callback_queue(void);
extern void nfsd4_destroy_callback_queue(void);
extern void nfsd4_shutdown_callback(struct nfs4_client *);
extern void nfsd4_shutdown_copy(struct nfs4_client *clp);
-extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp);
extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name,
struct xdr_netobj princhash, struct nfsd_net *nn);
extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn);
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index a8c5a02a84f0..777e24e5da33 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -32,7 +32,7 @@ struct svc_stat nfsd_svcstats = {
.program = &nfsd_program,
};
-static int nfsd_proc_show(struct seq_file *seq, void *v)
+static int nfsd_show(struct seq_file *seq, void *v)
{
int i;
@@ -72,17 +72,7 @@ static int nfsd_proc_show(struct seq_file *seq, void *v)
return 0;
}
-static int nfsd_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, nfsd_proc_show, NULL);
-}
-
-static const struct proc_ops nfsd_proc_ops = {
- .proc_open = nfsd_proc_open,
- .proc_read = seq_read,
- .proc_lseek = seq_lseek,
- .proc_release = single_release,
-};
+DEFINE_PROC_SHOW_ATTRIBUTE(nfsd);
int nfsd_percpu_counters_init(struct percpu_counter counters[], int num)
{
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 9ebd67d461f9..c852ae8eaf37 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -9,9 +9,12 @@
#define _NFSD_TRACE_H
#include <linux/tracepoint.h>
+#include <linux/sunrpc/xprt.h>
+#include <trace/misc/nfs.h>
#include "export.h"
#include "nfsfh.h"
+#include "xdr4.h"
#define NFSD_TRACE_PROC_RES_FIELDS \
__field(unsigned int, netns_ino) \
@@ -84,19 +87,26 @@ DEFINE_NFSD_XDR_ERR_EVENT(cant_encode);
{ NFSD_MAY_64BIT_COOKIE, "64BIT_COOKIE" })
TRACE_EVENT(nfsd_compound,
- TP_PROTO(const struct svc_rqst *rqst,
- u32 args_opcnt),
- TP_ARGS(rqst, args_opcnt),
+ TP_PROTO(
+ const struct svc_rqst *rqst,
+ const char *tag,
+ u32 taglen,
+ u32 opcnt
+ ),
+ TP_ARGS(rqst, tag, taglen, opcnt),
TP_STRUCT__entry(
__field(u32, xid)
- __field(u32, args_opcnt)
+ __field(u32, opcnt)
+ __string_len(tag, tag, taglen)
),
TP_fast_assign(
__entry->xid = be32_to_cpu(rqst->rq_xid);
- __entry->args_opcnt = args_opcnt;
+ __entry->opcnt = opcnt;
+ __assign_str_len(tag, tag, taglen);
),
- TP_printk("xid=0x%08x opcnt=%u",
- __entry->xid, __entry->args_opcnt)
+ TP_printk("xid=0x%08x opcnt=%u tag=%s",
+ __entry->xid, __entry->opcnt, __get_str(tag)
+ )
)
TRACE_EVENT(nfsd_compound_status,
@@ -195,7 +205,7 @@ TRACE_EVENT(nfsd_fh_verify,
__sockaddr(client, rqstp->rq_xprt->xpt_remotelen)
__field(u32, xid)
__field(u32, fh_hash)
- __field(void *, inode)
+ __field(const void *, inode)
__field(unsigned long, type)
__field(unsigned long, access)
),
@@ -211,13 +221,58 @@ TRACE_EVENT(nfsd_fh_verify,
__entry->type = type;
__entry->access = access;
),
- TP_printk("xid=0x%08x fh_hash=0x%08x inode=%p type=%s access=%s",
- __entry->xid, __entry->fh_hash, __entry->inode,
+ TP_printk("xid=0x%08x fh_hash=0x%08x type=%s access=%s",
+ __entry->xid, __entry->fh_hash,
show_fs_file_type(__entry->type),
show_nfsd_may_flags(__entry->access)
)
);
+TRACE_EVENT_CONDITION(nfsd_fh_verify_err,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct svc_fh *fhp,
+ umode_t type,
+ int access,
+ __be32 error
+ ),
+ TP_ARGS(rqstp, fhp, type, access, error),
+ TP_CONDITION(error),
+ TP_STRUCT__entry(
+ __field(unsigned int, netns_ino)
+ __sockaddr(server, rqstp->rq_xprt->xpt_remotelen)
+ __sockaddr(client, rqstp->rq_xprt->xpt_remotelen)
+ __field(u32, xid)
+ __field(u32, fh_hash)
+ __field(const void *, inode)
+ __field(unsigned long, type)
+ __field(unsigned long, access)
+ __field(int, error)
+ ),
+ TP_fast_assign(
+ __entry->netns_ino = SVC_NET(rqstp)->ns.inum;
+ __assign_sockaddr(server, &rqstp->rq_xprt->xpt_local,
+ rqstp->rq_xprt->xpt_locallen);
+ __assign_sockaddr(client, &rqstp->rq_xprt->xpt_remote,
+ rqstp->rq_xprt->xpt_remotelen);
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ if (fhp->fh_dentry)
+ __entry->inode = d_inode(fhp->fh_dentry);
+ else
+ __entry->inode = NULL;
+ __entry->type = type;
+ __entry->access = access;
+ __entry->error = be32_to_cpu(error);
+ ),
+ TP_printk("xid=0x%08x fh_hash=0x%08x type=%s access=%s error=%d",
+ __entry->xid, __entry->fh_hash,
+ show_fs_file_type(__entry->type),
+ show_nfsd_may_flags(__entry->access),
+ __entry->error
+ )
+);
+
DECLARE_EVENT_CLASS(nfsd_fh_err_class,
TP_PROTO(struct svc_rqst *rqstp,
struct svc_fh *fhp,
@@ -489,6 +544,29 @@ DEFINE_NFSD_COPY_ERR_EVENT(clone_file_range_err);
#include "filecache.h"
#include "vfs.h"
+TRACE_EVENT(nfsd_delegret_wakeup,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct inode *inode,
+ long timeo
+ ),
+ TP_ARGS(rqstp, inode, timeo),
+ TP_STRUCT__entry(
+ __field(u32, xid)
+ __field(const void *, inode)
+ __field(long, timeo)
+ ),
+ TP_fast_assign(
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->inode = inode;
+ __entry->timeo = timeo;
+ ),
+ TP_printk("xid=0x%08x inode=%p%s",
+ __entry->xid, __entry->inode,
+ __entry->timeo == 0 ? " (timed out)" : ""
+ )
+);
+
DECLARE_EVENT_CLASS(nfsd_stateid_class,
TP_PROTO(stateid_t *stp),
TP_ARGS(stp),
@@ -529,6 +607,7 @@ DEFINE_STATEID_EVENT(layout_recall_release);
DEFINE_STATEID_EVENT(open);
DEFINE_STATEID_EVENT(deleg_read);
+DEFINE_STATEID_EVENT(deleg_return);
DEFINE_STATEID_EVENT(deleg_recall);
DECLARE_EVENT_CLASS(nfsd_stateseqid_class,
@@ -561,6 +640,61 @@ DEFINE_EVENT(nfsd_stateseqid_class, nfsd_##name, \
DEFINE_STATESEQID_EVENT(preprocess);
DEFINE_STATESEQID_EVENT(open_confirm);
+TRACE_DEFINE_ENUM(NFS4_OPEN_STID);
+TRACE_DEFINE_ENUM(NFS4_LOCK_STID);
+TRACE_DEFINE_ENUM(NFS4_DELEG_STID);
+TRACE_DEFINE_ENUM(NFS4_CLOSED_STID);
+TRACE_DEFINE_ENUM(NFS4_REVOKED_DELEG_STID);
+TRACE_DEFINE_ENUM(NFS4_CLOSED_DELEG_STID);
+TRACE_DEFINE_ENUM(NFS4_LAYOUT_STID);
+
+#define show_stid_type(x) \
+ __print_flags(x, "|", \
+ { NFS4_OPEN_STID, "OPEN" }, \
+ { NFS4_LOCK_STID, "LOCK" }, \
+ { NFS4_DELEG_STID, "DELEG" }, \
+ { NFS4_CLOSED_STID, "CLOSED" }, \
+ { NFS4_REVOKED_DELEG_STID, "REVOKED" }, \
+ { NFS4_CLOSED_DELEG_STID, "CLOSED_DELEG" }, \
+ { NFS4_LAYOUT_STID, "LAYOUT" })
+
+DECLARE_EVENT_CLASS(nfsd_stid_class,
+ TP_PROTO(
+ const struct nfs4_stid *stid
+ ),
+ TP_ARGS(stid),
+ TP_STRUCT__entry(
+ __field(unsigned long, sc_type)
+ __field(int, sc_count)
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(u32, si_id)
+ __field(u32, si_generation)
+ ),
+ TP_fast_assign(
+ const stateid_t *stp = &stid->sc_stateid;
+
+ __entry->sc_type = stid->sc_type;
+ __entry->sc_count = refcount_read(&stid->sc_count);
+ __entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
+ __entry->cl_id = stp->si_opaque.so_clid.cl_id;
+ __entry->si_id = stp->si_opaque.so_id;
+ __entry->si_generation = stp->si_generation;
+ ),
+ TP_printk("client %08x:%08x stateid %08x:%08x ref=%d type=%s",
+ __entry->cl_boot, __entry->cl_id,
+ __entry->si_id, __entry->si_generation,
+ __entry->sc_count, show_stid_type(__entry->sc_type)
+ )
+);
+
+#define DEFINE_STID_EVENT(name) \
+DEFINE_EVENT(nfsd_stid_class, nfsd_stid_##name, \
+ TP_PROTO(const struct nfs4_stid *stid), \
+ TP_ARGS(stid))
+
+DEFINE_STID_EVENT(revoke);
+
DECLARE_EVENT_CLASS(nfsd_clientid_class,
TP_PROTO(const clientid_t *clid),
TP_ARGS(clid),
@@ -742,7 +876,8 @@ DEFINE_CLID_EVENT(confirmed_r);
__print_flags(val, "|", \
{ 1 << NFSD_FILE_HASHED, "HASHED" }, \
{ 1 << NFSD_FILE_PENDING, "PENDING" }, \
- { 1 << NFSD_FILE_REFERENCED, "REFERENCED"})
+ { 1 << NFSD_FILE_REFERENCED, "REFERENCED" }, \
+ { 1 << NFSD_FILE_GC, "GC" })
DECLARE_EVENT_CLASS(nfsd_file_class,
TP_PROTO(struct nfsd_file *nf),
@@ -774,10 +909,11 @@ DEFINE_EVENT(nfsd_file_class, name, \
TP_PROTO(struct nfsd_file *nf), \
TP_ARGS(nf))
-DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_free);
DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash);
DEFINE_NFSD_FILE_EVENT(nfsd_file_put);
-DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_dispose);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_closing);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_queue);
TRACE_EVENT(nfsd_file_alloc,
TP_PROTO(
@@ -968,35 +1104,6 @@ TRACE_EVENT(nfsd_file_open,
__entry->nf_file)
)
-DECLARE_EVENT_CLASS(nfsd_file_search_class,
- TP_PROTO(
- const struct inode *inode,
- unsigned int count
- ),
- TP_ARGS(inode, count),
- TP_STRUCT__entry(
- __field(const struct inode *, inode)
- __field(unsigned int, count)
- ),
- TP_fast_assign(
- __entry->inode = inode;
- __entry->count = count;
- ),
- TP_printk("inode=%p count=%u",
- __entry->inode, __entry->count)
-);
-
-#define DEFINE_NFSD_FILE_SEARCH_EVENT(name) \
-DEFINE_EVENT(nfsd_file_search_class, name, \
- TP_PROTO( \
- const struct inode *inode, \
- unsigned int count \
- ), \
- TP_ARGS(inode, count))
-
-DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode_sync);
-DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode);
-
TRACE_EVENT(nfsd_file_is_cached,
TP_PROTO(
const struct inode *inode,
@@ -1074,7 +1181,6 @@ DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del_disposed);
DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_in_use);
DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_writeback);
DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_referenced);
-DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_hashed);
DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_disposed);
DECLARE_EVENT_CLASS(nfsd_file_lruwalk_class,
@@ -1106,6 +1212,53 @@ DEFINE_EVENT(nfsd_file_lruwalk_class, name, \
DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_gc_removed);
DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_shrinker_removed);
+TRACE_EVENT(nfsd_file_close,
+ TP_PROTO(
+ const struct inode *inode
+ ),
+ TP_ARGS(inode),
+ TP_STRUCT__entry(
+ __field(const void *, inode)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ ),
+ TP_printk("inode=%p",
+ __entry->inode
+ )
+);
+
+TRACE_EVENT(nfsd_file_fsync,
+ TP_PROTO(
+ const struct nfsd_file *nf,
+ int ret
+ ),
+ TP_ARGS(nf, ret),
+ TP_STRUCT__entry(
+ __field(void *, nf_inode)
+ __field(int, nf_ref)
+ __field(int, ret)
+ __field(unsigned long, nf_flags)
+ __field(unsigned char, nf_may)
+ __field(struct file *, nf_file)
+ ),
+ TP_fast_assign(
+ __entry->nf_inode = nf->nf_inode;
+ __entry->nf_ref = refcount_read(&nf->nf_ref);
+ __entry->ret = ret;
+ __entry->nf_flags = nf->nf_flags;
+ __entry->nf_may = nf->nf_may;
+ __entry->nf_file = nf->nf_file;
+ ),
+ TP_printk("inode=%p ref=%d flags=%s may=%s nf_file=%p ret=%d",
+ __entry->nf_inode,
+ __entry->nf_ref,
+ show_nf_flags(__entry->nf_flags),
+ show_nfsd_may_flags(__entry->nf_may),
+ __entry->nf_file, __entry->ret
+ )
+);
+
#include "cache.h"
TRACE_DEFINE_ENUM(RC_DROPIT);
@@ -1399,6 +1552,92 @@ TRACE_EVENT(nfsd_cb_offload,
__entry->fh_hash, __entry->count, __entry->status)
);
+TRACE_EVENT(nfsd_cb_recall_any,
+ TP_PROTO(
+ const struct nfsd4_cb_recall_any *ra
+ ),
+ TP_ARGS(ra),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(u32, keep)
+ __field(unsigned long, bmval0)
+ __sockaddr(addr, ra->ra_cb.cb_clp->cl_cb_conn.cb_addrlen)
+ ),
+ TP_fast_assign(
+ __entry->cl_boot = ra->ra_cb.cb_clp->cl_clientid.cl_boot;
+ __entry->cl_id = ra->ra_cb.cb_clp->cl_clientid.cl_id;
+ __entry->keep = ra->ra_keep;
+ __entry->bmval0 = ra->ra_bmval[0];
+ __assign_sockaddr(addr, &ra->ra_cb.cb_clp->cl_addr,
+ ra->ra_cb.cb_clp->cl_cb_conn.cb_addrlen);
+ ),
+ TP_printk("addr=%pISpc client %08x:%08x keep=%u bmval0=%s",
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
+ __entry->keep, show_rca_mask(__entry->bmval0)
+ )
+);
+
+DECLARE_EVENT_CLASS(nfsd_cb_done_class,
+ TP_PROTO(
+ const stateid_t *stp,
+ const struct rpc_task *task
+ ),
+ TP_ARGS(stp, task),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(u32, si_id)
+ __field(u32, si_generation)
+ __field(int, status)
+ ),
+ TP_fast_assign(
+ __entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
+ __entry->cl_id = stp->si_opaque.so_clid.cl_id;
+ __entry->si_id = stp->si_opaque.so_id;
+ __entry->si_generation = stp->si_generation;
+ __entry->status = task->tk_status;
+ ),
+ TP_printk("client %08x:%08x stateid %08x:%08x status=%d",
+ __entry->cl_boot, __entry->cl_id, __entry->si_id,
+ __entry->si_generation, __entry->status
+ )
+);
+
+#define DEFINE_NFSD_CB_DONE_EVENT(name) \
+DEFINE_EVENT(nfsd_cb_done_class, name, \
+ TP_PROTO( \
+ const stateid_t *stp, \
+ const struct rpc_task *task \
+ ), \
+ TP_ARGS(stp, task))
+
+DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_recall_done);
+DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_notify_lock_done);
+DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_layout_done);
+DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_offload_done);
+
+TRACE_EVENT(nfsd_cb_recall_any_done,
+ TP_PROTO(
+ const struct nfsd4_callback *cb,
+ const struct rpc_task *task
+ ),
+ TP_ARGS(cb, task),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(int, status)
+ ),
+ TP_fast_assign(
+ __entry->status = task->tk_status;
+ __entry->cl_boot = cb->cb_clp->cl_clientid.cl_boot;
+ __entry->cl_id = cb->cb_clp->cl_clientid.cl_id;
+ ),
+ TP_printk("client %08x:%08x status=%d",
+ __entry->cl_boot, __entry->cl_id, __entry->status
+ )
+);
+
#endif /* _NFSD_TRACE_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index fc17b0ac8729..4c3a0d84043c 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -49,6 +49,69 @@
#define NFSDDBG_FACILITY NFSDDBG_FILEOP
+/**
+ * nfserrno - Map Linux errnos to NFS errnos
+ * @errno: POSIX(-ish) error code to be mapped
+ *
+ * Returns the appropriate (net-endian) nfserr_* (or nfs_ok if errno is 0). If
+ * it's an error we don't expect, log it once and return nfserr_io.
+ */
+__be32
+nfserrno (int errno)
+{
+ static struct {
+ __be32 nfserr;
+ int syserr;
+ } nfs_errtbl[] = {
+ { nfs_ok, 0 },
+ { nfserr_perm, -EPERM },
+ { nfserr_noent, -ENOENT },
+ { nfserr_io, -EIO },
+ { nfserr_nxio, -ENXIO },
+ { nfserr_fbig, -E2BIG },
+ { nfserr_stale, -EBADF },
+ { nfserr_acces, -EACCES },
+ { nfserr_exist, -EEXIST },
+ { nfserr_xdev, -EXDEV },
+ { nfserr_mlink, -EMLINK },
+ { nfserr_nodev, -ENODEV },
+ { nfserr_notdir, -ENOTDIR },
+ { nfserr_isdir, -EISDIR },
+ { nfserr_inval, -EINVAL },
+ { nfserr_fbig, -EFBIG },
+ { nfserr_nospc, -ENOSPC },
+ { nfserr_rofs, -EROFS },
+ { nfserr_mlink, -EMLINK },
+ { nfserr_nametoolong, -ENAMETOOLONG },
+ { nfserr_notempty, -ENOTEMPTY },
+ { nfserr_dquot, -EDQUOT },
+ { nfserr_stale, -ESTALE },
+ { nfserr_jukebox, -ETIMEDOUT },
+ { nfserr_jukebox, -ERESTARTSYS },
+ { nfserr_jukebox, -EAGAIN },
+ { nfserr_jukebox, -EWOULDBLOCK },
+ { nfserr_jukebox, -ENOMEM },
+ { nfserr_io, -ETXTBSY },
+ { nfserr_notsupp, -EOPNOTSUPP },
+ { nfserr_toosmall, -ETOOSMALL },
+ { nfserr_serverfault, -ESERVERFAULT },
+ { nfserr_serverfault, -ENFILE },
+ { nfserr_io, -EREMOTEIO },
+ { nfserr_stale, -EOPENSTALE },
+ { nfserr_io, -EUCLEAN },
+ { nfserr_perm, -ENOKEY },
+ { nfserr_no_grace, -ENOGRACE},
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) {
+ if (nfs_errtbl[i].syserr == errno)
+ return nfs_errtbl[i].nfserr;
+ }
+ WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno);
+ return nfserr_io;
+}
+
/*
* Called from nfsd_lookup and encode_dirent. Check if we have crossed
* a mount point.
@@ -343,8 +406,61 @@ nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp,
return nfserrno(get_write_access(inode));
}
-/*
- * Set various file attributes. After this call fhp needs an fh_put.
+static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap)
+{
+ int host_err;
+
+ if (iap->ia_valid & ATTR_SIZE) {
+ /*
+ * RFC5661, Section 18.30.4:
+ * Changing the size of a file with SETATTR indirectly
+ * changes the time_modify and change attributes.
+ *
+ * (and similar for the older RFCs)
+ */
+ struct iattr size_attr = {
+ .ia_valid = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME,
+ .ia_size = iap->ia_size,
+ };
+
+ if (iap->ia_size < 0)
+ return -EFBIG;
+
+ host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL);
+ if (host_err)
+ return host_err;
+ iap->ia_valid &= ~ATTR_SIZE;
+
+ /*
+ * Avoid the additional setattr call below if the only other
+ * attribute that the client sends is the mtime, as we update
+ * it as part of the size change above.
+ */
+ if ((iap->ia_valid & ~ATTR_MTIME) == 0)
+ return 0;
+ }
+
+ if (!iap->ia_valid)
+ return 0;
+
+ iap->ia_valid |= ATTR_CTIME;
+ return notify_change(&init_user_ns, dentry, iap, NULL);
+}
+
+/**
+ * nfsd_setattr - Set various file attributes.
+ * @rqstp: controlling RPC transaction
+ * @fhp: filehandle of target
+ * @attr: attributes to set
+ * @check_guard: set to 1 if guardtime is a valid timestamp
+ * @guardtime: do not act if ctime.tv_sec does not match this timestamp
+ *
+ * This call may adjust the contents of @attr (in particular, this
+ * call may change the bits in the na_iattr.ia_valid field).
+ *
+ * Returns nfs_ok on success, otherwise an NFS status code is
+ * returned. Caller must release @fhp by calling fh_put in either
+ * case.
*/
__be32
nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
@@ -357,9 +473,10 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
int accmode = NFSD_MAY_SATTR;
umode_t ftype = 0;
__be32 err;
- int host_err = 0;
+ int host_err;
bool get_write_count;
bool size_change = (iap->ia_valid & ATTR_SIZE);
+ int retries;
if (iap->ia_valid & ATTR_SIZE) {
accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
@@ -414,54 +531,24 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
inode_lock(inode);
- if (size_change) {
- /*
- * RFC5661, Section 18.30.4:
- * Changing the size of a file with SETATTR indirectly
- * changes the time_modify and change attributes.
- *
- * (and similar for the older RFCs)
- */
- struct iattr size_attr = {
- .ia_valid = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME,
- .ia_size = iap->ia_size,
- };
-
- host_err = -EFBIG;
- if (iap->ia_size < 0)
- goto out_unlock;
-
- host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL);
- if (host_err)
- goto out_unlock;
- iap->ia_valid &= ~ATTR_SIZE;
-
- /*
- * Avoid the additional setattr call below if the only other
- * attribute that the client sends is the mtime, as we update
- * it as part of the size change above.
- */
- if ((iap->ia_valid & ~ATTR_MTIME) == 0)
- goto out_unlock;
- }
-
- if (iap->ia_valid) {
- iap->ia_valid |= ATTR_CTIME;
- host_err = notify_change(&init_user_ns, dentry, iap, NULL);
+ for (retries = 1;;) {
+ host_err = __nfsd_setattr(dentry, iap);
+ if (host_err != -EAGAIN || !retries--)
+ break;
+ if (!nfsd_wait_for_delegreturn(rqstp, inode))
+ break;
}
-
-out_unlock:
if (attr->na_seclabel && attr->na_seclabel->len)
attr->na_labelerr = security_inode_setsecctx(dentry,
attr->na_seclabel->data, attr->na_seclabel->len);
if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && attr->na_pacl)
attr->na_aclerr = set_posix_acl(&init_user_ns,
- inode, ACL_TYPE_ACCESS,
+ dentry, ACL_TYPE_ACCESS,
attr->na_pacl);
if (IS_ENABLED(CONFIG_FS_POSIX_ACL) &&
!attr->na_aclerr && attr->na_dpacl && S_ISDIR(inode->i_mode))
attr->na_aclerr = set_posix_acl(&init_user_ns,
- inode, ACL_TYPE_DEFAULT,
+ dentry, ACL_TYPE_DEFAULT,
attr->na_dpacl);
inode_unlock(inode);
if (size_change)
@@ -572,8 +659,8 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
if (ret == -EOPNOTSUPP || ret == -EXDEV)
- ret = generic_copy_file_range(src, src_pos, dst, dst_pos,
- count, 0);
+ ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count,
+ COPY_FILE_SPLICE);
return ret;
}
@@ -847,10 +934,11 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
struct svc_rqst *rqstp = sd->u.data;
struct page *page = buf->page; // may be a compound one
unsigned offset = buf->offset;
+ struct page *last_page;
- page += offset / PAGE_SIZE;
- for (int i = sd->len; i > 0; i -= PAGE_SIZE)
- svc_rqst_replace_page(rqstp, page++);
+ last_page = page + (offset + sd->len - 1) / PAGE_SIZE;
+ for (page += offset / PAGE_SIZE; page <= last_page; page++)
+ svc_rqst_replace_page(rqstp, page);
if (rqstp->rq_res.page_len == 0) // first call
rqstp->rq_res.page_base = offset % PAGE_SIZE;
rqstp->rq_res.page_len += sd->len;
@@ -918,7 +1006,7 @@ __be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
ssize_t host_err;
trace_nfsd_read_vector(rqstp, fhp, offset, *count);
- iov_iter_kvec(&iter, READ, vec, vlen, *count);
+ iov_iter_kvec(&iter, ITER_DEST, vec, vlen, *count);
host_err = vfs_iter_read(file, &iter, &ppos, 0);
return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
}
@@ -1008,7 +1096,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
if (stable && !use_wgather)
flags |= RWF_SYNC;
- iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt);
+ iov_iter_kvec(&iter, ITER_SOURCE, vec, vlen, *cnt);
since = READ_ONCE(file->f_wb_err);
if (verf)
nfsd_copy_write_verifier(verf, nn);
@@ -1060,7 +1148,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
__be32 err;
trace_nfsd_read_start(rqstp, fhp, offset, *count);
- err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf);
+ err = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_READ, &nf);
if (err)
return err;
@@ -1092,7 +1180,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
trace_nfsd_write_start(rqstp, fhp, offset, *cnt);
- err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE, &nf);
+ err = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_WRITE, &nf);
if (err)
goto out;
@@ -1108,6 +1196,7 @@ out:
* nfsd_commit - Commit pending writes to stable storage
* @rqstp: RPC request being processed
* @fhp: NFS filehandle
+ * @nf: target file
* @offset: raw offset from beginning of file
* @count: raw count of bytes to sync
* @verf: filled in with the server's current write verifier
@@ -1124,19 +1213,13 @@ out:
* An nfsstat value in network byte order.
*/
__be32
-nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset,
- u32 count, __be32 *verf)
+nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
+ u64 offset, u32 count, __be32 *verf)
{
+ __be32 err = nfs_ok;
u64 maxbytes;
loff_t start, end;
struct nfsd_net *nn;
- struct nfsd_file *nf;
- __be32 err;
-
- err = nfsd_file_acquire(rqstp, fhp,
- NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &nf);
- if (err)
- goto out;
/*
* Convert the client-provided (offset, count) range to a
@@ -1177,8 +1260,6 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset,
} else
nfsd_copy_write_verifier(verf, nn);
- nfsd_file_put(nf);
-out:
return err;
}
@@ -1255,7 +1336,7 @@ nfsd_check_ignore_resizing(struct iattr *iap)
/* The parent directory should already be locked: */
__be32
nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
- char *fname, int flen, struct nfsd_attrs *attrs,
+ struct nfsd_attrs *attrs,
int type, dev_t rdev, struct svc_fh *resfhp)
{
struct dentry *dentry, *dchild;
@@ -1280,7 +1361,6 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
iap->ia_mode &= ~current_umask();
err = 0;
- host_err = 0;
switch (type) {
case S_IFREG:
host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true);
@@ -1382,8 +1462,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (err)
goto out_unlock;
fh_fill_pre_attrs(fhp);
- err = nfsd_create_locked(rqstp, fhp, fname, flen, attrs, type,
- rdev, resfhp);
+ err = nfsd_create_locked(rqstp, fhp, attrs, type, rdev, resfhp);
fh_fill_post_attrs(fhp);
out_unlock:
inode_unlock(dentry->d_inode);
@@ -1673,7 +1752,15 @@ retry:
.new_dir = tdir,
.new_dentry = ndentry,
};
- host_err = vfs_rename(&rd);
+ int retries;
+
+ for (retries = 1;;) {
+ host_err = vfs_rename(&rd);
+ if (host_err != -EAGAIN || !retries--)
+ break;
+ if (!nfsd_wait_for_delegreturn(rqstp, d_inode(odentry)))
+ break;
+ }
if (!host_err) {
host_err = commit_metadata(tfhp);
if (!host_err)
@@ -1757,9 +1844,18 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
fh_fill_pre_attrs(fhp);
if (type != S_IFDIR) {
+ int retries;
+
if (rdentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK)
nfsd_close_cached_files(rdentry);
- host_err = vfs_unlink(&init_user_ns, dirp, rdentry, NULL);
+
+ for (retries = 1;;) {
+ host_err = vfs_unlink(&init_user_ns, dirp, rdentry, NULL);
+ if (host_err != -EAGAIN || !retries--)
+ break;
+ if (!nfsd_wait_for_delegreturn(rqstp, rinode))
+ break;
+ }
} else {
host_err = vfs_rmdir(&init_user_ns, dirp, rdentry);
}
@@ -1814,7 +1910,7 @@ struct readdir_data {
int full;
};
-static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name,
+static bool nfsd_buffered_filldir(struct dir_context *ctx, const char *name,
int namlen, loff_t offset, u64 ino,
unsigned int d_type)
{
@@ -1826,7 +1922,7 @@ static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name,
reclen = ALIGN(sizeof(struct buffered_dirent) + namlen, sizeof(u64));
if (buf->used + reclen > PAGE_SIZE) {
buf->full = 1;
- return -EINVAL;
+ return false;
}
de->namlen = namlen;
@@ -1836,7 +1932,7 @@ static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name,
memcpy(de->name, name, namlen);
buf->used += reclen;
- return 0;
+ return true;
}
static __be32 nfsd_buffered_readdir(struct file *file, struct svc_fh *fhp,
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index c95cd414b4bb..dbdfef7ae85b 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -60,6 +60,7 @@ static inline void nfsd_attrs_free(struct nfsd_attrs *attrs)
posix_acl_release(attrs->na_dpacl);
}
+__be32 nfserrno (int errno);
int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
struct svc_export **expp);
__be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *,
@@ -79,8 +80,8 @@ __be32 nfsd4_clone_file_range(struct svc_rqst *rqstp,
u64 count, bool sync);
#endif /* CONFIG_NFSD_V4 */
__be32 nfsd_create_locked(struct svc_rqst *, struct svc_fh *,
- char *name, int len, struct nfsd_attrs *attrs,
- int type, dev_t rdev, struct svc_fh *res);
+ struct nfsd_attrs *attrs, int type, dev_t rdev,
+ struct svc_fh *res);
__be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct nfsd_attrs *attrs,
int type, dev_t rdev, struct svc_fh *res);
@@ -88,7 +89,8 @@ __be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
__be32 nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct svc_fh *resfhp, struct nfsd_attrs *iap);
__be32 nfsd_commit(struct svc_rqst *rqst, struct svc_fh *fhp,
- u64 offset, u32 count, __be32 *verf);
+ struct nfsd_file *nf, u64 offset, u32 count,
+ __be32 *verf);
#ifdef CONFIG_NFSD_V4
__be32 nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
char *name, void **bufp, int *lenp);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 96267258e629..4fd2cf6d1d2d 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -717,13 +717,13 @@ struct nfsd4_compoundargs {
struct svcxdr_tmpbuf *to_free;
struct svc_rqst *rqstp;
- u32 taglen;
char * tag;
+ u32 taglen;
u32 minorversion;
+ u32 client_opcnt;
u32 opcnt;
struct nfsd4_op *ops;
struct nfsd4_op iops[8];
- int cachetype;
};
struct nfsd4_compoundres {
@@ -732,8 +732,8 @@ struct nfsd4_compoundres {
struct svc_rqst * rqstp;
__be32 *statusp;
- u32 taglen;
char * tag;
+ u32 taglen;
u32 opcnt;
struct nfsd4_compound_state cstate;
@@ -888,12 +888,18 @@ struct nfsd4_operation {
u32 op_flags;
char *op_name;
/* Try to get response size before operation */
- u32 (*op_rsize_bop)(struct svc_rqst *, struct nfsd4_op *);
+ u32 (*op_rsize_bop)(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op);
void (*op_get_currentstateid)(struct nfsd4_compound_state *,
union nfsd4_op_u *);
void (*op_set_currentstateid)(struct nfsd4_compound_state *,
union nfsd4_op_u *);
};
+struct nfsd4_cb_recall_any {
+ struct nfsd4_callback ra_cb;
+ u32 ra_keep;
+ u32 ra_bmval[1];
+};
#endif
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index 547cf07cf4e0..0d39af1b00a0 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -48,3 +48,9 @@
#define NFS4_dec_cb_offload_sz (cb_compound_dec_hdr_sz + \
cb_sequence_dec_sz + \
op_dec_sz)
+#define NFS4_enc_cb_recall_any_sz (cb_compound_enc_hdr_sz + \
+ cb_sequence_enc_sz + \
+ 1 + 1 + 1)
+#define NFS4_dec_cb_recall_any_sz (cb_compound_dec_hdr_sz + \
+ cb_sequence_dec_sz + \
+ op_dec_sz)
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 9f4d9432d38a..b9d15c3df3cc 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -1668,8 +1668,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key)
maxkey = nilfs_btree_node_get_key(node, nchildren - 1);
nextmaxkey = (nchildren > 1) ?
nilfs_btree_node_get_key(node, nchildren - 2) : 0;
- if (bh != NULL)
- brelse(bh);
+ brelse(bh);
return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW);
}
@@ -1717,8 +1716,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *btree,
ptrs[i] = le64_to_cpu(dptrs[i]);
}
- if (bh != NULL)
- brelse(bh);
+ brelse(bh);
return nitems;
}
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 3b55e239705f..9930fa901039 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -111,6 +111,13 @@ static void nilfs_dat_commit_free(struct inode *dat,
kunmap_atomic(kaddr);
nilfs_dat_commit_entry(dat, req);
+
+ if (unlikely(req->pr_desc_bh == NULL || req->pr_bitmap_bh == NULL)) {
+ nilfs_error(dat->i_sb,
+ "state inconsistency probably due to duplicate use of vblocknr = %llu",
+ (unsigned long long)req->pr_entry_nr);
+ return;
+ }
nilfs_palloc_commit_free_entry(dat, req);
}
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 67f63cfeade5..232dd7b6cca1 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -328,6 +328,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
struct inode *inode;
struct nilfs_inode_info *ii;
struct nilfs_root *root;
+ struct buffer_head *bh;
int err = -ENOMEM;
ino_t ino;
@@ -343,11 +344,25 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
ii->i_state = BIT(NILFS_I_NEW);
ii->i_root = root;
- err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh);
+ err = nilfs_ifile_create_inode(root->ifile, &ino, &bh);
if (unlikely(err))
goto failed_ifile_create_inode;
/* reference count of i_bh inherits from nilfs_mdt_read_block() */
+ if (unlikely(ino < NILFS_USER_INO)) {
+ nilfs_warn(sb,
+ "inode bitmap is inconsistent for reserved inodes");
+ do {
+ brelse(bh);
+ err = nilfs_ifile_create_inode(root->ifile, &ino, &bh);
+ if (unlikely(err))
+ goto failed_ifile_create_inode;
+ } while (ino < NILFS_USER_INO);
+
+ nilfs_info(sb, "repaired inode bitmap for reserved inodes");
+ }
+ ii->i_bh = bh;
+
atomic64_inc(&root->inodes_count);
inode_init_owner(&init_user_ns, inode, dir, mode);
inode->i_ino = ino;
@@ -440,6 +455,8 @@ int nilfs_read_inode_common(struct inode *inode,
inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
+ if (nilfs_is_metadata_file_inode(inode) && !S_ISREG(inode->i_mode))
+ return -EIO; /* this inode is for metadata and corrupted */
if (inode->i_nlink == 0)
return -ESTALE; /* this inode is deleted */
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 3267e96c256c..39b7eea2642a 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -480,41 +480,36 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
sector_t start_blk,
sector_t *blkoff)
{
- unsigned int i;
+ unsigned int i, nr_folios;
pgoff_t index;
- unsigned int nblocks_in_page;
unsigned long length = 0;
- sector_t b;
- struct pagevec pvec;
- struct page *page;
+ struct folio_batch fbatch;
+ struct folio *folio;
if (inode->i_mapping->nrpages == 0)
return 0;
index = start_blk >> (PAGE_SHIFT - inode->i_blkbits);
- nblocks_in_page = 1U << (PAGE_SHIFT - inode->i_blkbits);
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
repeat:
- pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE,
- pvec.pages);
- if (pvec.nr == 0)
+ nr_folios = filemap_get_folios_contig(inode->i_mapping, &index, ULONG_MAX,
+ &fbatch);
+ if (nr_folios == 0)
return length;
- if (length > 0 && pvec.pages[0]->index > index)
- goto out;
-
- b = pvec.pages[0]->index << (PAGE_SHIFT - inode->i_blkbits);
i = 0;
do {
- page = pvec.pages[i];
+ folio = fbatch.folios[i];
- lock_page(page);
- if (page_has_buffers(page)) {
+ folio_lock(folio);
+ if (folio_buffers(folio)) {
struct buffer_head *bh, *head;
+ sector_t b;
- bh = head = page_buffers(page);
+ b = folio->index << (PAGE_SHIFT - inode->i_blkbits);
+ bh = head = folio_buffers(folio);
do {
if (b < start_blk)
continue;
@@ -529,21 +524,17 @@ repeat:
} else {
if (length > 0)
goto out_locked;
-
- b += nblocks_in_page;
}
- unlock_page(page);
+ folio_unlock(folio);
- } while (++i < pagevec_count(&pvec));
+ } while (++i < nr_folios);
- index = page->index + 1;
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
cond_resched();
goto repeat;
out_locked:
- unlock_page(page);
-out:
- pagevec_release(&pvec);
+ folio_unlock(folio);
+ folio_batch_release(&fbatch);
return length;
}
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 0afe0832c754..3335ef352915 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -317,7 +317,7 @@ void nilfs_relax_pressure_in_lock(struct super_block *sb)
struct the_nilfs *nilfs = sb->s_fs_info;
struct nilfs_sc_info *sci = nilfs->ns_writer;
- if (!sci || !sci->sc_flush_request)
+ if (sb_rdonly(sb) || unlikely(!sci) || !sci->sc_flush_request)
return;
set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags);
@@ -875,9 +875,11 @@ static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
nilfs_mdt_mark_dirty(nilfs->ns_cpfile);
nilfs_cpfile_put_checkpoint(
nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
- } else
- WARN_ON(err == -EINVAL || err == -ENOENT);
-
+ } else if (err == -EINVAL || err == -ENOENT) {
+ nilfs_error(sci->sc_super,
+ "checkpoint creation failed due to metadata corruption.");
+ err = -EIO;
+ }
return err;
}
@@ -891,7 +893,11 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0,
&raw_cp, &bh_cp);
if (unlikely(err)) {
- WARN_ON(err == -EINVAL || err == -ENOENT);
+ if (err == -EINVAL || err == -ENOENT) {
+ nilfs_error(sci->sc_super,
+ "checkpoint finalization failed due to metadata corruption.");
+ err = -EIO;
+ }
goto failed_ibh;
}
raw_cp->cp_snapshot_list.ssl_next = 0;
@@ -2235,16 +2241,14 @@ int nilfs_construct_segment(struct super_block *sb)
struct the_nilfs *nilfs = sb->s_fs_info;
struct nilfs_sc_info *sci = nilfs->ns_writer;
struct nilfs_transaction_info *ti;
- int err;
- if (!sci)
+ if (sb_rdonly(sb) || unlikely(!sci))
return -EROFS;
/* A call inside transactions causes a deadlock. */
BUG_ON((ti = current->journal_info) && ti->ti_magic == NILFS_TI_MAGIC);
- err = nilfs_segctor_sync(sci);
- return err;
+ return nilfs_segctor_sync(sci);
}
/**
@@ -2276,7 +2280,7 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
struct nilfs_transaction_info ti;
int err = 0;
- if (!sci)
+ if (sb_rdonly(sb) || unlikely(!sci))
return -EROFS;
nilfs_transaction_lock(sb, &ti, 0);
@@ -2772,11 +2776,12 @@ int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root)
if (nilfs->ns_writer) {
/*
- * This happens if the filesystem was remounted
- * read/write after nilfs_error degenerated it into a
- * read-only mount.
+ * This happens if the filesystem is made read-only by
+ * __nilfs_error or nilfs_remount and then remounted
+ * read/write. In these cases, reuse the existing
+ * writer.
*/
- nilfs_detach_log_writer(sb);
+ return 0;
}
nilfs->ns_writer = nilfs_segctor_new(sb, root);
@@ -2786,10 +2791,9 @@ int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root)
inode_attach_wb(nilfs->ns_bdev->bd_inode, NULL);
err = nilfs_segctor_start_thread(nilfs->ns_writer);
- if (err) {
- kfree(nilfs->ns_writer);
- nilfs->ns_writer = NULL;
- }
+ if (unlikely(err))
+ nilfs_detach_log_writer(sb);
+
return err;
}
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 77ff8e95421f..dc359b56fdfa 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -495,14 +495,22 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
{
struct buffer_head *bh;
+ void *kaddr;
+ struct nilfs_segment_usage *su;
int ret;
+ down_write(&NILFS_MDT(sufile)->mi_sem);
ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
if (!ret) {
mark_buffer_dirty(bh);
nilfs_mdt_mark_dirty(sufile);
+ kaddr = kmap_atomic(bh->b_page);
+ su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
+ nilfs_segment_usage_set_dirty(su);
+ kunmap_atomic(kaddr);
brelse(bh);
}
+ up_write(&NILFS_MDT(sufile)->mi_sem);
return ret;
}
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index ba108f915391..6edb6e0dd61f 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1133,8 +1133,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
goto out;
if (*flags & SB_RDONLY) {
- /* Shutting down log writer */
- nilfs_detach_log_writer(sb);
sb->s_flags |= SB_RDONLY;
/*
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 3b4a079c9617..2064e6473d30 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -13,6 +13,7 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/random.h>
+#include <linux/log2.h>
#include <linux/crc32.h>
#include "nilfs.h"
#include "segment.h"
@@ -193,6 +194,34 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs,
}
/**
+ * nilfs_get_blocksize - get block size from raw superblock data
+ * @sb: super block instance
+ * @sbp: superblock raw data buffer
+ * @blocksize: place to store block size
+ *
+ * nilfs_get_blocksize() calculates the block size from the block size
+ * exponent information written in @sbp and stores it in @blocksize,
+ * or aborts with an error message if it's too large.
+ *
+ * Return Value: On success, 0 is returned. If the block size is too
+ * large, -EINVAL is returned.
+ */
+static int nilfs_get_blocksize(struct super_block *sb,
+ struct nilfs_super_block *sbp, int *blocksize)
+{
+ unsigned int shift_bits = le32_to_cpu(sbp->s_log_block_size);
+
+ if (unlikely(shift_bits >
+ ilog2(NILFS_MAX_BLOCK_SIZE) - BLOCK_SIZE_BITS)) {
+ nilfs_err(sb, "too large filesystem blocksize: 2 ^ %u KiB",
+ shift_bits);
+ return -EINVAL;
+ }
+ *blocksize = BLOCK_SIZE << shift_bits;
+ return 0;
+}
+
+/**
* load_nilfs - load and recover the nilfs
* @nilfs: the_nilfs structure to be released
* @sb: super block instance used to recover past segment
@@ -245,11 +274,15 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime);
/* verify consistency between two super blocks */
- blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size);
+ err = nilfs_get_blocksize(sb, sbp[0], &blocksize);
+ if (err)
+ goto scan_error;
+
if (blocksize != nilfs->ns_blocksize) {
nilfs_warn(sb,
"blocksize differs between two super blocks (%d != %d)",
blocksize, nilfs->ns_blocksize);
+ err = -EINVAL;
goto scan_error;
}
@@ -443,11 +476,33 @@ static int nilfs_valid_sb(struct nilfs_super_block *sbp)
return crc == le32_to_cpu(sbp->s_sum);
}
-static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset)
+/**
+ * nilfs_sb2_bad_offset - check the location of the second superblock
+ * @sbp: superblock raw data buffer
+ * @offset: byte offset of second superblock calculated from device size
+ *
+ * nilfs_sb2_bad_offset() checks if the position on the second
+ * superblock is valid or not based on the filesystem parameters
+ * stored in @sbp. If @offset points to a location within the segment
+ * area, or if the parameters themselves are not normal, it is
+ * determined to be invalid.
+ *
+ * Return Value: true if invalid, false if valid.
+ */
+static bool nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset)
{
- return offset < ((le64_to_cpu(sbp->s_nsegments) *
- le32_to_cpu(sbp->s_blocks_per_segment)) <<
- (le32_to_cpu(sbp->s_log_block_size) + 10));
+ unsigned int shift_bits = le32_to_cpu(sbp->s_log_block_size);
+ u32 blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
+ u64 nsegments = le64_to_cpu(sbp->s_nsegments);
+ u64 index;
+
+ if (blocks_per_segment < NILFS_SEG_MIN_BLOCKS ||
+ shift_bits > ilog2(NILFS_MAX_BLOCK_SIZE) - BLOCK_SIZE_BITS)
+ return true;
+
+ index = offset >> (shift_bits + BLOCK_SIZE_BITS);
+ do_div(index, blocks_per_segment);
+ return index < nsegments;
}
static void nilfs_release_super_block(struct the_nilfs *nilfs)
@@ -586,9 +641,11 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
if (err)
goto failed_sbh;
- blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
- if (blocksize < NILFS_MIN_BLOCK_SIZE ||
- blocksize > NILFS_MAX_BLOCK_SIZE) {
+ err = nilfs_get_blocksize(sb, sbp, &blocksize);
+ if (err)
+ goto failed_sbh;
+
+ if (blocksize < NILFS_MIN_BLOCK_SIZE) {
nilfs_err(sb,
"couldn't mount because of unsupported filesystem blocksize %d",
blocksize);
@@ -690,9 +747,7 @@ int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
{
unsigned long ncleansegs;
- down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
- up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
*nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
return 0;
}
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index cd7d09a569ff..a2a15bc4df28 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -18,7 +18,7 @@
#include "fanotify.h"
-static bool fanotify_path_equal(struct path *p1, struct path *p2)
+static bool fanotify_path_equal(const struct path *p1, const struct path *p2)
{
return p1->mnt == p2->mnt && p1->dentry == p2->dentry;
}
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 1d9f11255c64..57f51a9a3015 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -452,13 +452,7 @@ static inline bool fanotify_is_error_event(u32 mask)
return mask & FAN_FS_ERROR;
}
-static inline bool fanotify_event_has_path(struct fanotify_event *event)
-{
- return event->type == FANOTIFY_EVENT_TYPE_PATH ||
- event->type == FANOTIFY_EVENT_TYPE_PATH_PERM;
-}
-
-static inline struct path *fanotify_event_path(struct fanotify_event *event)
+static inline const struct path *fanotify_event_path(struct fanotify_event *event)
{
if (event->type == FANOTIFY_EVENT_TYPE_PATH)
return &FANOTIFY_PE(event)->path;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index f0e49a406ffa..4546da4a54f9 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -249,7 +249,7 @@ out:
return event;
}
-static int create_fd(struct fsnotify_group *group, struct path *path,
+static int create_fd(struct fsnotify_group *group, const struct path *path,
struct file **file)
{
int client_fd;
@@ -619,7 +619,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
char __user *buf, size_t count)
{
struct fanotify_event_metadata metadata;
- struct path *path = fanotify_event_path(event);
+ const struct path *path = fanotify_event_path(event);
struct fanotify_info *info = fanotify_event_info(event);
unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
@@ -1553,7 +1553,7 @@ static int fanotify_test_fid(struct dentry *dentry)
}
static int fanotify_events_supported(struct fsnotify_group *group,
- struct path *path, __u64 mask,
+ const struct path *path, __u64 mask,
unsigned int flags)
{
unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 87d8a50ee803..fde74eb333cc 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -76,10 +76,6 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb)
*/
extern void __fsnotify_update_child_dentry_flags(struct inode *inode);
-/* allocate and destroy and event holder to attach events to notification/access queues */
-extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void);
-extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder);
-
extern struct kmem_cache *fsnotify_mark_connector_cachep;
#endif /* __FS_NOTIFY_FSNOTIFY_H_ */
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 800c1d0eb0d0..3506f6074288 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -28,7 +28,7 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
struct inode *inode = d_inode(dentry);
const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
- return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
+ return dynamic_dname(buffer, buflen, "%s:[%lu]",
ns_ops->name, inode->i_ino);
}
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 52615e6090e1..a3865bc4a0c6 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -594,17 +594,37 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name,
for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) {
u8 *mrec_end = (u8 *)ctx->mrec +
le32_to_cpu(ctx->mrec->bytes_allocated);
- u8 *name_end = (u8 *)a + le16_to_cpu(a->name_offset) +
- a->name_length * sizeof(ntfschar);
- if ((u8*)a < (u8*)ctx->mrec || (u8*)a > mrec_end ||
- name_end > mrec_end)
+ u8 *name_end;
+
+ /* check whether ATTR_RECORD wrap */
+ if ((u8 *)a < (u8 *)ctx->mrec)
+ break;
+
+ /* check whether Attribute Record Header is within bounds */
+ if ((u8 *)a > mrec_end ||
+ (u8 *)a + sizeof(ATTR_RECORD) > mrec_end)
+ break;
+
+ /* check whether ATTR_RECORD's name is within bounds */
+ name_end = (u8 *)a + le16_to_cpu(a->name_offset) +
+ a->name_length * sizeof(ntfschar);
+ if (name_end > mrec_end)
break;
+
ctx->attr = a;
if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) ||
a->type == AT_END))
return -ENOENT;
if (unlikely(!a->length))
break;
+
+ /* check whether ATTR_RECORD's length wrap */
+ if ((u8 *)a + le32_to_cpu(a->length) < (u8 *)a)
+ break;
+ /* check whether ATTR_RECORD's length is within bounds */
+ if ((u8 *)a + le32_to_cpu(a->length) > mrec_end)
+ break;
+
if (a->type != type)
continue;
/*
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 58b660dbbee9..c481b14e4fd9 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -527,12 +527,12 @@ err_out:
goto out;
}
-static inline int ntfs_submit_bh_for_read(struct buffer_head *bh)
+static inline void ntfs_submit_bh_for_read(struct buffer_head *bh)
{
lock_buffer(bh);
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
- return submit_bh(REQ_OP_READ, bh);
+ submit_bh(REQ_OP_READ, bh);
}
/**
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index db0f1995aedd..08c659332e26 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1829,6 +1829,13 @@ int ntfs_read_inode_mount(struct inode *vi)
goto err_out;
}
+ /* Sanity check offset to the first attribute */
+ if (le16_to_cpu(m->attrs_offset) >= le32_to_cpu(m->bytes_allocated)) {
+ ntfs_error(sb, "Incorrect mft offset to the first attribute %u in superblock.",
+ le16_to_cpu(m->attrs_offset));
+ goto err_out;
+ }
+
/* Need this to sanity check attribute list references to $MFT. */
vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 5ae8de09b271..001f4e053c85 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2092,7 +2092,8 @@ get_ctx_vol_failed:
// TODO: Initialize security.
/* Get the extended system files' directory inode. */
vol->extend_ino = ntfs_iget(sb, FILE_Extend);
- if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino)) {
+ if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino) ||
+ !S_ISDIR(vol->extend_ino->i_mode)) {
if (!IS_ERR(vol->extend_ino))
iput(vol->extend_ino);
ntfs_error(sb, "Failed to load $Extend.");
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 511e58f2b0f8..e5399ebc3a2b 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -707,7 +707,7 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
setattr_copy(mnt_userns, inode, attr);
if (mode != inode->i_mode) {
- err = ntfs_acl_chmod(mnt_userns, inode);
+ err = ntfs_acl_chmod(mnt_userns, dentry);
if (err)
goto out;
@@ -1160,7 +1160,7 @@ const struct inode_operations ntfs_file_inode_operations = {
.setattr = ntfs3_setattr,
.listxattr = ntfs_listxattr,
.permission = ntfs_permission,
- .get_acl = ntfs_get_acl,
+ .get_inode_acl = ntfs_get_acl,
.set_acl = ntfs_set_acl,
.fiemap = ntfs_fiemap,
};
diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c
index e61545b9772e..c6eb371a3695 100644
--- a/fs/ntfs3/fslog.c
+++ b/fs/ntfs3/fslog.c
@@ -3798,7 +3798,7 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
}
log_init_pg_hdr(log, page_size, page_size, 1, 1);
- log_create(log, l_size, 0, get_random_int(), false, false);
+ log_create(log, l_size, 0, get_random_u32(), false, false);
log->ra = ra;
@@ -3872,7 +3872,7 @@ check_restart_area:
/* Do some checks based on whether we have a valid log page. */
if (!rst_info.valid_page) {
- open_log_count = get_random_int();
+ open_log_count = get_random_u32();
goto init_log_instance;
}
open_log_count = le32_to_cpu(ra2->open_log_count);
@@ -4023,7 +4023,7 @@ find_oldest:
memcpy(ra->clients, Add2Ptr(ra2, t16),
le16_to_cpu(ra2->ra_len) - t16);
- log->current_openlog_count = get_random_int();
+ log->current_openlog_count = get_random_u32();
ra->open_log_count = cpu_to_le32(log->current_openlog_count);
log->ra_size = offsetof(struct RESTART_AREA, clients) +
sizeof(struct CLIENT_REC);
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 31bc94f87940..20b953871574 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -636,17 +636,9 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
bh->b_size = block_size;
off = vbo & (PAGE_SIZE - 1);
set_bh_page(bh, page, off);
-
- lock_buffer(bh);
- bh->b_end_io = end_buffer_read_sync;
- get_bh(bh);
- submit_bh(REQ_OP_READ, bh);
-
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
- err = -EIO;
+ err = bh_read(bh, 0);
+ if (err < 0)
goto out;
- }
zero_user_segment(page, off + voff, off + block_size);
}
}
@@ -2069,8 +2061,6 @@ const struct inode_operations ntfs_link_inode_operations = {
.setattr = ntfs3_setattr,
.listxattr = ntfs_listxattr,
.permission = ntfs_permission,
- .get_acl = ntfs_get_acl,
- .set_acl = ntfs_set_acl,
};
const struct address_space_operations ntfs_aops = {
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index 0e72d2067804..c8db35e2ae17 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -371,7 +371,7 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry,
* ntfs_create_inode -> ntfs_init_acl -> posix_acl_create ->
* ntfs_get_acl -> ntfs_get_acl_ex -> ni_lock
*/
- struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT);
+ struct posix_acl *p = get_inode_acl(dir, ACL_TYPE_DEFAULT);
if (IS_ERR(p)) {
err = PTR_ERR(p);
@@ -598,7 +598,7 @@ const struct inode_operations ntfs_dir_inode_operations = {
.mknod = ntfs_mknod,
.rename = ntfs_rename,
.permission = ntfs_permission,
- .get_acl = ntfs_get_acl,
+ .get_inode_acl = ntfs_get_acl,
.set_acl = ntfs_set_acl,
.setattr = ntfs3_setattr,
.getattr = ntfs_getattr,
@@ -611,7 +611,7 @@ const struct inode_operations ntfs_special_inode_operations = {
.setattr = ntfs3_setattr,
.getattr = ntfs_getattr,
.listxattr = ntfs_listxattr,
- .get_acl = ntfs_get_acl,
+ .get_inode_acl = ntfs_get_acl,
.set_acl = ntfs_set_acl,
};
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index 5cefcfa52118..0e051c5595a2 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -857,7 +857,7 @@ unsigned long ntfs_names_hash(const u16 *name, size_t len, const u16 *upcase,
/* globals from xattr.c */
#ifdef CONFIG_NTFS3_FS_POSIX_ACL
struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu);
-int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int ntfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct inode *dir);
@@ -866,7 +866,7 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode,
#define ntfs_set_acl NULL
#endif
-int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode);
+int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry);
int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode,
int mask);
ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index cfd59bb0f9de..616df209feea 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -652,71 +652,10 @@ out:
/*
* ntfs_set_acl - inode_operations::set_acl
*/
-int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int ntfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
- return ntfs_set_acl_ex(mnt_userns, inode, acl, type, false);
-}
-
-static int ntfs_xattr_get_acl(struct user_namespace *mnt_userns,
- struct inode *inode, int type, void *buffer,
- size_t size)
-{
- struct posix_acl *acl;
- int err;
-
- if (!(inode->i_sb->s_flags & SB_POSIXACL)) {
- ntfs_inode_warn(inode, "add mount option \"acl\" to use acl");
- return -EOPNOTSUPP;
- }
-
- acl = ntfs_get_acl(inode, type, false);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
-
- if (!acl)
- return -ENODATA;
-
- err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
- posix_acl_release(acl);
-
- return err;
-}
-
-static int ntfs_xattr_set_acl(struct user_namespace *mnt_userns,
- struct inode *inode, int type, const void *value,
- size_t size)
-{
- struct posix_acl *acl;
- int err;
-
- if (!(inode->i_sb->s_flags & SB_POSIXACL)) {
- ntfs_inode_warn(inode, "add mount option \"acl\" to use acl");
- return -EOPNOTSUPP;
- }
-
- if (!inode_owner_or_capable(mnt_userns, inode))
- return -EPERM;
-
- if (!value) {
- acl = NULL;
- } else {
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
-
- if (acl) {
- err = posix_acl_valid(&init_user_ns, acl);
- if (err)
- goto release_and_out;
- }
- }
-
- err = ntfs_set_acl(mnt_userns, inode, acl, type);
-
-release_and_out:
- posix_acl_release(acl);
- return err;
+ return ntfs_set_acl_ex(mnt_userns, d_inode(dentry), acl, type, false);
}
/*
@@ -758,8 +697,9 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode,
/*
* ntfs_acl_chmod - Helper for ntfs3_setattr().
*/
-int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode)
+int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry)
{
+ struct inode *inode = d_inode(dentry);
struct super_block *sb = inode->i_sb;
if (!(sb->s_flags & SB_POSIXACL))
@@ -768,7 +708,7 @@ int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode)
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
- return posix_acl_chmod(mnt_userns, inode, inode->i_mode);
+ return posix_acl_chmod(mnt_userns, dentry, inode->i_mode);
}
/*
@@ -884,19 +824,6 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de,
goto out;
}
-#ifdef CONFIG_NTFS3_FS_POSIX_ACL
- if (!strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) ||
- !strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT)) {
- /* TODO: init_user_ns? */
- err = ntfs_xattr_get_acl(
- &init_user_ns, inode,
- strlen(name) == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1
- ? ACL_TYPE_ACCESS
- : ACL_TYPE_DEFAULT,
- buffer, size);
- goto out;
- }
-#endif
/* Deal with NTFS extended attribute. */
err = ntfs_get_ea(inode, name, strlen(name), buffer, size, NULL);
@@ -1009,18 +936,6 @@ set_new_fa:
goto out;
}
-#ifdef CONFIG_NTFS3_FS_POSIX_ACL
- if (!strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) ||
- !strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT)) {
- err = ntfs_xattr_set_acl(
- mnt_userns, inode,
- strlen(name) == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1
- ? ACL_TYPE_ACCESS
- : ACL_TYPE_DEFAULT,
- value, size);
- goto out;
- }
-#endif
/* Deal with NTFS extended attribute. */
err = ntfs_set_ea(inode, name, strlen(name), value, size, flags, 0);
@@ -1110,7 +1025,7 @@ static bool ntfs_xattr_user_list(struct dentry *dentry)
}
// clang-format off
-static const struct xattr_handler ntfs_xattr_handler = {
+static const struct xattr_handler ntfs_other_xattr_handler = {
.prefix = "",
.get = ntfs_getxattr,
.set = ntfs_setxattr,
@@ -1118,7 +1033,11 @@ static const struct xattr_handler ntfs_xattr_handler = {
};
const struct xattr_handler *ntfs_xattr_handlers[] = {
- &ntfs_xattr_handler,
+#ifdef CONFIG_NTFS3_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
+ &ntfs_other_xattr_handler,
NULL,
};
// clang-format on
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 23a72a423955..9f19cf9a5a9f 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -260,12 +260,13 @@ static int ocfs2_set_acl(handle_t *handle,
return ret;
}
-int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
struct buffer_head *bh = NULL;
int status, had_lock;
struct ocfs2_lock_holder oh;
+ struct inode *inode = d_inode(dentry);
had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
if (had_lock < 0)
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 95a57c888ab6..a897c4e41b26 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -17,7 +17,7 @@ struct ocfs2_acl_entry {
};
struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type, bool rcu);
-int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *);
extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index af4157f61927..1d65f6ef00ca 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -636,7 +636,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
!buffer_new(bh) &&
ocfs2_should_read_blk(inode, page, block_start) &&
(block_start < from || block_end > to)) {
- ll_rw_block(REQ_OP_READ, 1, &bh);
+ bh_read_nowait(bh, 0);
*wait_bh++=bh;
}
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index b13d344d40b6..60b97c92e2b2 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -335,7 +335,7 @@ static void o2hb_arm_timeout(struct o2hb_region *reg)
/* negotiate timeout must be less than write timeout. */
schedule_delayed_work(&reg->hr_nego_timeout_work,
msecs_to_jiffies(O2HB_NEGO_TIMEOUT_MS));
- memset(reg->hr_nego_node_bitmap, 0, sizeof(reg->hr_nego_node_bitmap));
+ bitmap_zero(reg->hr_nego_node_bitmap, O2NM_MAX_NODES);
}
static void o2hb_disarm_timeout(struct o2hb_region *reg)
@@ -375,7 +375,7 @@ static void o2hb_nego_timeout(struct work_struct *work)
if (reg->hr_last_hb_status)
return;
- o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+ o2hb_fill_node_map(live_node_bitmap, O2NM_MAX_NODES);
/* lowest node as master node to make negotiate decision. */
master_node = find_first_bit(live_node_bitmap, O2NM_MAX_NODES);
@@ -386,8 +386,8 @@ static void o2hb_nego_timeout(struct work_struct *work)
config_item_name(&reg->hr_item), reg->hr_bdev);
set_bit(master_node, reg->hr_nego_node_bitmap);
}
- if (memcmp(reg->hr_nego_node_bitmap, live_node_bitmap,
- sizeof(reg->hr_nego_node_bitmap))) {
+ if (!bitmap_equal(reg->hr_nego_node_bitmap, live_node_bitmap,
+ O2NM_MAX_NODES)) {
/* check negotiate bitmap every second to do timeout
* approve decision.
*/
@@ -856,8 +856,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg)
* live nodes heartbeat on it. In other words, the region has been
* added to all nodes.
*/
- if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
- sizeof(o2hb_live_node_bitmap)))
+ if (!bitmap_equal(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
+ O2NM_MAX_NODES))
goto unlock;
printk(KERN_NOTICE "o2hb: Region %s (%pg) is now a quorum device\n",
@@ -1087,7 +1087,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
* If a node is not configured but is in the livemap, we still need
* to read the slot so as to be able to remove it from the livemap.
*/
- o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+ o2hb_fill_node_map(live_node_bitmap, O2NM_MAX_NODES);
i = -1;
while ((i = find_next_bit(live_node_bitmap,
O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
@@ -1437,11 +1437,11 @@ void o2hb_init(void)
for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++)
INIT_LIST_HEAD(&o2hb_live_slots[i]);
- memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
- memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
- memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap));
- memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
- memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
+ bitmap_zero(o2hb_live_node_bitmap, O2NM_MAX_NODES);
+ bitmap_zero(o2hb_region_bitmap, O2NM_MAX_REGIONS);
+ bitmap_zero(o2hb_live_region_bitmap, O2NM_MAX_REGIONS);
+ bitmap_zero(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS);
+ bitmap_zero(o2hb_failed_region_bitmap, O2NM_MAX_REGIONS);
o2hb_dependent_users = 0;
@@ -1450,23 +1450,21 @@ void o2hb_init(void)
/* if we're already in a callback then we're already serialized by the sem */
static void o2hb_fill_node_map_from_callback(unsigned long *map,
- unsigned bytes)
+ unsigned int bits)
{
- BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
-
- memcpy(map, &o2hb_live_node_bitmap, bytes);
+ bitmap_copy(map, o2hb_live_node_bitmap, bits);
}
/*
* get a map of all nodes that are heartbeating in any regions
*/
-void o2hb_fill_node_map(unsigned long *map, unsigned bytes)
+void o2hb_fill_node_map(unsigned long *map, unsigned int bits)
{
/* callers want to serialize this map and callbacks so that they
* can trust that they don't miss nodes coming to the party */
down_read(&o2hb_callback_sem);
spin_lock(&o2hb_live_lock);
- o2hb_fill_node_map_from_callback(map, bytes);
+ o2hb_fill_node_map_from_callback(map, bits);
spin_unlock(&o2hb_live_lock);
up_read(&o2hb_callback_sem);
}
@@ -2460,7 +2458,7 @@ int o2hb_check_node_heartbeating_no_sem(u8 node_num)
unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
spin_lock(&o2hb_live_lock);
- o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
+ o2hb_fill_node_map_from_callback(testing_map, O2NM_MAX_NODES);
spin_unlock(&o2hb_live_lock);
if (!test_bit(node_num, testing_map)) {
mlog(ML_HEARTBEAT,
@@ -2477,7 +2475,7 @@ int o2hb_check_node_heartbeating_from_callback(u8 node_num)
{
unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
- o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
+ o2hb_fill_node_map_from_callback(testing_map, O2NM_MAX_NODES);
if (!test_bit(node_num, testing_map)) {
mlog(ML_HEARTBEAT,
"node (%u) does not have heartbeating enabled.\n",
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 1d4100abf6f8..8ef8c1b9eeb7 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -59,7 +59,7 @@ int o2hb_register_callback(const char *region_uuid,
void o2hb_unregister_callback(const char *region_uuid,
struct o2hb_callback_func *hc);
void o2hb_fill_node_map(unsigned long *map,
- unsigned bytes);
+ unsigned int bits);
void o2hb_exit(void);
void o2hb_init(void);
int o2hb_check_node_heartbeating_no_sem(u8 node_num);
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 7524994e3199..35c05c18de59 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -438,7 +438,7 @@ static int o2net_fill_bitmap(char *buf, int len)
unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
int i = -1, out = 0;
- o2net_fill_node_map(map, sizeof(map));
+ o2net_fill_node_map(map, O2NM_MAX_NODES);
while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
out += scnprintf(buf + out, PAGE_SIZE - out, "%d ", i);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 27fee68f860a..2f61d39e4e50 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -54,7 +54,7 @@ int o2nm_configured_node_map(unsigned long *map, unsigned bytes)
return -EINVAL;
read_lock(&cluster->cl_nodes_lock);
- memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap));
+ bitmap_copy(map, cluster->cl_nodes_bitmap, O2NM_MAX_NODES);
read_unlock(&cluster->cl_nodes_lock);
return 0;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index f660c0dbdb63..a07b24d170f2 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -900,7 +900,7 @@ static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len)
{
struct kvec vec = { .iov_len = len, .iov_base = data, };
struct msghdr msg = { .msg_flags = MSG_DONTWAIT, };
- iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, len);
+ iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, len);
return sock_recvmsg(sock, &msg, MSG_DONTWAIT);
}
@@ -990,14 +990,12 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
}
/* Get a map of all nodes to which this node is currently connected to */
-void o2net_fill_node_map(unsigned long *map, unsigned bytes)
+void o2net_fill_node_map(unsigned long *map, unsigned int bits)
{
struct o2net_sock_container *sc;
int node, ret;
- BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
-
- memset(map, 0, bytes);
+ bitmap_zero(map, bits);
for (node = 0; node < O2NM_MAX_NODES; ++node) {
if (!o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret))
continue;
@@ -1604,6 +1602,7 @@ static void o2net_start_connect(struct work_struct *work)
sc->sc_sock = sock; /* freed by sc_kref_release */
sock->sk->sk_allocation = GFP_ATOMIC;
+ sock->sk->sk_use_task_frag = false;
myaddr.sin_family = AF_INET;
myaddr.sin_addr.s_addr = mynode->nd_ipv4_address;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 81c3d65d68fe..694471fc46b8 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2032,7 +2032,7 @@ struct ocfs2_empty_dir_priv {
unsigned seen_other;
unsigned dx_dir;
};
-static int ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name,
+static bool ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name,
int name_len, loff_t pos, u64 ino,
unsigned type)
{
@@ -2052,7 +2052,7 @@ static int ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name,
*/
if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) {
p->seen_dot = 1;
- return 0;
+ return true;
}
if (name_len == 2 && !strncmp("..", name, 2) &&
@@ -2060,13 +2060,13 @@ static int ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name,
p->seen_dot_dot = 1;
if (p->dx_dir && p->seen_dot)
- return 1;
+ return false;
- return 0;
+ return true;
}
p->seen_other = 1;
- return 1;
+ return false;
}
static int ocfs2_empty_dir_dx(struct inode *inode,
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index fd2022712167..20f790a47484 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -1094,7 +1094,7 @@ static inline enum dlm_status dlm_err_to_dlm_status(int err)
static inline void dlm_node_iter_init(unsigned long *map,
struct dlm_node_iter *iter)
{
- memcpy(iter->node_map, map, sizeof(iter->node_map));
+ bitmap_copy(iter->node_map, map, O2NM_MAX_NODES);
iter->curnode = -1;
}
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index c4eccd499db8..5c04dde99981 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1576,8 +1576,8 @@ static int dlm_should_restart_join(struct dlm_ctxt *dlm,
spin_lock(&dlm->spinlock);
/* For now, we restart the process if the node maps have
* changed at all */
- ret = memcmp(ctxt->live_map, dlm->live_nodes_map,
- sizeof(dlm->live_nodes_map));
+ ret = !bitmap_equal(ctxt->live_map, dlm->live_nodes_map,
+ O2NM_MAX_NODES);
spin_unlock(&dlm->spinlock);
if (ret)
@@ -1604,13 +1604,11 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
/* group sem locking should work for us here -- we're already
* registered for heartbeat events so filling this should be
* atomic wrt getting those handlers called. */
- o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map));
+ o2hb_fill_node_map(dlm->live_nodes_map, O2NM_MAX_NODES);
spin_lock(&dlm->spinlock);
- memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map));
-
+ bitmap_copy(ctxt->live_map, dlm->live_nodes_map, O2NM_MAX_NODES);
__dlm_set_joining_node(dlm, dlm->node_num);
-
spin_unlock(&dlm->spinlock);
node = -1;
@@ -1643,8 +1641,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
* yes_resp_map. Copy that into our domain map and send a join
* assert message to clean up everyone elses state. */
spin_lock(&dlm->spinlock);
- memcpy(dlm->domain_map, ctxt->yes_resp_map,
- sizeof(ctxt->yes_resp_map));
+ bitmap_copy(dlm->domain_map, ctxt->yes_resp_map, O2NM_MAX_NODES);
set_bit(dlm->node_num, dlm->domain_map);
spin_unlock(&dlm->spinlock);
@@ -2009,9 +2006,9 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n",
dlm->recovery_map, &(dlm->recovery_map[0]));
- memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map));
- memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map));
- memset(dlm->domain_map, 0, sizeof(dlm->domain_map));
+ bitmap_zero(dlm->recovery_map, O2NM_MAX_NODES);
+ bitmap_zero(dlm->live_nodes_map, O2NM_MAX_NODES);
+ bitmap_zero(dlm->domain_map, O2NM_MAX_NODES);
dlm->dlm_thread_task = NULL;
dlm->dlm_reco_thread_task = NULL;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 227da5b1b6ab..d610da8e2f24 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -258,12 +258,12 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
mle->type = type;
INIT_HLIST_NODE(&mle->master_hash_node);
INIT_LIST_HEAD(&mle->hb_events);
- memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
+ bitmap_zero(mle->maybe_map, O2NM_MAX_NODES);
spin_lock_init(&mle->spinlock);
init_waitqueue_head(&mle->wq);
atomic_set(&mle->woken, 0);
kref_init(&mle->mle_refs);
- memset(mle->response_map, 0, sizeof(mle->response_map));
+ bitmap_zero(mle->response_map, O2NM_MAX_NODES);
mle->master = O2NM_MAX_NODES;
mle->new_master = O2NM_MAX_NODES;
mle->inuse = 0;
@@ -290,8 +290,8 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
atomic_inc(&dlm->mle_cur_count[mle->type]);
/* copy off the node_map and register hb callbacks on our copy */
- memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
- memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
+ bitmap_copy(mle->node_map, dlm->domain_map, O2NM_MAX_NODES);
+ bitmap_copy(mle->vote_map, dlm->domain_map, O2NM_MAX_NODES);
clear_bit(dlm->node_num, mle->vote_map);
clear_bit(dlm->node_num, mle->node_map);
@@ -572,7 +572,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
spin_unlock(&dlm->track_lock);
memset(res->lvb, 0, DLM_LVB_LEN);
- memset(res->refmap, 0, sizeof(res->refmap));
+ bitmap_zero(res->refmap, O2NM_MAX_NODES);
}
struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
@@ -1036,10 +1036,10 @@ recheck:
spin_lock(&mle->spinlock);
m = mle->master;
- map_changed = (memcmp(mle->vote_map, mle->node_map,
- sizeof(mle->vote_map)) != 0);
- voting_done = (memcmp(mle->vote_map, mle->response_map,
- sizeof(mle->vote_map)) == 0);
+ map_changed = !bitmap_equal(mle->vote_map, mle->node_map,
+ O2NM_MAX_NODES);
+ voting_done = bitmap_equal(mle->vote_map, mle->response_map,
+ O2NM_MAX_NODES);
/* restart if we hit any errors */
if (map_changed) {
@@ -1277,11 +1277,11 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
/* now blank out everything, as if we had never
* contacted anyone */
- memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
- memset(mle->response_map, 0, sizeof(mle->response_map));
+ bitmap_zero(mle->maybe_map, O2NM_MAX_NODES);
+ bitmap_zero(mle->response_map, O2NM_MAX_NODES);
/* reset the vote_map to the current node_map */
- memcpy(mle->vote_map, mle->node_map,
- sizeof(mle->node_map));
+ bitmap_copy(mle->vote_map, mle->node_map,
+ O2NM_MAX_NODES);
/* put myself into the maybe map */
if (mle->type != DLM_MLE_BLOCK)
set_bit(dlm->node_num, mle->maybe_map);
@@ -2094,7 +2094,7 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
flags = item->u.am.flags;
spin_lock(&dlm->spinlock);
- memcpy(nodemap, dlm->domain_map, sizeof(nodemap));
+ bitmap_copy(nodemap, dlm->domain_map, O2NM_MAX_NODES);
spin_unlock(&dlm->spinlock);
clear_bit(dlm->node_num, nodemap);
@@ -3447,7 +3447,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
ret = 0;
}
- memset(iter.node_map, 0, sizeof(iter.node_map));
+ bitmap_zero(iter.node_map, O2NM_MAX_NODES);
set_bit(old_master, iter.node_map);
mlog(0, "doing assert master of %.*s back to %u\n",
res->lockname.len, res->lockname.name, old_master);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 52ad342fec3e..50da8af988c1 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -733,7 +733,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
struct dlm_reco_node_data *ndata;
spin_lock(&dlm->spinlock);
- memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map));
+ bitmap_copy(dlm->reco.node_map, dlm->domain_map, O2NM_MAX_NODES);
/* nodes can only be removed (by dying) after dropping
* this lock, and death will be trapped later, so this should do */
spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 9c67edd215d5..5c60b6bc85bf 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1991,7 +1991,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
}
}
- if (file && should_remove_suid(file->f_path.dentry)) {
+ if (file && setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) {
ret = __ocfs2_write_remove_suid(inode, di_bh);
if (ret) {
mlog_errno(ret);
@@ -2279,7 +2279,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
* inode. There's also the dinode i_size state which
* can be lost via setattr during extending writes (we
* set inode->i_size at the end of a write. */
- if (should_remove_suid(dentry)) {
+ if (setattr_should_drop_suidgid(&init_user_ns, inode)) {
if (meta_level == 0) {
ocfs2_inode_unlock_for_extent_tree(inode,
&di_bh,
@@ -2712,7 +2712,7 @@ const struct inode_operations ocfs2_file_iops = {
.permission = ocfs2_permission,
.listxattr = ocfs2_listxattr,
.fiemap = ocfs2_fiemap,
- .get_acl = ocfs2_iop_get_acl,
+ .get_inode_acl = ocfs2_iop_get_acl,
.set_acl = ocfs2_iop_set_acl,
.fileattr_get = ocfs2_fileattr_get,
.fileattr_set = ocfs2_fileattr_set,
@@ -2722,7 +2722,7 @@ const struct inode_operations ocfs2_special_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
.permission = ocfs2_permission,
- .get_acl = ocfs2_iop_get_acl,
+ .get_inode_acl = ocfs2_iop_get_acl,
.set_acl = ocfs2_iop_set_acl,
};
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index fa87d89cf754..3fb98b4569a2 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -157,7 +157,7 @@ static void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
replay_map->rm_state = REPLAY_DONE;
}
-static void ocfs2_free_replay_slots(struct ocfs2_super *osb)
+void ocfs2_free_replay_slots(struct ocfs2_super *osb)
{
struct ocfs2_replay_map *replay_map = osb->replay_map;
@@ -2057,7 +2057,7 @@ struct ocfs2_orphan_filldir_priv {
enum ocfs2_orphan_reco_type orphan_reco_type;
};
-static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
+static bool ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
int name_len, loff_t pos, u64 ino,
unsigned type)
{
@@ -2066,21 +2066,21 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
struct inode *iter;
if (name_len == 1 && !strncmp(".", name, 1))
- return 0;
+ return true;
if (name_len == 2 && !strncmp("..", name, 2))
- return 0;
+ return true;
/* do not include dio entry in case of orphan scan */
if ((p->orphan_reco_type == ORPHAN_NO_NEED_TRUNCATE) &&
(!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
OCFS2_DIO_ORPHAN_PREFIX_LEN)))
- return 0;
+ return true;
/* Skip bad inodes so that recovery can continue */
iter = ocfs2_iget(p->osb, ino,
OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
if (IS_ERR(iter))
- return 0;
+ return true;
if (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
OCFS2_DIO_ORPHAN_PREFIX_LEN))
@@ -2090,7 +2090,7 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
* happen concurrently with unlink/rename */
if (OCFS2_I(iter)->ip_next_orphan) {
iput(iter);
- return 0;
+ return true;
}
trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno);
@@ -2099,7 +2099,7 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
OCFS2_I(iter)->ip_next_orphan = p->head;
p->head = iter;
- return 0;
+ return true;
}
static int ocfs2_queue_orphans(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 969d0aa28718..41c382f68529 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -150,6 +150,7 @@ int ocfs2_recovery_init(struct ocfs2_super *osb);
void ocfs2_recovery_exit(struct ocfs2_super *osb);
int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
+void ocfs2_free_replay_slots(struct ocfs2_super *osb);
/*
* Journal Control:
* Initialize, Load, Shutdown, Wipe a journal.
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 961d1cf54388..a8fd51afb794 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -232,6 +232,7 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns,
handle_t *handle = NULL;
struct ocfs2_super *osb;
struct ocfs2_dinode *dirfe;
+ struct ocfs2_dinode *fe = NULL;
struct buffer_head *new_fe_bh = NULL;
struct inode *inode = NULL;
struct ocfs2_alloc_context *inode_ac = NULL;
@@ -382,6 +383,7 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns,
goto leave;
}
+ fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
if (S_ISDIR(mode)) {
status = ocfs2_fill_new_dir(osb, handle, dir, inode,
new_fe_bh, data_ac, meta_ac);
@@ -454,8 +456,11 @@ roll_back:
leave:
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
- if (handle)
+ if (handle) {
+ if (status < 0 && fe)
+ ocfs2_set_links_count(fe, 0);
ocfs2_commit_trans(osb, handle);
+ }
ocfs2_inode_unlock(dir, 1);
if (did_block_signals)
@@ -632,18 +637,9 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
return status;
}
- status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
+ return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
parent_fe_bh, handle, inode_ac,
fe_blkno, suballoc_loc, suballoc_bit);
- if (status < 0) {
- u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit);
- int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode,
- inode_ac->ac_bh, suballoc_bit, bg_blkno, 1);
- if (tmp)
- mlog_errno(tmp);
- }
-
- return status;
}
static int ocfs2_mkdir(struct user_namespace *mnt_userns,
@@ -2028,8 +2024,11 @@ bail:
ocfs2_clusters_to_bytes(osb->sb, 1));
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
- if (handle)
+ if (handle) {
+ if (status < 0 && fe)
+ ocfs2_set_links_count(fe, 0);
ocfs2_commit_trans(osb, handle);
+ }
ocfs2_inode_unlock(dir, 1);
if (did_block_signals)
@@ -2916,7 +2915,7 @@ const struct inode_operations ocfs2_dir_iops = {
.permission = ocfs2_permission,
.listxattr = ocfs2_listxattr,
.fiemap = ocfs2_fiemap,
- .get_acl = ocfs2_iop_get_acl,
+ .get_inode_acl = ocfs2_iop_get_acl,
.set_acl = ocfs2_iop_set_acl,
.fileattr_get = ocfs2_fileattr_get,
.fileattr_set = ocfs2_fileattr_set,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 740b64238312..a503c553bab2 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -560,8 +560,7 @@ static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di)
u32 nlink = le16_to_cpu(di->i_links_count);
u32 hi = le16_to_cpu(di->i_links_count_hi);
- if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL))
- nlink |= (hi << OCFS2_LINKS_HI_SHIFT);
+ nlink |= (hi << OCFS2_LINKS_HI_SHIFT);
return nlink;
}
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 638d875eccc7..7aebdbf5cc0a 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -527,7 +527,7 @@ struct ocfs2_extent_block
* value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty.
*/
struct ocfs2_slot_map {
-/*00*/ __le16 sm_slots[0];
+/*00*/ DECLARE_FLEX_ARRAY(__le16, sm_slots);
/*
* Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255,
* 255 * sizeof(__le16) == 512B, within the 512B block minimum blocksize.
@@ -548,7 +548,7 @@ struct ocfs2_extended_slot {
* i_size.
*/
struct ocfs2_slot_map_extended {
-/*00*/ struct ocfs2_extended_slot se_slots[0];
+/*00*/ DECLARE_FLEX_ARRAY(struct ocfs2_extended_slot, se_slots);
/*
* Actual size is i_size of the slot_map system file. It should
* match s_max_slots * sizeof(struct ocfs2_extended_slot)
@@ -727,7 +727,7 @@ struct ocfs2_dinode {
struct ocfs2_extent_list i_list;
struct ocfs2_truncate_log i_dealloc;
struct ocfs2_inline_data i_data;
- __u8 i_symlink[0];
+ DECLARE_FLEX_ARRAY(__u8, i_symlink);
} id2;
/* Actual on-disk size is one block */
};
@@ -892,7 +892,7 @@ struct ocfs2_group_desc
/*30*/ struct ocfs2_block_check bg_check; /* Error checking */
__le64 bg_reserved2;
/*40*/ union {
- __u8 bg_bitmap[0];
+ DECLARE_FLEX_ARRAY(__u8, bg_bitmap);
struct {
/*
* Block groups may be discontiguous when
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 1358981e80a3..623db358b1ef 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2614,7 +2614,7 @@ static inline unsigned int ocfs2_cow_align_length(struct super_block *sb,
}
/*
- * Calculate out the start and number of virtual clusters we need to to CoW.
+ * Calculate out the start and number of virtual clusters we need to CoW.
*
* cpos is vitual start cluster position we want to do CoW in a
* file and write_len is the cluster length.
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 88f75f7f02d7..c973c03f6fd8 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -273,17 +273,17 @@ static int o2cb_cluster_check(void)
*/
#define O2CB_MAP_STABILIZE_COUNT 60
for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) {
- o2hb_fill_node_map(hbmap, sizeof(hbmap));
+ o2hb_fill_node_map(hbmap, O2NM_MAX_NODES);
if (!test_bit(node_num, hbmap)) {
printk(KERN_ERR "o2cb: %s heartbeat has not been "
"started.\n", (o2hb_global_heartbeat_active() ?
"Global" : "Local"));
return -EINVAL;
}
- o2net_fill_node_map(netmap, sizeof(netmap));
+ o2net_fill_node_map(netmap, O2NM_MAX_NODES);
/* Force set the current node to allow easy compare */
set_bit(node_num, netmap);
- if (!memcmp(hbmap, netmap, sizeof(hbmap)))
+ if (bitmap_equal(hbmap, netmap, O2NM_MAX_NODES))
return 0;
if (i < O2CB_MAP_STABILIZE_COUNT - 1)
msleep(1000);
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index a75e2b7d67f5..64e6ddcfe329 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -991,7 +991,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
lc->oc_type = NO_CONTROLD;
rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name,
- DLM_LSFL_FS | DLM_LSFL_NEWEXCL, DLM_LVB_LEN,
+ DLM_LSFL_NEWEXCL, DLM_LVB_LEN,
&ocfs2_ls_ops, conn, &ops_rv, &fsdlm);
if (rc) {
if (rc == -EEXIST || rc == -EPROTO)
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index dd77b7aaabf5..a8d5ca98fa57 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -334,10 +334,10 @@ int ocfs2_cluster_connect(const char *stack_name,
goto out;
}
- strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1);
+ strscpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1);
new_conn->cc_namelen = grouplen;
if (cluster_name_len)
- strlcpy(new_conn->cc_cluster_name, cluster_name,
+ strscpy(new_conn->cc_cluster_name, cluster_name,
CLUSTER_NAME_MAX + 1);
new_conn->cc_cluster_name_len = cluster_name_len;
new_conn->cc_recovery_handler = recovery_handler;
@@ -669,6 +669,8 @@ static struct ctl_table_header *ocfs2_table_header;
static int __init ocfs2_stack_glue_init(void)
{
+ int ret;
+
strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB);
ocfs2_table_header = register_sysctl("fs/ocfs2/nm", ocfs2_nm_table);
@@ -678,7 +680,11 @@ static int __init ocfs2_stack_glue_init(void)
return -ENOMEM; /* or something. */
}
- return ocfs2_sysfs_init();
+ ret = ocfs2_sysfs_init();
+ if (ret)
+ unregister_sysctl_table(ocfs2_table_header);
+
+ return ret;
}
static void __exit ocfs2_stack_glue_exit(void)
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 5805a03d100b..9c74eace3adc 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -106,7 +106,7 @@ int ocfs2_claim_clusters(handle_t *handle,
u32 *cluster_start,
u32 *num_clusters);
/*
- * Use this variant of ocfs2_claim_clusters to specify a maxiumum
+ * Use this variant of ocfs2_claim_clusters to specify a maximum
* number of clusters smaller than the allocation reserved.
*/
int __ocfs2_claim_clusters(handle_t *handle,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index e2cc9eec287c..0b0e6a132101 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1159,6 +1159,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
out_dismount:
atomic_set(&osb->vol_state, VOLUME_DISABLED);
wake_up(&osb->osb_mount_event);
+ ocfs2_free_replay_slots(osb);
ocfs2_dismount_volume(sb, 1);
goto out;
@@ -1764,9 +1765,7 @@ static int ocfs2_get_sector(struct super_block *sb,
if (!buffer_dirty(*bh))
clear_buffer_uptodate(*bh);
unlock_buffer(*bh);
- ll_rw_block(REQ_OP_READ, 1, bh);
- wait_on_buffer(*bh);
- if (!buffer_uptodate(*bh)) {
+ if (bh_read(*bh, 0) < 0) {
mlog_errno(-EIO);
brelse(*bh);
*bh = NULL;
@@ -1824,12 +1823,14 @@ static int ocfs2_mount_volume(struct super_block *sb)
status = ocfs2_truncate_log_init(osb);
if (status < 0) {
mlog_errno(status);
- goto out_system_inodes;
+ goto out_check_volume;
}
ocfs2_super_unlock(osb, 1);
return 0;
+out_check_volume:
+ ocfs2_free_replay_slots(osb);
out_system_inodes:
if (osb->local_alloc_state == OCFS2_LA_ENABLED)
ocfs2_shutdown_local_alloc(osb);
@@ -2221,7 +2222,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto out_journal;
}
- strlcpy(osb->vol_label, di->id2.i_super.s_label,
+ strscpy(osb->vol_label, di->id2.i_super.s_label,
OCFS2_MAX_VOL_LABEL_LEN);
osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno);
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index fa7fe2393ff6..3a5b4b88a583 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -294,11 +294,6 @@ static void omfs_readahead(struct readahead_control *rac)
mpage_readahead(rac, omfs_get_block);
}
-static int omfs_writepage(struct page *page, struct writeback_control *wbc)
-{
- return block_write_full_page(page, omfs_get_block, wbc);
-}
-
static int
omfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
@@ -375,10 +370,10 @@ const struct address_space_operations omfs_aops = {
.invalidate_folio = block_invalidate_folio,
.read_folio = omfs_read_folio,
.readahead = omfs_readahead,
- .writepage = omfs_writepage,
.writepages = omfs_writepages,
.write_begin = omfs_write_begin,
.write_end = generic_write_end,
.bmap = omfs_bmap,
+ .migrate_folio = buffer_migrate_folio,
};
diff --git a/fs/open.c b/fs/open.c
index cf7e5c350a54..82c1a28b3308 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -54,7 +54,7 @@ int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry,
}
/* Remove suid, sgid, and file capabilities on truncate too */
- ret = dentry_needs_remove_privs(dentry);
+ ret = dentry_needs_remove_privs(mnt_userns, dentry);
if (ret < 0)
return ret;
if (ret)
@@ -188,7 +188,7 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
if (IS_APPEND(file_inode(f.file)))
goto out_putf;
sb_start_write(inode->i_sb);
- error = security_path_truncate(&f.file->f_path);
+ error = security_file_truncate(f.file);
if (!error)
error = do_truncate(file_mnt_user_ns(f.file), dentry, length,
ATTR_MTIME | ATTR_CTIME, f.file);
@@ -723,10 +723,10 @@ retry_deleg:
return -EINVAL;
if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid))
return -EINVAL;
- if (!S_ISDIR(inode->i_mode))
- newattrs.ia_valid |=
- ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
inode_lock(inode);
+ if (!S_ISDIR(inode->i_mode))
+ newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV |
+ setattr_should_drop_sgid(mnt_userns, inode);
/* Continue to send actual fs values, not the mount values. */
error = security_path_chown(
path,
@@ -842,7 +842,9 @@ static int do_dentry_open(struct file *f,
return 0;
}
- if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
+ if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
+ i_readcount_inc(inode);
+ } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
error = get_write_access(inode);
if (unlikely(error))
goto cleanup_file;
@@ -882,8 +884,6 @@ static int do_dentry_open(struct file *f,
goto cleanup_all;
}
f->f_mode |= FMODE_OPENED;
- if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
- i_readcount_inc(inode);
if ((f->f_mode & FMODE_READ) &&
likely(f->f_op->read || f->f_op->read_iter))
f->f_mode |= FMODE_CAN_READ;
@@ -937,10 +937,7 @@ cleanup_all:
if (WARN_ON_ONCE(error > 0))
error = -EINVAL;
fops_put(f->f_op);
- if (f->f_mode & FMODE_WRITER) {
- put_write_access(inode);
- __mnt_drop_write(f->f_path.mnt);
- }
+ put_file_access(f);
cleanup_file:
path_put(&f->f_path);
f->f_path.mnt = NULL;
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
index 605e5a3506ec..c5da2091cefb 100644
--- a/fs/orangefs/acl.c
+++ b/fs/orangefs/acl.c
@@ -64,8 +64,7 @@ struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu)
return acl;
}
-static int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl,
- int type)
+int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
int error = 0;
void *value = NULL;
@@ -119,12 +118,13 @@ out:
return error;
}
-int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int orangefs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int error;
struct iattr iattr;
int rc;
+ struct inode *inode = d_inode(dentry);
memset(&iattr, 0, sizeof iattr);
@@ -153,46 +153,7 @@ int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
rc = __orangefs_set_acl(inode, acl, type);
if (!rc && (iattr.ia_valid == ATTR_MODE))
- rc = __orangefs_setattr(inode, &iattr);
+ rc = __orangefs_setattr_mode(dentry, &iattr);
return rc;
}
-
-int orangefs_init_acl(struct inode *inode, struct inode *dir)
-{
- struct posix_acl *default_acl, *acl;
- umode_t mode = inode->i_mode;
- struct iattr iattr;
- int error = 0;
-
- error = posix_acl_create(dir, &mode, &default_acl, &acl);
- if (error)
- return error;
-
- if (default_acl) {
- error = __orangefs_set_acl(inode, default_acl,
- ACL_TYPE_DEFAULT);
- posix_acl_release(default_acl);
- } else {
- inode->i_default_acl = NULL;
- }
-
- if (acl) {
- if (!error)
- error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS);
- posix_acl_release(acl);
- } else {
- inode->i_acl = NULL;
- }
-
- /* If mode of the inode was changed, then do a forcible ->setattr */
- if (mode != inode->i_mode) {
- memset(&iattr, 0, sizeof iattr);
- inode->i_mode = mode;
- iattr.ia_mode = mode;
- iattr.ia_valid |= ATTR_MODE;
- __orangefs_setattr(inode, &iattr);
- }
-
- return error;
-}
diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c
index e2c2699d8016..9cacce5d55c1 100644
--- a/fs/orangefs/dir.c
+++ b/fs/orangefs/dir.c
@@ -398,7 +398,7 @@ static int orangefs_dir_release(struct inode *inode, struct file *file)
const struct file_operations orangefs_dir_operations = {
.llseek = orangefs_dir_llseek,
.read = generic_read_dir,
- .iterate = orangefs_dir_iterate,
+ .iterate_shared = orangefs_dir_iterate,
.open = orangefs_dir_open,
.release = orangefs_dir_release
};
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 86810e5d7914..167fa43b24f9 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -273,7 +273,6 @@ out:
gossip_debug(GOSSIP_FILE_DEBUG,
"%s(%pU): PUT buffer_index %d\n",
__func__, handle, buffer_index);
- buffer_index = -1;
}
op_release(new_op);
return ret;
@@ -417,9 +416,7 @@ static int orangefs_file_release(struct inode *inode, struct file *file)
* readahead cache (if any); this forces an expensive refresh of
* data for the next caller of mmap (or 'get_block' accesses)
*/
- if (file_inode(file) &&
- file_inode(file)->i_mapping &&
- mapping_nrpages(&file_inode(file)->i_data)) {
+ if (mapping_nrpages(file->f_mapping)) {
if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) {
gossip_debug(GOSSIP_INODE_DEBUG,
"calling flush_racache on %pU\n",
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 7a8c0c6e698d..4df560894386 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -53,7 +53,7 @@ static int orangefs_writepage_locked(struct page *page,
bv.bv_len = wlen;
bv.bv_offset = off % PAGE_SIZE;
WARN_ON(wlen == 0);
- iov_iter_bvec(&iter, WRITE, &bv, 1, wlen);
+ iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, wlen);
ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen,
len, wr, NULL, NULL);
@@ -112,7 +112,7 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow,
else
ow->bv[i].bv_offset = 0;
}
- iov_iter_bvec(&iter, WRITE, ow->bv, ow->npages, ow->len);
+ iov_iter_bvec(&iter, ITER_SOURCE, ow->bv, ow->npages, ow->len);
WARN_ON(ow->off >= len);
if (ow->off + ow->len > len)
@@ -270,7 +270,7 @@ static void orangefs_readahead(struct readahead_control *rac)
offset = readahead_pos(rac);
i_pages = &rac->mapping->i_pages;
- iov_iter_xarray(&iter, READ, i_pages, offset, readahead_length(rac));
+ iov_iter_xarray(&iter, ITER_DEST, i_pages, offset, readahead_length(rac));
/* read in the pages. */
if ((ret = wait_for_direct_io(ORANGEFS_IO_READ, inode,
@@ -303,7 +303,7 @@ static int orangefs_read_folio(struct file *file, struct folio *folio)
bv.bv_page = &folio->page;
bv.bv_len = folio_size(folio);
bv.bv_offset = 0;
- iov_iter_bvec(&iter, READ, &bv, 1, folio_size(folio));
+ iov_iter_bvec(&iter, ITER_DEST, &bv, 1, folio_size(folio));
ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter,
folio_size(folio), inode->i_size, NULL, NULL, file);
@@ -530,7 +530,6 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
size_t count = iov_iter_count(iter);
ssize_t total_count = 0;
ssize_t ret = -EINVAL;
- int i = 0;
gossip_debug(GOSSIP_FILE_DEBUG,
"%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
@@ -556,7 +555,6 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
while (iov_iter_count(iter)) {
size_t each_count = iov_iter_count(iter);
size_t amt_complete;
- i++;
/* how much to transfer in this loop iteration */
if (each_count > orangefs_bufmap_size_query())
@@ -828,15 +826,23 @@ again:
spin_unlock(&inode->i_lock);
mark_inode_dirty(inode);
- if (iattr->ia_valid & ATTR_MODE)
- /* change mod on a file that has ACLs */
- ret = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
-
ret = 0;
out:
return ret;
}
+int __orangefs_setattr_mode(struct dentry *dentry, struct iattr *iattr)
+{
+ int ret;
+ struct inode *inode = d_inode(dentry);
+
+ ret = __orangefs_setattr(inode, iattr);
+ /* change mode on a file that has ACLs */
+ if (!ret && (iattr->ia_valid & ATTR_MODE))
+ ret = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode);
+ return ret;
+}
+
/*
* Change attributes of an object referenced by dentry.
*/
@@ -849,7 +855,7 @@ int orangefs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
ret = setattr_prepare(&init_user_ns, dentry, iattr);
if (ret)
goto out;
- ret = __orangefs_setattr(d_inode(dentry), iattr);
+ ret = __orangefs_setattr_mode(dentry, iattr);
sync_inode_metadata(d_inode(dentry), 1);
out:
gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n",
@@ -967,7 +973,7 @@ static int orangefs_fileattr_set(struct user_namespace *mnt_userns,
/* ORANGEFS2 implementation of VFS inode operations for files */
static const struct inode_operations orangefs_file_inode_operations = {
- .get_acl = orangefs_get_acl,
+ .get_inode_acl = orangefs_get_acl,
.set_acl = orangefs_set_acl,
.setattr = orangefs_setattr,
.getattr = orangefs_getattr,
@@ -1097,8 +1103,9 @@ struct inode *orangefs_iget(struct super_block *sb,
* Allocate an inode for a newly created file and insert it into the inode hash.
*/
struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
- int mode, dev_t dev, struct orangefs_object_kref *ref)
+ umode_t mode, dev_t dev, struct orangefs_object_kref *ref)
{
+ struct posix_acl *acl = NULL, *default_acl = NULL;
unsigned long hash = orangefs_handle_hash(ref);
struct inode *inode;
int error;
@@ -1115,6 +1122,10 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
if (!inode)
return ERR_PTR(-ENOMEM);
+ error = posix_acl_create(dir, &mode, &default_acl, &acl);
+ if (error)
+ goto out_iput;
+
orangefs_set_inode(inode, ref);
inode->i_ino = hash; /* needed for stat etc */
@@ -1125,6 +1136,19 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
orangefs_init_iops(inode);
inode->i_rdev = dev;
+ if (default_acl) {
+ error = __orangefs_set_acl(inode, default_acl,
+ ACL_TYPE_DEFAULT);
+ if (error)
+ goto out_iput;
+ }
+
+ if (acl) {
+ error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ if (error)
+ goto out_iput;
+ }
+
error = insert_inode_locked4(inode, hash, orangefs_test_inode, ref);
if (error < 0)
goto out_iput;
@@ -1132,10 +1156,22 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
gossip_debug(GOSSIP_INODE_DEBUG,
"Initializing ACL's for inode %pU\n",
get_khandle_from_ino(inode));
- orangefs_init_acl(inode, dir);
+ if (mode != inode->i_mode) {
+ struct iattr iattr = {
+ .ia_mode = mode,
+ .ia_valid = ATTR_MODE,
+ };
+ inode->i_mode = mode;
+ __orangefs_setattr(inode, &iattr);
+ __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
+ }
+ posix_acl_release(acl);
+ posix_acl_release(default_acl);
return inode;
out_iput:
iput(inode);
+ posix_acl_release(acl);
+ posix_acl_release(default_acl);
return ERR_PTR(error);
}
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 600e8eee541f..75c1a3dcf68c 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -430,7 +430,7 @@ static int orangefs_rename(struct user_namespace *mnt_userns,
/* ORANGEFS implementation of VFS inode operations for directories */
const struct inode_operations orangefs_dir_inode_operations = {
.lookup = orangefs_lookup,
- .get_acl = orangefs_get_acl,
+ .get_inode_acl = orangefs_get_acl,
.set_acl = orangefs_set_acl,
.create = orangefs_create,
.unlink = orangefs_unlink,
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index 29eaa4544372..1b508f543384 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -194,15 +194,10 @@ void orangefs_debugfs_init(int debug_mask)
*/
static void orangefs_kernel_debug_init(void)
{
- int rc = -ENOMEM;
- char *k_buffer = NULL;
+ static char k_buffer[ORANGEFS_MAX_DEBUG_STRING_LEN] = { };
gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
- k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
- if (!k_buffer)
- goto out;
-
if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
strcpy(k_buffer, kernel_debug_string);
strcat(k_buffer, "\n");
@@ -213,15 +208,14 @@ static void orangefs_kernel_debug_init(void)
debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE, 0444, debug_dir, k_buffer,
&kernel_debug_fops);
-
-out:
- gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
}
void orangefs_debugfs_cleanup(void)
{
debugfs_remove_recursive(debug_dir);
+ kfree(debug_help_string);
+ debug_help_string = NULL;
}
/* open ORANGEFS_KMOD_DEBUG_HELP_FILE */
@@ -297,18 +291,13 @@ static int help_show(struct seq_file *m, void *v)
/*
* initialize the client-debug file.
*/
-static int orangefs_client_debug_init(void)
+static void orangefs_client_debug_init(void)
{
- int rc = -ENOMEM;
- char *c_buffer = NULL;
+ static char c_buffer[ORANGEFS_MAX_DEBUG_STRING_LEN] = { };
gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
- c_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
- if (!c_buffer)
- goto out;
-
if (strlen(client_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
strcpy(c_buffer, client_debug_string);
strcat(c_buffer, "\n");
@@ -322,13 +311,6 @@ static int orangefs_client_debug_init(void)
debug_dir,
c_buffer,
&kernel_debug_fops);
-
- rc = 0;
-
-out:
-
- gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
- return rc;
}
/* open ORANGEFS_KMOD_DEBUG_FILE or ORANGEFS_CLIENT_DEBUG_FILE.*/
@@ -671,6 +653,7 @@ int orangefs_prepare_debugfs_help_string(int at_boot)
memset(debug_help_string, 0, DEBUG_HELP_STRING_SIZE);
strlcat(debug_help_string, new, string_size);
mutex_unlock(&orangefs_help_file_lock);
+ kfree(new);
}
rc = 0;
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index b5940ec1836a..6e0cc01b3a14 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -103,13 +103,13 @@ enum orangefs_vfs_op_states {
#define ORANGEFS_CACHE_CREATE_FLAGS 0
#endif
-extern int orangefs_init_acl(struct inode *inode, struct inode *dir);
extern const struct xattr_handler *orangefs_xattr_handlers[];
extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu);
extern int orangefs_set_acl(struct user_namespace *mnt_userns,
- struct inode *inode, struct posix_acl *acl,
+ struct dentry *dentry, struct posix_acl *acl,
int type);
+int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
/*
* orangefs data structures
@@ -356,11 +356,12 @@ void fsid_key_table_finalize(void);
vm_fault_t orangefs_page_mkwrite(struct vm_fault *);
struct inode *orangefs_new_inode(struct super_block *sb,
struct inode *dir,
- int mode,
+ umode_t mode,
dev_t dev,
struct orangefs_object_kref *ref);
int __orangefs_setattr(struct inode *, struct iattr *);
+int __orangefs_setattr_mode(struct dentry *dentry, struct iattr *iattr);
int orangefs_setattr(struct user_namespace *, struct dentry *, struct iattr *);
int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path,
diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c
index cd7297815f91..5ab741c60b7e 100644
--- a/fs/orangefs/orangefs-mod.c
+++ b/fs/orangefs/orangefs-mod.c
@@ -141,7 +141,7 @@ static int __init orangefs_init(void)
gossip_err("%s: could not initialize device subsystem %d!\n",
__func__,
ret);
- goto cleanup_device;
+ goto cleanup_sysfs;
}
ret = register_filesystem(&orangefs_fs_type);
@@ -152,11 +152,11 @@ static int __init orangefs_init(void)
goto out;
}
- orangefs_sysfs_exit();
-
-cleanup_device:
orangefs_dev_cleanup();
+cleanup_sysfs:
+ orangefs_sysfs_exit();
+
sysfs_init_failed:
orangefs_debugfs_cleanup();
diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c
index de80b62553bb..be4ba03a01a0 100644
--- a/fs/orangefs/orangefs-sysfs.c
+++ b/fs/orangefs/orangefs-sysfs.c
@@ -896,9 +896,18 @@ static struct attribute *orangefs_default_attrs[] = {
};
ATTRIBUTE_GROUPS(orangefs_default);
+static struct kobject *orangefs_obj;
+
+static void orangefs_obj_release(struct kobject *kobj)
+{
+ kfree(orangefs_obj);
+ orangefs_obj = NULL;
+}
+
static struct kobj_type orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = orangefs_default_groups,
+ .release = orangefs_obj_release,
};
static struct orangefs_attribute acache_hard_limit_attribute =
@@ -934,9 +943,18 @@ static struct attribute *acache_orangefs_default_attrs[] = {
};
ATTRIBUTE_GROUPS(acache_orangefs_default);
+static struct kobject *acache_orangefs_obj;
+
+static void acache_orangefs_obj_release(struct kobject *kobj)
+{
+ kfree(acache_orangefs_obj);
+ acache_orangefs_obj = NULL;
+}
+
static struct kobj_type acache_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = acache_orangefs_default_groups,
+ .release = acache_orangefs_obj_release,
};
static struct orangefs_attribute capcache_hard_limit_attribute =
@@ -972,9 +990,18 @@ static struct attribute *capcache_orangefs_default_attrs[] = {
};
ATTRIBUTE_GROUPS(capcache_orangefs_default);
+static struct kobject *capcache_orangefs_obj;
+
+static void capcache_orangefs_obj_release(struct kobject *kobj)
+{
+ kfree(capcache_orangefs_obj);
+ capcache_orangefs_obj = NULL;
+}
+
static struct kobj_type capcache_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = capcache_orangefs_default_groups,
+ .release = capcache_orangefs_obj_release,
};
static struct orangefs_attribute ccache_hard_limit_attribute =
@@ -1010,9 +1037,18 @@ static struct attribute *ccache_orangefs_default_attrs[] = {
};
ATTRIBUTE_GROUPS(ccache_orangefs_default);
+static struct kobject *ccache_orangefs_obj;
+
+static void ccache_orangefs_obj_release(struct kobject *kobj)
+{
+ kfree(ccache_orangefs_obj);
+ ccache_orangefs_obj = NULL;
+}
+
static struct kobj_type ccache_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = ccache_orangefs_default_groups,
+ .release = ccache_orangefs_obj_release,
};
static struct orangefs_attribute ncache_hard_limit_attribute =
@@ -1048,9 +1084,18 @@ static struct attribute *ncache_orangefs_default_attrs[] = {
};
ATTRIBUTE_GROUPS(ncache_orangefs_default);
+static struct kobject *ncache_orangefs_obj;
+
+static void ncache_orangefs_obj_release(struct kobject *kobj)
+{
+ kfree(ncache_orangefs_obj);
+ ncache_orangefs_obj = NULL;
+}
+
static struct kobj_type ncache_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = ncache_orangefs_default_groups,
+ .release = ncache_orangefs_obj_release,
};
static struct orangefs_attribute pc_acache_attribute =
@@ -1079,9 +1124,18 @@ static struct attribute *pc_orangefs_default_attrs[] = {
};
ATTRIBUTE_GROUPS(pc_orangefs_default);
+static struct kobject *pc_orangefs_obj;
+
+static void pc_orangefs_obj_release(struct kobject *kobj)
+{
+ kfree(pc_orangefs_obj);
+ pc_orangefs_obj = NULL;
+}
+
static struct kobj_type pc_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = pc_orangefs_default_groups,
+ .release = pc_orangefs_obj_release,
};
static struct orangefs_attribute stats_reads_attribute =
@@ -1103,19 +1157,20 @@ static struct attribute *stats_orangefs_default_attrs[] = {
};
ATTRIBUTE_GROUPS(stats_orangefs_default);
+static struct kobject *stats_orangefs_obj;
+
+static void stats_orangefs_obj_release(struct kobject *kobj)
+{
+ kfree(stats_orangefs_obj);
+ stats_orangefs_obj = NULL;
+}
+
static struct kobj_type stats_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = stats_orangefs_default_groups,
+ .release = stats_orangefs_obj_release,
};
-static struct kobject *orangefs_obj;
-static struct kobject *acache_orangefs_obj;
-static struct kobject *capcache_orangefs_obj;
-static struct kobject *ccache_orangefs_obj;
-static struct kobject *ncache_orangefs_obj;
-static struct kobject *pc_orangefs_obj;
-static struct kobject *stats_orangefs_obj;
-
int orangefs_sysfs_init(void)
{
int rc = -EINVAL;
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
index dd188c7996b3..6708e54b0e30 100644
--- a/fs/overlayfs/Kconfig
+++ b/fs/overlayfs/Kconfig
@@ -96,7 +96,7 @@ config OVERLAY_FS_XINO_AUTO
depends on 64BIT
help
If this config option is enabled then overlay filesystems will use
- unused high bits in undelying filesystem inode numbers to map all
+ unused high bits in underlying filesystem inode numbers to map all
inodes to a unified address space. The mapped 64bit inode numbers
might not be compatible with applications that expect 32bit inodes.
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index fdde6c56cc3d..6e4e65ee050d 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -44,7 +44,36 @@ static bool ovl_must_copy_xattr(const char *name)
!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
}
-int ovl_copy_xattr(struct super_block *sb, struct path *oldpath, struct dentry *new)
+static int ovl_copy_acl(struct ovl_fs *ofs, const struct path *path,
+ struct dentry *dentry, const char *acl_name)
+{
+ int err;
+ struct posix_acl *clone, *real_acl = NULL;
+
+ real_acl = ovl_get_acl_path(path, acl_name, false);
+ if (!real_acl)
+ return 0;
+
+ if (IS_ERR(real_acl)) {
+ err = PTR_ERR(real_acl);
+ if (err == -ENODATA || err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ clone = posix_acl_clone(real_acl, GFP_KERNEL);
+ posix_acl_release(real_acl); /* release original acl */
+ if (!clone)
+ return -ENOMEM;
+
+ err = ovl_do_set_acl(ofs, dentry, acl_name, clone);
+
+ /* release cloned acl */
+ posix_acl_release(clone);
+ return err;
+}
+
+int ovl_copy_xattr(struct super_block *sb, const struct path *oldpath, struct dentry *new)
{
struct dentry *old = oldpath->dentry;
ssize_t list_size, size, value_size = 0;
@@ -93,6 +122,15 @@ int ovl_copy_xattr(struct super_block *sb, struct path *oldpath, struct dentry *
error = 0;
continue; /* Discard */
}
+
+ if (is_posix_acl_xattr(name)) {
+ error = ovl_copy_acl(OVL_FS(sb), oldpath, new, name);
+ if (!error)
+ continue;
+ /* POSIX ACLs must be copied. */
+ break;
+ }
+
retry:
size = ovl_do_getxattr(oldpath, name, value, value_size);
if (size == -ERANGE)
@@ -132,8 +170,8 @@ out:
return error;
}
-static int ovl_copy_fileattr(struct inode *inode, struct path *old,
- struct path *new)
+static int ovl_copy_fileattr(struct inode *inode, const struct path *old,
+ const struct path *new)
{
struct fileattr oldfa = { .flags_valid = true };
struct fileattr newfa = { .flags_valid = true };
@@ -193,11 +231,11 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old,
return ovl_real_fileattr_set(new, &newfa);
}
-static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old,
- struct path *new, loff_t len)
+static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
+ struct file *new_file, loff_t len)
{
+ struct path datapath;
struct file *old_file;
- struct file *new_file;
loff_t old_pos = 0;
loff_t new_pos = 0;
loff_t cloned;
@@ -206,23 +244,18 @@ static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old,
bool skip_hole = false;
int error = 0;
- if (len == 0)
- return 0;
+ ovl_path_lowerdata(dentry, &datapath);
+ if (WARN_ON(datapath.dentry == NULL))
+ return -EIO;
- old_file = ovl_path_open(old, O_LARGEFILE | O_RDONLY);
+ old_file = ovl_path_open(&datapath, O_LARGEFILE | O_RDONLY);
if (IS_ERR(old_file))
return PTR_ERR(old_file);
- new_file = ovl_path_open(new, O_LARGEFILE | O_WRONLY);
- if (IS_ERR(new_file)) {
- error = PTR_ERR(new_file);
- goto out_fput;
- }
-
/* Try to use clone_file_range to clone up within the same fs */
cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0);
if (cloned == len)
- goto out;
+ goto out_fput;
/* Couldn't clone, so now we try to copy the data */
/* Check if lower fs supports seek operation */
@@ -282,10 +315,8 @@ static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old,
len -= bytes;
}
-out:
if (!error && ovl_should_sync(ofs))
error = vfs_fsync(new_file, 0);
- fput(new_file);
out_fput:
fput(old_file);
return error;
@@ -556,30 +587,31 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
return err;
}
-static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
+static int ovl_copy_up_data(struct ovl_copy_up_ctx *c, const struct path *temp)
{
struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
- struct inode *inode = d_inode(c->dentry);
- struct path upperpath, datapath;
+ struct file *new_file;
int err;
- ovl_path_upper(c->dentry, &upperpath);
- if (WARN_ON(upperpath.dentry != NULL))
- return -EIO;
+ if (!S_ISREG(c->stat.mode) || c->metacopy || !c->stat.size)
+ return 0;
- upperpath.dentry = temp;
+ new_file = ovl_path_open(temp, O_LARGEFILE | O_WRONLY);
+ if (IS_ERR(new_file))
+ return PTR_ERR(new_file);
- /*
- * Copy up data first and then xattrs. Writing data after
- * xattrs will remove security.capability xattr automatically.
- */
- if (S_ISREG(c->stat.mode) && !c->metacopy) {
- ovl_path_lowerdata(c->dentry, &datapath);
- err = ovl_copy_up_data(ofs, &datapath, &upperpath,
- c->stat.size);
- if (err)
- return err;
- }
+ err = ovl_copy_up_file(ofs, c->dentry, new_file, c->stat.size);
+ fput(new_file);
+
+ return err;
+}
+
+static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp)
+{
+ struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
+ struct inode *inode = d_inode(c->dentry);
+ struct path upperpath = { .mnt = ovl_upper_mnt(ofs), .dentry = temp };
+ int err;
err = ovl_copy_xattr(c->dentry->d_sb, &c->lowerpath, temp);
if (err)
@@ -662,6 +694,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
struct inode *inode;
struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir);
+ struct path path = { .mnt = ovl_upper_mnt(ofs) };
struct dentry *temp, *upper;
struct ovl_cu_creds cc;
int err;
@@ -688,7 +721,16 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
if (IS_ERR(temp))
goto unlock;
- err = ovl_copy_up_inode(c, temp);
+ /*
+ * Copy up data first and then xattrs. Writing data after
+ * xattrs will remove security.capability xattr automatically.
+ */
+ path.dentry = temp;
+ err = ovl_copy_up_data(c, &path);
+ if (err)
+ goto cleanup;
+
+ err = ovl_copy_up_metadata(c, temp);
if (err)
goto cleanup;
@@ -732,6 +774,7 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
struct inode *udir = d_inode(c->destdir);
struct dentry *temp, *upper;
+ struct file *tmpfile;
struct ovl_cu_creds cc;
int err;
@@ -739,15 +782,22 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
if (err)
return err;
- temp = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode);
+ tmpfile = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode);
ovl_revert_cu_creds(&cc);
- if (IS_ERR(temp))
- return PTR_ERR(temp);
+ if (IS_ERR(tmpfile))
+ return PTR_ERR(tmpfile);
+
+ temp = tmpfile->f_path.dentry;
+ if (!c->metacopy && c->stat.size) {
+ err = ovl_copy_up_file(ofs, c->dentry, tmpfile, c->stat.size);
+ if (err)
+ return err;
+ }
- err = ovl_copy_up_inode(c, temp);
+ err = ovl_copy_up_metadata(c, temp);
if (err)
- goto out_dput;
+ goto out_fput;
inode_lock_nested(udir, I_MUTEX_PARENT);
@@ -761,16 +811,14 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
inode_unlock(udir);
if (err)
- goto out_dput;
+ goto out_fput;
if (!c->metacopy)
ovl_set_upperdata(d_inode(c->dentry));
- ovl_inode_update(d_inode(c->dentry), temp);
+ ovl_inode_update(d_inode(c->dentry), dget(temp));
- return 0;
-
-out_dput:
- dput(temp);
+out_fput:
+ fput(tmpfile);
return err;
}
@@ -872,7 +920,7 @@ static bool ovl_need_meta_copy_up(struct dentry *dentry, umode_t mode,
return true;
}
-static ssize_t ovl_getxattr_value(struct path *path, char *name, char **value)
+static ssize_t ovl_getxattr_value(const struct path *path, char *name, char **value)
{
ssize_t res;
char *buf;
@@ -899,7 +947,7 @@ static ssize_t ovl_getxattr_value(struct path *path, char *name, char **value)
static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
{
struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
- struct path upperpath, datapath;
+ struct path upperpath;
int err;
char *capability = NULL;
ssize_t cap_size;
@@ -908,10 +956,6 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
if (WARN_ON(upperpath.dentry == NULL))
return -EIO;
- ovl_path_lowerdata(c->dentry, &datapath);
- if (WARN_ON(datapath.dentry == NULL))
- return -EIO;
-
if (c->stat.size) {
err = cap_size = ovl_getxattr_value(&upperpath, XATTR_NAME_CAPS,
&capability);
@@ -919,7 +963,7 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
goto out;
}
- err = ovl_copy_up_data(ofs, &datapath, &upperpath, c->stat.size);
+ err = ovl_copy_up_data(c, &upperpath);
if (err)
goto out_free;
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 6b03457f72bb..f61e37f4c8ff 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -435,28 +435,12 @@ out:
}
static int ovl_set_upper_acl(struct ovl_fs *ofs, struct dentry *upperdentry,
- const char *name, const struct posix_acl *acl)
+ const char *acl_name, struct posix_acl *acl)
{
- void *buffer;
- size_t size;
- int err;
-
if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !acl)
return 0;
- size = posix_acl_xattr_size(acl->a_count);
- buffer = kmalloc(size, GFP_KERNEL);
- if (!buffer)
- return -ENOMEM;
-
- err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
- if (err < 0)
- goto out_free;
-
- err = ovl_do_setxattr(ofs, upperdentry, name, buffer, size, XATTR_CREATE);
-out_free:
- kfree(buffer);
- return err;
+ return ovl_do_set_acl(ofs, upperdentry, acl_name, acl);
}
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
@@ -592,28 +576,42 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
goto out_revert_creds;
}
- err = -ENOMEM;
- override_cred = prepare_creds();
- if (override_cred) {
+ if (!attr->hardlink) {
+ err = -ENOMEM;
+ override_cred = prepare_creds();
+ if (!override_cred)
+ goto out_revert_creds;
+ /*
+ * In the creation cases(create, mkdir, mknod, symlink),
+ * ovl should transfer current's fs{u,g}id to underlying
+ * fs. Because underlying fs want to initialize its new
+ * inode owner using current's fs{u,g}id. And in this
+ * case, the @inode is a new inode that is initialized
+ * in inode_init_owner() to current's fs{u,g}id. So use
+ * the inode's i_{u,g}id to override the cred's fs{u,g}id.
+ *
+ * But in the other hardlink case, ovl_link() does not
+ * create a new inode, so just use the ovl mounter's
+ * fs{u,g}id.
+ */
override_cred->fsuid = inode->i_uid;
override_cred->fsgid = inode->i_gid;
- if (!attr->hardlink) {
- err = security_dentry_create_files_as(dentry,
- attr->mode, &dentry->d_name, old_cred,
- override_cred);
- if (err) {
- put_cred(override_cred);
- goto out_revert_creds;
- }
+ err = security_dentry_create_files_as(dentry,
+ attr->mode, &dentry->d_name, old_cred,
+ override_cred);
+ if (err) {
+ put_cred(override_cred);
+ goto out_revert_creds;
}
put_cred(override_creds(override_cred));
put_cred(override_cred);
-
- if (!ovl_dentry_is_whiteout(dentry))
- err = ovl_create_upper(dentry, inode, attr);
- else
- err = ovl_create_over_whiteout(dentry, inode, attr);
}
+
+ if (!ovl_dentry_is_whiteout(dentry))
+ err = ovl_create_upper(dentry, inode, attr);
+ else
+ err = ovl_create_over_whiteout(dentry, inode, attr);
+
out_revert_creds:
revert_creds(old_cred);
return err;
@@ -1311,7 +1309,9 @@ const struct inode_operations ovl_dir_inode_operations = {
.permission = ovl_permission,
.getattr = ovl_getattr,
.listxattr = ovl_listxattr,
+ .get_inode_acl = ovl_get_inode_acl,
.get_acl = ovl_get_acl,
+ .set_acl = ovl_set_acl,
.update_time = ovl_update_time,
.fileattr_get = ovl_fileattr_get,
.fileattr_set = ovl_fileattr_set,
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index e065a5b9a442..a25bb3453dde 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -339,7 +339,7 @@ out_iput:
return dentry;
}
-/* Get the upper or lower dentry in stach whose on layer @idx */
+/* Get the upper or lower dentry in stack whose on layer @idx */
static struct dentry *ovl_dentry_real_at(struct dentry *dentry, int idx)
{
struct ovl_entry *oe = dentry->d_fsdata;
@@ -463,7 +463,7 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb,
/* Get connected upper overlay dir from index */
if (index) {
- struct dentry *upper = ovl_index_upper(ofs, index);
+ struct dentry *upper = ovl_index_upper(ofs, index, true);
dput(index);
if (IS_ERR_OR_NULL(upper))
@@ -739,7 +739,7 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb,
/* Then try to get a connected upper dir by index */
if (index && d_is_dir(index)) {
- struct dentry *upper = ovl_index_upper(ofs, index);
+ struct dentry *upper = ovl_index_upper(ofs, index, true);
err = PTR_ERR(upper);
if (IS_ERR_OR_NULL(upper))
@@ -796,7 +796,7 @@ static struct ovl_fh *ovl_fid_to_fh(struct fid *fid, int buflen, int fh_type)
return ERR_PTR(-ENOMEM);
/* Copy unaligned inner fh into aligned buffer */
- memcpy(&fh->fb, fid, buflen - OVL_FH_WIRE_OFFSET);
+ memcpy(fh->buf, fid, buflen - OVL_FH_WIRE_OFFSET);
return fh;
}
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index daff601b5c41..c9d0c362c7ef 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -34,11 +34,11 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode)
return 'm';
}
-/* No atime modificaton nor notify on underlying */
+/* No atime modification nor notify on underlying */
#define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY)
static struct file *ovl_open_realfile(const struct file *file,
- struct path *realpath)
+ const struct path *realpath)
{
struct inode *realinode = d_inode(realpath->dentry);
struct inode *inode = file_inode(file);
@@ -96,6 +96,7 @@ static int ovl_change_flags(struct file *file, unsigned int flags)
spin_lock(&file->f_lock);
file->f_flags = (file->f_flags & ~OVL_SETFL_MASK) | flags;
+ file->f_iocb_flags = iocb_flags(file);
spin_unlock(&file->f_lock);
return 0;
@@ -517,9 +518,16 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
const struct cred *old_cred;
int ret;
+ inode_lock(inode);
+ /* Update mode */
+ ovl_copyattr(inode);
+ ret = file_remove_privs(file);
+ if (ret)
+ goto out_unlock;
+
ret = ovl_real_fdget(file, &real);
if (ret)
- return ret;
+ goto out_unlock;
old_cred = ovl_override_creds(file_inode(file)->i_sb);
ret = vfs_fallocate(real.file, mode, offset, len);
@@ -530,6 +538,9 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
fdput(real);
+out_unlock:
+ inode_unlock(inode);
+
return ret;
}
@@ -567,14 +578,23 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
const struct cred *old_cred;
loff_t ret;
+ inode_lock(inode_out);
+ if (op != OVL_DEDUPE) {
+ /* Update mode */
+ ovl_copyattr(inode_out);
+ ret = file_remove_privs(file_out);
+ if (ret)
+ goto out_unlock;
+ }
+
ret = ovl_real_fdget(file_out, &real_out);
if (ret)
- return ret;
+ goto out_unlock;
ret = ovl_real_fdget(file_in, &real_in);
if (ret) {
fdput(real_out);
- return ret;
+ goto out_unlock;
}
old_cred = ovl_override_creds(file_inode(file_out)->i_sb);
@@ -603,6 +623,9 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
fdput(real_in);
fdput(real_out);
+out_unlock:
+ inode_unlock(inode_out);
+
return ret;
}
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 0fbcb590af84..ee6dfa577c93 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -14,6 +14,8 @@
#include <linux/fileattr.h>
#include <linux/security.h>
#include <linux/namei.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
#include "overlayfs.h"
@@ -460,7 +462,7 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
* of the POSIX ACLs retrieved from the lower layer to this function to not
* alter the POSIX ACLs for the underlying filesystem.
*/
-static void ovl_idmap_posix_acl(struct inode *realinode,
+static void ovl_idmap_posix_acl(const struct inode *realinode,
struct user_namespace *mnt_userns,
struct posix_acl *acl)
{
@@ -485,6 +487,64 @@ static void ovl_idmap_posix_acl(struct inode *realinode,
}
/*
+ * The @noperm argument is used to skip permission checking and is a temporary
+ * measure. Quoting Miklos from an earlier discussion:
+ *
+ * > So there are two paths to getting an acl:
+ * > 1) permission checking and 2) retrieving the value via getxattr(2).
+ * > This is a similar situation as reading a symlink vs. following it.
+ * > When following a symlink overlayfs always reads the link on the
+ * > underlying fs just as if it was a readlink(2) call, calling
+ * > security_inode_readlink() instead of security_inode_follow_link().
+ * > This is logical: we are reading the link from the underlying storage,
+ * > and following it on overlayfs.
+ * >
+ * > Applying the same logic to acl: we do need to call the
+ * > security_inode_getxattr() on the underlying fs, even if just want to
+ * > check permissions on overlay. This is currently not done, which is an
+ * > inconsistency.
+ * >
+ * > Maybe adding the check to ovl_get_acl() is the right way to go, but
+ * > I'm a little afraid of a performance regression. Will look into that.
+ *
+ * Until we have made a decision allow this helper to take the @noperm
+ * argument. We should hopefully be able to remove it soon.
+ */
+struct posix_acl *ovl_get_acl_path(const struct path *path,
+ const char *acl_name, bool noperm)
+{
+ struct posix_acl *real_acl, *clone;
+ struct user_namespace *mnt_userns;
+ struct inode *realinode = d_inode(path->dentry);
+
+ mnt_userns = mnt_user_ns(path->mnt);
+
+ if (noperm)
+ real_acl = get_inode_acl(realinode, posix_acl_type(acl_name));
+ else
+ real_acl = vfs_get_acl(mnt_userns, path->dentry, acl_name);
+ if (IS_ERR_OR_NULL(real_acl))
+ return real_acl;
+
+ if (!is_idmapped_mnt(path->mnt))
+ return real_acl;
+
+ /*
+ * We cannot alter the ACLs returned from the relevant layer as that
+ * would alter the cached values filesystem wide for the lower
+ * filesystem. Instead we can clone the ACLs and then apply the
+ * relevant idmapping of the layer.
+ */
+ clone = posix_acl_clone(real_acl, GFP_KERNEL);
+ posix_acl_release(real_acl); /* release original acl */
+ if (!clone)
+ return ERR_PTR(-ENOMEM);
+
+ ovl_idmap_posix_acl(realinode, mnt_userns, clone);
+ return clone;
+}
+
+/*
* When the relevant layer is an idmapped mount we need to take the idmapping
* of the layer into account and translate any ACL_{GROUP,USER} values
* according to the idmapped mount.
@@ -495,10 +555,12 @@ static void ovl_idmap_posix_acl(struct inode *realinode,
*
* This is obviously only relevant when idmapped layers are used.
*/
-struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu)
+struct posix_acl *do_ovl_get_acl(struct user_namespace *mnt_userns,
+ struct inode *inode, int type,
+ bool rcu, bool noperm)
{
struct inode *realinode = ovl_inode_real(inode);
- struct posix_acl *acl, *clone;
+ struct posix_acl *acl;
struct path realpath;
if (!IS_POSIXACL(realinode))
@@ -512,40 +574,115 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu)
}
if (rcu) {
+ /*
+ * If the layer is idmapped drop out of RCU path walk
+ * so we can clone the ACLs.
+ */
+ if (is_idmapped_mnt(realpath.mnt))
+ return ERR_PTR(-ECHILD);
+
acl = get_cached_acl_rcu(realinode, type);
} else {
const struct cred *old_cred;
old_cred = ovl_override_creds(inode->i_sb);
- acl = get_acl(realinode, type);
+ acl = ovl_get_acl_path(&realpath, posix_acl_xattr_name(type), noperm);
revert_creds(old_cred);
}
- /*
- * If there are no POSIX ACLs, or we encountered an error,
- * or the layer isn't idmapped we don't need to do anything.
- */
- if (!is_idmapped_mnt(realpath.mnt) || IS_ERR_OR_NULL(acl))
- return acl;
+
+ return acl;
+}
+
+static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode,
+ struct posix_acl *acl, int type)
+{
+ int err;
+ struct path realpath;
+ const char *acl_name;
+ const struct cred *old_cred;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ struct dentry *upperdentry = ovl_dentry_upper(dentry);
+ struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);
+
+ err = ovl_want_write(dentry);
+ if (err)
+ return err;
/*
- * We only get here if the layer is idmapped. So drop out of RCU path
- * walk so we can clone the ACLs. There's no need to release the ACLs
- * since get_cached_acl_rcu() doesn't take a reference on the ACLs.
+ * If ACL is to be removed from a lower file, check if it exists in
+ * the first place before copying it up.
*/
- if (rcu)
- return ERR_PTR(-ECHILD);
+ acl_name = posix_acl_xattr_name(type);
+ if (!acl && !upperdentry) {
+ struct posix_acl *real_acl;
- clone = posix_acl_clone(acl, GFP_KERNEL);
- if (!clone)
- clone = ERR_PTR(-ENOMEM);
+ ovl_path_lower(dentry, &realpath);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ real_acl = vfs_get_acl(mnt_user_ns(realpath.mnt), realdentry,
+ acl_name);
+ revert_creds(old_cred);
+ if (IS_ERR(real_acl)) {
+ err = PTR_ERR(real_acl);
+ goto out_drop_write;
+ }
+ posix_acl_release(real_acl);
+ }
+
+ if (!upperdentry) {
+ err = ovl_copy_up(dentry);
+ if (err)
+ goto out_drop_write;
+
+ realdentry = ovl_dentry_upper(dentry);
+ }
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ if (acl)
+ err = ovl_do_set_acl(ofs, realdentry, acl_name, acl);
else
- ovl_idmap_posix_acl(realinode, mnt_user_ns(realpath.mnt), clone);
+ err = ovl_do_remove_acl(ofs, realdentry, acl_name);
+ revert_creds(old_cred);
+
+ /* copy c/mtime */
+ ovl_copyattr(inode);
+
+out_drop_write:
+ ovl_drop_write(dentry);
+ return err;
+}
+
+int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct posix_acl *acl, int type)
+{
+ int err;
+ struct inode *inode = d_inode(dentry);
+ struct dentry *workdir = ovl_workdir(dentry);
+ struct inode *realinode = ovl_inode_real(inode);
+
+ if (!IS_POSIXACL(d_inode(workdir)))
+ return -EOPNOTSUPP;
+ if (!realinode->i_op->set_acl)
+ return -EOPNOTSUPP;
+ if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+ return acl ? -EACCES : 0;
+ if (!inode_owner_or_capable(&init_user_ns, inode))
+ return -EPERM;
+
/*
- * Since we're not in RCU path walk we always need to release the
- * original ACLs.
+ * Check if sgid bit needs to be cleared (actual setacl operation will
+ * be done with mounter's capabilities and so that won't do it for us).
*/
- posix_acl_release(acl);
- return clone;
+ if (unlikely(inode->i_mode & S_ISGID) && type == ACL_TYPE_ACCESS &&
+ !in_group_p(inode->i_gid) &&
+ !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) {
+ struct iattr iattr = { .ia_valid = ATTR_KILL_SGID };
+
+ err = ovl_setattr(&init_user_ns, dentry, &iattr);
+ if (err)
+ return err;
+ }
+
+ return ovl_set_or_remove_acl(dentry, inode, acl, type);
}
#endif
@@ -588,7 +725,7 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* Introducing security_inode_fileattr_get/set() hooks would solve this issue
* properly.
*/
-static int ovl_security_fileattr(struct path *realpath, struct fileattr *fa,
+static int ovl_security_fileattr(const struct path *realpath, struct fileattr *fa,
bool set)
{
struct file *file;
@@ -610,7 +747,7 @@ static int ovl_security_fileattr(struct path *realpath, struct fileattr *fa,
return err;
}
-int ovl_real_fileattr_set(struct path *realpath, struct fileattr *fa)
+int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa)
{
int err;
@@ -685,7 +822,7 @@ static void ovl_fileattr_prot_flags(struct inode *inode, struct fileattr *fa)
}
}
-int ovl_real_fileattr_get(struct path *realpath, struct fileattr *fa)
+int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa)
{
int err;
@@ -721,7 +858,9 @@ static const struct inode_operations ovl_file_inode_operations = {
.permission = ovl_permission,
.getattr = ovl_getattr,
.listxattr = ovl_listxattr,
+ .get_inode_acl = ovl_get_inode_acl,
.get_acl = ovl_get_acl,
+ .set_acl = ovl_set_acl,
.update_time = ovl_update_time,
.fiemap = ovl_fiemap,
.fileattr_get = ovl_fileattr_get,
@@ -741,7 +880,9 @@ static const struct inode_operations ovl_special_inode_operations = {
.permission = ovl_permission,
.getattr = ovl_getattr,
.listxattr = ovl_listxattr,
+ .get_inode_acl = ovl_get_inode_acl,
.get_acl = ovl_get_acl,
+ .set_acl = ovl_set_acl,
.update_time = ovl_update_time,
};
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 69dc577974f8..46753134533a 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -26,7 +26,7 @@ struct ovl_lookup_data {
bool metacopy;
};
-static int ovl_check_redirect(struct path *path, struct ovl_lookup_data *d,
+static int ovl_check_redirect(const struct path *path, struct ovl_lookup_data *d,
size_t prelen, const char *post)
{
int res;
@@ -194,7 +194,7 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh,
return real;
}
-static bool ovl_is_opaquedir(struct ovl_fs *ofs, struct path *path)
+static bool ovl_is_opaquedir(struct ovl_fs *ofs, const struct path *path)
{
return ovl_path_check_dir_xattr(ofs, path, OVL_XATTR_OPAQUE);
}
@@ -487,7 +487,8 @@ fail:
}
/* Get upper dentry from index */
-struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index)
+struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index,
+ bool connected)
{
struct ovl_fh *fh;
struct dentry *upper;
@@ -499,7 +500,7 @@ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index)
if (IS_ERR_OR_NULL(fh))
return ERR_CAST(fh);
- upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), true);
+ upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), connected);
kfree(fh);
if (IS_ERR_OR_NULL(upper))
@@ -572,7 +573,7 @@ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index)
* directly from the index dentry, but for dir index we first need to
* decode the upper directory.
*/
- upper = ovl_index_upper(ofs, index);
+ upper = ovl_index_upper(ofs, index, false);
if (IS_ERR_OR_NULL(upper)) {
err = PTR_ERR(upper);
/*
@@ -1085,6 +1086,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
.mnt = ovl_upper_mnt(ofs),
};
+ /*
+ * It's safe to assign upperredirect here: the previous
+ * assignment of happens only if upperdentry is non-NULL, and
+ * this one only if upperdentry is NULL.
+ */
upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0);
if (IS_ERR(upperredirect)) {
err = PTR_ERR(upperredirect);
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 87759165d32b..1df7f850ff3b 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -8,6 +8,8 @@
#include <linux/uuid.h>
#include <linux/fs.h>
#include <linux/namei.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
#include "ovl_entry.h"
#undef pr_fmt
@@ -108,7 +110,7 @@ struct ovl_fh {
u8 padding[3]; /* make sure fb.fid is 32bit aligned */
union {
struct ovl_fb fb;
- u8 buf[0];
+ DECLARE_FLEX_ARRAY(u8, buf);
};
} __packed;
@@ -208,7 +210,7 @@ static inline int ovl_do_symlink(struct ovl_fs *ofs,
return err;
}
-static inline ssize_t ovl_do_getxattr(struct path *path, const char *name,
+static inline ssize_t ovl_do_getxattr(const struct path *path, const char *name,
void *value, size_t size)
{
int err, len;
@@ -238,7 +240,7 @@ static inline ssize_t ovl_getxattr_upper(struct ovl_fs *ofs,
}
static inline ssize_t ovl_path_getxattr(struct ovl_fs *ofs,
- struct path *path,
+ const struct path *path,
enum ovl_xattr ox, void *value,
size_t size)
{
@@ -250,7 +252,7 @@ static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry,
size_t size, int flags)
{
int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name,
- (void *)value, size, flags);
+ value, size, flags);
pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n",
dentry, name, min((int)size, 48), value, size, flags, err);
@@ -278,6 +280,18 @@ static inline int ovl_removexattr(struct ovl_fs *ofs, struct dentry *dentry,
return ovl_do_removexattr(ofs, dentry, ovl_xattr(ofs, ox));
}
+static inline int ovl_do_set_acl(struct ovl_fs *ofs, struct dentry *dentry,
+ const char *acl_name, struct posix_acl *acl)
+{
+ return vfs_set_acl(ovl_upper_mnt_userns(ofs), dentry, acl_name, acl);
+}
+
+static inline int ovl_do_remove_acl(struct ovl_fs *ofs, struct dentry *dentry,
+ const char *acl_name)
+{
+ return vfs_remove_acl(ovl_upper_mnt_userns(ofs), dentry, acl_name);
+}
+
static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir,
struct dentry *olddentry, struct inode *newdir,
struct dentry *newdentry, unsigned int flags)
@@ -310,14 +324,16 @@ static inline int ovl_do_whiteout(struct ovl_fs *ofs,
return err;
}
-static inline struct dentry *ovl_do_tmpfile(struct ovl_fs *ofs,
- struct dentry *dentry, umode_t mode)
+static inline struct file *ovl_do_tmpfile(struct ovl_fs *ofs,
+ struct dentry *dentry, umode_t mode)
{
- struct dentry *ret = vfs_tmpfile(ovl_upper_mnt_userns(ofs), dentry, mode, 0);
- int err = PTR_ERR_OR_ZERO(ret);
+ struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = dentry };
+ struct file *file = vfs_tmpfile_open(ovl_upper_mnt_userns(ofs), &path, mode,
+ O_LARGEFILE | O_WRONLY, current_cred());
+ int err = PTR_ERR_OR_ZERO(file);
pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err);
- return ret;
+ return file;
}
static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs,
@@ -399,15 +415,15 @@ const char *ovl_dentry_get_redirect(struct dentry *dentry);
void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect);
void ovl_inode_update(struct inode *inode, struct dentry *upperdentry);
void ovl_dir_modified(struct dentry *dentry, bool impurity);
-u64 ovl_dentry_version_get(struct dentry *dentry);
+u64 ovl_inode_version_get(struct inode *inode);
bool ovl_is_whiteout(struct dentry *dentry);
-struct file *ovl_path_open(struct path *path, int flags);
+struct file *ovl_path_open(const struct path *path, int flags);
int ovl_copy_up_start(struct dentry *dentry, int flags);
void ovl_copy_up_end(struct dentry *dentry);
bool ovl_already_copied_up(struct dentry *dentry, int flags);
-bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, struct path *path,
+bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
enum ovl_xattr ox);
-bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, struct path *path);
+bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path);
static inline bool ovl_check_origin_xattr(struct ovl_fs *ofs,
struct dentry *upperdentry)
@@ -430,9 +446,9 @@ bool ovl_need_index(struct dentry *dentry);
int ovl_nlink_start(struct dentry *dentry);
void ovl_nlink_end(struct dentry *dentry);
int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir);
-int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct path *path);
+int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path);
bool ovl_is_metacopy_dentry(struct dentry *dentry);
-char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct path *path, int padding);
+char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding);
int ovl_sync_status(struct ovl_fs *ofs);
static inline void ovl_set_flag(unsigned long flag, struct inode *inode)
@@ -523,7 +539,8 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
enum ovl_xattr ox, struct dentry *real, bool is_upper,
bool set);
-struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index);
+struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index,
+ bool connected);
int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index);
int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin,
struct qstr *name);
@@ -556,7 +573,7 @@ void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper,
struct list_head *list);
void ovl_cache_free(struct list_head *list);
void ovl_dir_cache_free(struct inode *inode);
-int ovl_check_d_type_supported(struct path *realpath);
+int ovl_check_d_type_supported(const struct path *realpath);
int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir,
struct vfsmount *mnt, struct dentry *dentry, int level);
int ovl_indexdir_cleanup(struct ovl_fs *ofs);
@@ -568,9 +585,9 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs);
* lower dir was removed under it and possibly before it was rotated from upper
* to lower layer.
*/
-static inline bool ovl_dir_is_real(struct dentry *dir)
+static inline bool ovl_dir_is_real(struct inode *dir)
{
- return !ovl_test_flag(OVL_WHITEOUTS, d_inode(dir));
+ return !ovl_test_flag(OVL_WHITEOUTS, dir);
}
/* inode.c */
@@ -592,9 +609,33 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
#ifdef CONFIG_FS_POSIX_ACL
-struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu);
+struct posix_acl *do_ovl_get_acl(struct user_namespace *mnt_userns,
+ struct inode *inode, int type,
+ bool rcu, bool noperm);
+static inline struct posix_acl *ovl_get_inode_acl(struct inode *inode, int type,
+ bool rcu)
+{
+ return do_ovl_get_acl(&init_user_ns, inode, type, rcu, true);
+}
+static inline struct posix_acl *ovl_get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, int type)
+{
+ return do_ovl_get_acl(mnt_userns, d_inode(dentry), type, false, false);
+}
+int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct posix_acl *acl, int type);
+struct posix_acl *ovl_get_acl_path(const struct path *path,
+ const char *acl_name, bool noperm);
#else
-#define ovl_get_acl NULL
+#define ovl_get_inode_acl NULL
+#define ovl_get_acl NULL
+#define ovl_set_acl NULL
+static inline struct posix_acl *ovl_get_acl_path(const struct path *path,
+ const char *acl_name,
+ bool noperm)
+{
+ return NULL;
+}
#endif
int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags);
@@ -673,8 +714,8 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
extern const struct file_operations ovl_file_operations;
int __init ovl_aio_request_cache_init(void);
void ovl_aio_request_cache_destroy(void);
-int ovl_real_fileattr_get(struct path *realpath, struct fileattr *fa);
-int ovl_real_fileattr_set(struct path *realpath, struct fileattr *fa);
+int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa);
+int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa);
int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa);
int ovl_fileattr_set(struct user_namespace *mnt_userns,
struct dentry *dentry, struct fileattr *fa);
@@ -683,7 +724,7 @@ int ovl_fileattr_set(struct user_namespace *mnt_userns,
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_with_data(struct dentry *dentry);
int ovl_maybe_copy_up(struct dentry *dentry, int flags);
-int ovl_copy_xattr(struct super_block *sb, struct path *path, struct dentry *new);
+int ovl_copy_xattr(struct super_block *sb, const struct path *path, struct dentry *new);
int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat);
struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real,
bool is_upper);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 78f62cc1797b..8cd2b9947de1 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -170,7 +170,7 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
return p;
}
-static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
+static bool ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
const char *name, int len, u64 ino,
unsigned int d_type)
{
@@ -179,22 +179,22 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
struct ovl_cache_entry *p;
if (ovl_cache_entry_find_link(name, len, &newp, &parent))
- return 0;
+ return true;
p = ovl_cache_entry_new(rdd, name, len, ino, d_type);
if (p == NULL) {
rdd->err = -ENOMEM;
- return -ENOMEM;
+ return false;
}
list_add_tail(&p->l_node, rdd->list);
rb_link_node(&p->node, parent, newp);
rb_insert_color(&p->node, rdd->root);
- return 0;
+ return true;
}
-static int ovl_fill_lowest(struct ovl_readdir_data *rdd,
+static bool ovl_fill_lowest(struct ovl_readdir_data *rdd,
const char *name, int namelen,
loff_t offset, u64 ino, unsigned int d_type)
{
@@ -211,7 +211,7 @@ static int ovl_fill_lowest(struct ovl_readdir_data *rdd,
list_add_tail(&p->l_node, &rdd->middle);
}
- return rdd->err;
+ return rdd->err == 0;
}
void ovl_cache_free(struct list_head *list)
@@ -235,22 +235,22 @@ void ovl_dir_cache_free(struct inode *inode)
}
}
-static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
+static void ovl_cache_put(struct ovl_dir_file *od, struct inode *inode)
{
struct ovl_dir_cache *cache = od->cache;
WARN_ON(cache->refcount <= 0);
cache->refcount--;
if (!cache->refcount) {
- if (ovl_dir_cache(d_inode(dentry)) == cache)
- ovl_set_dir_cache(d_inode(dentry), NULL);
+ if (ovl_dir_cache(inode) == cache)
+ ovl_set_dir_cache(inode, NULL);
ovl_cache_free(&cache->entries);
kfree(cache);
}
}
-static int ovl_fill_merge(struct dir_context *ctx, const char *name,
+static bool ovl_fill_merge(struct dir_context *ctx, const char *name,
int namelen, loff_t offset, u64 ino,
unsigned int d_type)
{
@@ -264,7 +264,7 @@ static int ovl_fill_merge(struct dir_context *ctx, const char *name,
return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type);
}
-static int ovl_check_whiteouts(struct path *path, struct ovl_readdir_data *rdd)
+static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd)
{
int err;
struct ovl_cache_entry *p;
@@ -291,7 +291,7 @@ static int ovl_check_whiteouts(struct path *path, struct ovl_readdir_data *rdd)
return err;
}
-static inline int ovl_dir_read(struct path *realpath,
+static inline int ovl_dir_read(const struct path *realpath,
struct ovl_readdir_data *rdd)
{
struct file *realfile;
@@ -323,15 +323,15 @@ static void ovl_dir_reset(struct file *file)
{
struct ovl_dir_file *od = file->private_data;
struct ovl_dir_cache *cache = od->cache;
- struct dentry *dentry = file->f_path.dentry;
+ struct inode *inode = file_inode(file);
bool is_real;
- if (cache && ovl_dentry_version_get(dentry) != cache->version) {
- ovl_cache_put(od, dentry);
+ if (cache && ovl_inode_version_get(inode) != cache->version) {
+ ovl_cache_put(od, inode);
od->cache = NULL;
od->cursor = NULL;
}
- is_real = ovl_dir_is_real(dentry);
+ is_real = ovl_dir_is_real(inode);
if (od->is_real != is_real) {
/* is_real can only become false when dir is copied up */
if (WARN_ON(is_real))
@@ -394,9 +394,10 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
int res;
struct ovl_dir_cache *cache;
+ struct inode *inode = d_inode(dentry);
- cache = ovl_dir_cache(d_inode(dentry));
- if (cache && ovl_dentry_version_get(dentry) == cache->version) {
+ cache = ovl_dir_cache(inode);
+ if (cache && ovl_inode_version_get(inode) == cache->version) {
WARN_ON(!cache->refcount);
cache->refcount++;
return cache;
@@ -418,8 +419,8 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
return ERR_PTR(res);
}
- cache->version = ovl_dentry_version_get(dentry);
- ovl_set_dir_cache(d_inode(dentry), cache);
+ cache->version = ovl_inode_version_get(inode);
+ ovl_set_dir_cache(inode, cache);
return cache;
}
@@ -455,7 +456,7 @@ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid,
* copy up origin, call vfs_getattr() on the overlay entry to make
* sure that d_ino will be consistent with st_ino from stat(2).
*/
-static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p)
+static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry *p)
{
struct dentry *dir = path->dentry;
@@ -528,7 +529,7 @@ fail:
goto out;
}
-static int ovl_fill_plain(struct dir_context *ctx, const char *name,
+static bool ovl_fill_plain(struct dir_context *ctx, const char *name,
int namelen, loff_t offset, u64 ino,
unsigned int d_type)
{
@@ -540,14 +541,14 @@ static int ovl_fill_plain(struct dir_context *ctx, const char *name,
p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
if (p == NULL) {
rdd->err = -ENOMEM;
- return -ENOMEM;
+ return false;
}
list_add_tail(&p->l_node, rdd->list);
- return 0;
+ return true;
}
-static int ovl_dir_read_impure(struct path *path, struct list_head *list,
+static int ovl_dir_read_impure(const struct path *path, struct list_head *list,
struct rb_root *root)
{
int err;
@@ -592,20 +593,21 @@ static int ovl_dir_read_impure(struct path *path, struct list_head *list,
return 0;
}
-static struct ovl_dir_cache *ovl_cache_get_impure(struct path *path)
+static struct ovl_dir_cache *ovl_cache_get_impure(const struct path *path)
{
int res;
struct dentry *dentry = path->dentry;
+ struct inode *inode = d_inode(dentry);
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct ovl_dir_cache *cache;
- cache = ovl_dir_cache(d_inode(dentry));
- if (cache && ovl_dentry_version_get(dentry) == cache->version)
+ cache = ovl_dir_cache(inode);
+ if (cache && ovl_inode_version_get(inode) == cache->version)
return cache;
/* Impure cache is not refcounted, free it here */
- ovl_dir_cache_free(d_inode(dentry));
- ovl_set_dir_cache(d_inode(dentry), NULL);
+ ovl_dir_cache_free(inode);
+ ovl_set_dir_cache(inode, NULL);
cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
if (!cache)
@@ -627,13 +629,13 @@ static struct ovl_dir_cache *ovl_cache_get_impure(struct path *path)
OVL_XATTR_IMPURE);
ovl_drop_write(dentry);
}
- ovl_clear_flag(OVL_IMPURE, d_inode(dentry));
+ ovl_clear_flag(OVL_IMPURE, inode);
kfree(cache);
return NULL;
}
- cache->version = ovl_dentry_version_get(dentry);
- ovl_set_dir_cache(d_inode(dentry), cache);
+ cache->version = ovl_inode_version_get(inode);
+ ovl_set_dir_cache(inode, cache);
return cache;
}
@@ -648,7 +650,7 @@ struct ovl_readdir_translate {
bool xinowarn;
};
-static int ovl_fill_real(struct dir_context *ctx, const char *name,
+static bool ovl_fill_real(struct dir_context *ctx, const char *name,
int namelen, loff_t offset, u64 ino,
unsigned int d_type)
{
@@ -675,7 +677,7 @@ static int ovl_fill_real(struct dir_context *ctx, const char *name,
static bool ovl_is_impure_dir(struct file *file)
{
struct ovl_dir_file *od = file->private_data;
- struct inode *dir = d_inode(file->f_path.dentry);
+ struct inode *dir = file_inode(file);
/*
* Only upper dir can be impure, but if we are in the middle of
@@ -834,7 +836,7 @@ out_unlock:
}
static struct file *ovl_dir_open_realfile(const struct file *file,
- struct path *realpath)
+ const struct path *realpath)
{
struct file *res;
const struct cred *old_cred;
@@ -893,7 +895,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
struct file *realfile;
int err;
- err = ovl_sync_status(OVL_FS(file->f_path.dentry->d_sb));
+ err = ovl_sync_status(OVL_FS(file_inode(file)->i_sb));
if (err <= 0)
return err;
@@ -913,7 +915,7 @@ static int ovl_dir_release(struct inode *inode, struct file *file)
if (od->cache) {
inode_lock(inode);
- ovl_cache_put(od, file->f_path.dentry);
+ ovl_cache_put(od, inode);
inode_unlock(inode);
}
fput(od->realfile);
@@ -942,7 +944,7 @@ static int ovl_dir_open(struct inode *inode, struct file *file)
return PTR_ERR(realfile);
}
od->realfile = realfile;
- od->is_real = ovl_dir_is_real(file->f_path.dentry);
+ od->is_real = ovl_dir_is_real(inode);
od->is_upper = OVL_TYPE_UPPER(type);
file->private_data = od;
@@ -1027,7 +1029,7 @@ void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper,
inode_unlock(upper->d_inode);
}
-static int ovl_check_d_type(struct dir_context *ctx, const char *name,
+static bool ovl_check_d_type(struct dir_context *ctx, const char *name,
int namelen, loff_t offset, u64 ino,
unsigned int d_type)
{
@@ -1036,19 +1038,19 @@ static int ovl_check_d_type(struct dir_context *ctx, const char *name,
/* Even if d_type is not supported, DT_DIR is returned for . and .. */
if (!strncmp(name, ".", namelen) || !strncmp(name, "..", namelen))
- return 0;
+ return true;
if (d_type != DT_UNKNOWN)
rdd->d_type_supported = true;
- return 0;
+ return true;
}
/*
* Returns 1 if d_type is supported, 0 not supported/unknown. Negative values
* if error is encountered.
*/
-int ovl_check_d_type_supported(struct path *realpath)
+int ovl_check_d_type_supported(const struct path *realpath)
{
int err;
struct ovl_readdir_data rdd = {
@@ -1065,20 +1067,16 @@ int ovl_check_d_type_supported(struct path *realpath)
#define OVL_INCOMPATDIR_NAME "incompat"
-static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, struct path *path,
+static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *path,
int level)
{
int err;
struct inode *dir = path->dentry->d_inode;
LIST_HEAD(list);
- struct rb_root root = RB_ROOT;
struct ovl_cache_entry *p;
struct ovl_readdir_data rdd = {
- .ctx.actor = ovl_fill_merge,
- .dentry = NULL,
+ .ctx.actor = ovl_fill_plain,
.list = &list,
- .root = &root,
- .is_lowest = false,
};
bool incompat = false;
@@ -1159,14 +1157,10 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
struct inode *dir = indexdir->d_inode;
struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = indexdir };
LIST_HEAD(list);
- struct rb_root root = RB_ROOT;
struct ovl_cache_entry *p;
struct ovl_readdir_data rdd = {
- .ctx.actor = ovl_fill_merge,
- .dentry = NULL,
+ .ctx.actor = ovl_fill_plain,
.list = &list,
- .root = &root,
- .is_lowest = false,
};
err = ovl_dir_read(&path, &rdd);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index ec746d447f1b..85b891152a2c 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -15,6 +15,7 @@
#include <linux/seq_file.h>
#include <linux/posix_acl_xattr.h>
#include <linux/exportfs.h>
+#include <linux/file.h>
#include "overlayfs.h"
MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
@@ -138,11 +139,16 @@ static int ovl_dentry_revalidate_common(struct dentry *dentry,
unsigned int flags, bool weak)
{
struct ovl_entry *oe = dentry->d_fsdata;
+ struct inode *inode = d_inode_rcu(dentry);
struct dentry *upper;
unsigned int i;
int ret = 1;
- upper = ovl_dentry_upper(dentry);
+ /* Careful in RCU mode */
+ if (!inode)
+ return -ECHILD;
+
+ upper = ovl_i_dentry_upper(inode);
if (upper)
ret = ovl_revalidate_real(upper, flags, weak);
@@ -812,13 +818,11 @@ retry:
* allowed as upper are limited to "normal" ones, where checking
* for the above two errors is sufficient.
*/
- err = ovl_do_removexattr(ofs, work,
- XATTR_NAME_POSIX_ACL_DEFAULT);
+ err = ovl_do_remove_acl(ofs, work, XATTR_NAME_POSIX_ACL_DEFAULT);
if (err && err != -ENODATA && err != -EOPNOTSUPP)
goto out_dput;
- err = ovl_do_removexattr(ofs, work,
- XATTR_NAME_POSIX_ACL_ACCESS);
+ err = ovl_do_remove_acl(ofs, work, XATTR_NAME_POSIX_ACL_ACCESS);
if (err && err != -ENODATA && err != -EOPNOTSUPP)
goto out_dput;
@@ -908,7 +912,7 @@ static int ovl_mount_dir(const char *name, struct path *path)
return err;
}
-static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs,
+static int ovl_check_namelen(const struct path *path, struct ovl_fs *ofs,
const char *name)
{
struct kstatfs statfs;
@@ -1000,70 +1004,6 @@ static unsigned int ovl_split_lowerdirs(char *str)
return ctr;
}
-static int __maybe_unused
-ovl_posix_acl_xattr_get(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *inode,
- const char *name, void *buffer, size_t size)
-{
- return ovl_xattr_get(dentry, inode, handler->name, buffer, size);
-}
-
-static int __maybe_unused
-ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
- struct user_namespace *mnt_userns,
- struct dentry *dentry, struct inode *inode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- struct dentry *workdir = ovl_workdir(dentry);
- struct inode *realinode = ovl_inode_real(inode);
- struct posix_acl *acl = NULL;
- int err;
-
- /* Check that everything is OK before copy-up */
- if (value) {
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- }
- err = -EOPNOTSUPP;
- if (!IS_POSIXACL(d_inode(workdir)))
- goto out_acl_release;
- if (!realinode->i_op->set_acl)
- goto out_acl_release;
- if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) {
- err = acl ? -EACCES : 0;
- goto out_acl_release;
- }
- err = -EPERM;
- if (!inode_owner_or_capable(&init_user_ns, inode))
- goto out_acl_release;
-
- posix_acl_release(acl);
-
- /*
- * Check if sgid bit needs to be cleared (actual setacl operation will
- * be done with mounter's capabilities and so that won't do it for us).
- */
- if (unlikely(inode->i_mode & S_ISGID) &&
- handler->flags == ACL_TYPE_ACCESS &&
- !in_group_p(inode->i_gid) &&
- !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) {
- struct iattr iattr = { .ia_valid = ATTR_KILL_SGID };
-
- err = ovl_setattr(&init_user_ns, dentry, &iattr);
- if (err)
- return err;
- }
-
- err = ovl_xattr_set(dentry, inode, handler->name, value, size, flags);
- return err;
-
-out_acl_release:
- posix_acl_release(acl);
- return err;
-}
-
static int ovl_own_xattr_get(const struct xattr_handler *handler,
struct dentry *dentry, struct inode *inode,
const char *name, void *buffer, size_t size)
@@ -1096,22 +1036,6 @@ static int ovl_other_xattr_set(const struct xattr_handler *handler,
return ovl_xattr_set(dentry, inode, name, value, size, flags);
}
-static const struct xattr_handler __maybe_unused
-ovl_posix_acl_access_xattr_handler = {
- .name = XATTR_NAME_POSIX_ACL_ACCESS,
- .flags = ACL_TYPE_ACCESS,
- .get = ovl_posix_acl_xattr_get,
- .set = ovl_posix_acl_xattr_set,
-};
-
-static const struct xattr_handler __maybe_unused
-ovl_posix_acl_default_xattr_handler = {
- .name = XATTR_NAME_POSIX_ACL_DEFAULT,
- .flags = ACL_TYPE_DEFAULT,
- .get = ovl_posix_acl_xattr_get,
- .set = ovl_posix_acl_xattr_set,
-};
-
static const struct xattr_handler ovl_own_trusted_xattr_handler = {
.prefix = OVL_XATTR_TRUSTED_PREFIX,
.get = ovl_own_xattr_get,
@@ -1132,8 +1056,8 @@ static const struct xattr_handler ovl_other_xattr_handler = {
static const struct xattr_handler *ovl_trusted_xattr_handlers[] = {
#ifdef CONFIG_FS_POSIX_ACL
- &ovl_posix_acl_access_xattr_handler,
- &ovl_posix_acl_default_xattr_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
&ovl_own_trusted_xattr_handler,
&ovl_other_xattr_handler,
@@ -1142,8 +1066,8 @@ static const struct xattr_handler *ovl_trusted_xattr_handlers[] = {
static const struct xattr_handler *ovl_user_xattr_handlers[] = {
#ifdef CONFIG_FS_POSIX_ACL
- &ovl_posix_acl_access_xattr_handler,
- &ovl_posix_acl_default_xattr_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
&ovl_own_user_xattr_handler,
&ovl_other_xattr_handler,
@@ -1353,10 +1277,11 @@ static int ovl_create_volatile_dirty(struct ovl_fs *ofs)
}
static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
- struct path *workpath)
+ const struct path *workpath)
{
struct vfsmount *mnt = ovl_upper_mnt(ofs);
- struct dentry *temp, *workdir;
+ struct dentry *workdir;
+ struct file *tmpfile;
bool rename_whiteout;
bool d_type;
int fh_type;
@@ -1392,10 +1317,10 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
pr_warn("upper fs needs to support d_type.\n");
/* Check if upper/work fs supports O_TMPFILE */
- temp = ovl_do_tmpfile(ofs, ofs->workdir, S_IFREG | 0);
- ofs->tmpfile = !IS_ERR(temp);
+ tmpfile = ovl_do_tmpfile(ofs, ofs->workdir, S_IFREG | 0);
+ ofs->tmpfile = !IS_ERR(tmpfile);
if (ofs->tmpfile)
- dput(temp);
+ fput(tmpfile);
else
pr_warn("upper fs does not support tmpfile.\n");
@@ -1482,7 +1407,7 @@ out:
}
static int ovl_get_workdir(struct super_block *sb, struct ovl_fs *ofs,
- struct path *upperpath)
+ const struct path *upperpath)
{
int err;
struct path workpath = { };
@@ -1525,7 +1450,7 @@ out:
}
static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
- struct ovl_entry *oe, struct path *upperpath)
+ struct ovl_entry *oe, const struct path *upperpath)
{
struct vfsmount *mnt = ovl_upper_mnt(ofs);
struct dentry *indexdir;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 87f811c089e4..bde291623c8c 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -463,7 +463,7 @@ static void ovl_dir_version_inc(struct dentry *dentry, bool impurity)
* which have been copied up and have origins), so only need to note
* changes to impure entries.
*/
- if (!ovl_dir_is_real(dentry) || impurity)
+ if (!ovl_dir_is_real(inode) || impurity)
OVL_I(inode)->version++;
}
@@ -475,10 +475,8 @@ void ovl_dir_modified(struct dentry *dentry, bool impurity)
ovl_dir_version_inc(dentry, impurity);
}
-u64 ovl_dentry_version_get(struct dentry *dentry)
+u64 ovl_inode_version_get(struct inode *inode)
{
- struct inode *inode = d_inode(dentry);
-
WARN_ON(!inode_is_locked(inode));
return OVL_I(inode)->version;
}
@@ -490,7 +488,7 @@ bool ovl_is_whiteout(struct dentry *dentry)
return inode && IS_WHITEOUT(inode);
}
-struct file *ovl_path_open(struct path *path, int flags)
+struct file *ovl_path_open(const struct path *path, int flags)
{
struct inode *inode = d_inode(path->dentry);
struct user_namespace *real_mnt_userns = mnt_user_ns(path->mnt);
@@ -578,7 +576,7 @@ void ovl_copy_up_end(struct dentry *dentry)
ovl_inode_unlock(d_inode(dentry));
}
-bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, struct path *path)
+bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path)
{
int res;
@@ -591,7 +589,7 @@ bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, struct path *path)
return false;
}
-bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, struct path *path,
+bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
enum ovl_xattr ox)
{
int res;
@@ -971,7 +969,7 @@ err:
}
/* err < 0, 0 if no metacopy xattr, 1 if metacopy xattr found */
-int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct path *path)
+int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path)
{
int res;
@@ -1015,7 +1013,7 @@ bool ovl_is_metacopy_dentry(struct dentry *dentry)
return (oe->numlower > 1);
}
-char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct path *path, int padding)
+char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding)
{
int res;
char *s, *next, *buf = NULL;
@@ -1104,13 +1102,18 @@ void ovl_copyattr(struct inode *inode)
struct path realpath;
struct inode *realinode;
struct user_namespace *real_mnt_userns;
+ vfsuid_t vfsuid;
+ vfsgid_t vfsgid;
ovl_i_path_real(inode, &realpath);
realinode = d_inode(realpath.dentry);
real_mnt_userns = mnt_user_ns(realpath.mnt);
- inode->i_uid = i_uid_into_mnt(real_mnt_userns, realinode);
- inode->i_gid = i_gid_into_mnt(real_mnt_userns, realinode);
+ vfsuid = i_uid_into_vfsuid(real_mnt_userns, realinode);
+ vfsgid = i_gid_into_vfsgid(real_mnt_userns, realinode);
+
+ inode->i_uid = vfsuid_into_kuid(vfsuid);
+ inode->i_gid = vfsgid_into_kgid(vfsgid);
inode->i_mode = realinode->i_mode;
inode->i_atime = realinode->i_atime;
inode->i_mtime = realinode->i_mtime;
diff --git a/fs/pipe.c b/fs/pipe.c
index 74ae9fafd25a..42c7ff41c2db 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -860,7 +860,7 @@ static struct vfsmount *pipe_mnt __read_mostly;
*/
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
- return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
+ return dynamic_dname(buffer, buflen, "pipe:[%lu]",
d_inode(dentry)->i_ino);
}
diff --git a/fs/pnode.c b/fs/pnode.c
index 1106137c747a..468e4e65a615 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -244,7 +244,7 @@ static int propagate_one(struct mount *m)
}
do {
struct mount *parent = last_source->mnt_parent;
- if (last_source == first_source)
+ if (peers(last_source, first_source))
break;
done = parent->mnt_master == p;
if (done && peers(n, parent))
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 5af33800743e..d7bc81fc0840 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -24,6 +24,12 @@
#include <linux/user_namespace.h>
#include <linux/namei.h>
#include <linux/mnt_idmapping.h>
+#include <linux/iversion.h>
+#include <linux/security.h>
+#include <linux/evm.h>
+#include <linux/fsnotify.h>
+
+#include "internal.h"
static struct posix_acl **acl_by_type(struct inode *inode, int type)
{
@@ -63,7 +69,7 @@ struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type)
if (acl == ACL_DONT_CACHE) {
struct posix_acl *ret;
- ret = inode->i_op->get_acl(inode, type, LOOKUP_RCU);
+ ret = inode->i_op->get_inode_acl(inode, type, LOOKUP_RCU);
if (!IS_ERR(ret))
acl = ret;
}
@@ -105,15 +111,17 @@ void forget_all_cached_acls(struct inode *inode)
}
EXPORT_SYMBOL(forget_all_cached_acls);
-struct posix_acl *get_acl(struct inode *inode, int type)
+static struct posix_acl *__get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct inode *inode,
+ int type)
{
- void *sentinel;
+ struct posix_acl *sentinel;
struct posix_acl **p;
struct posix_acl *acl;
/*
* The sentinel is used to detect when another operation like
- * set_cached_acl() or forget_cached_acl() races with get_acl().
+ * set_cached_acl() or forget_cached_acl() races with get_inode_acl().
* It is guaranteed that is_uncached_acl(sentinel) is true.
*/
@@ -132,25 +140,27 @@ struct posix_acl *get_acl(struct inode *inode, int type)
* current value of the ACL will not be ACL_NOT_CACHED and so our own
* sentinel will not be set; another task will update the cache. We
* could wait for that other task to complete its job, but it's easier
- * to just call ->get_acl to fetch the ACL ourself. (This is going to
- * be an unlikely race.)
+ * to just call ->get_inode_acl to fetch the ACL ourself. (This is
+ * going to be an unlikely race.)
*/
cmpxchg(p, ACL_NOT_CACHED, sentinel);
/*
- * Normally, the ACL returned by ->get_acl will be cached.
+ * Normally, the ACL returned by ->get{_inode}_acl will be cached.
* A filesystem can prevent that by calling
- * forget_cached_acl(inode, type) in ->get_acl.
+ * forget_cached_acl(inode, type) in ->get{_inode}_acl.
*
- * If the filesystem doesn't have a get_acl() function at all, we'll
- * just create the negative cache entry.
+ * If the filesystem doesn't have a get{_inode}_ acl() function at all,
+ * we'll just create the negative cache entry.
*/
- if (!inode->i_op->get_acl) {
+ if (dentry && inode->i_op->get_acl) {
+ acl = inode->i_op->get_acl(mnt_userns, dentry, type);
+ } else if (inode->i_op->get_inode_acl) {
+ acl = inode->i_op->get_inode_acl(inode, type, false);
+ } else {
set_cached_acl(inode, type, NULL);
return NULL;
}
- acl = inode->i_op->get_acl(inode, type, false);
-
if (IS_ERR(acl)) {
/*
* Remove our sentinel so that we don't block future attempts
@@ -168,7 +178,12 @@ struct posix_acl *get_acl(struct inode *inode, int type)
posix_acl_release(acl);
return acl;
}
-EXPORT_SYMBOL(get_acl);
+
+struct posix_acl *get_inode_acl(struct inode *inode, int type)
+{
+ return __get_acl(&init_user_ns, NULL, inode, type);
+}
+EXPORT_SYMBOL(get_inode_acl);
/*
* Init a fresh posix_acl
@@ -577,19 +592,20 @@ EXPORT_SYMBOL(__posix_acl_chmod);
* posix_acl_chmod - chmod a posix acl
*
* @mnt_userns: user namespace of the mount @inode was found from
- * @inode: inode to check permissions on
+ * @dentry: dentry to check permissions on
* @mode: the new mode of @inode
*
- * If the inode has been found through an idmapped mount the user namespace of
+ * If the dentry has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then
* take care to map the inode according to @mnt_userns before checking
* permissions. On non-idmapped mounts or if permission checking is to be
* performed on the raw inode simply passs init_user_ns.
*/
int
- posix_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode,
+ posix_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry,
umode_t mode)
{
+ struct inode *inode = d_inode(dentry);
struct posix_acl *acl;
int ret = 0;
@@ -598,7 +614,7 @@ int
if (!inode->i_op->set_acl)
return -EOPNOTSUPP;
- acl = get_acl(inode, ACL_TYPE_ACCESS);
+ acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
if (IS_ERR_OR_NULL(acl)) {
if (acl == ERR_PTR(-EOPNOTSUPP))
return 0;
@@ -608,7 +624,7 @@ int
ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
if (ret)
return ret;
- ret = inode->i_op->set_acl(mnt_userns, inode, acl, ACL_TYPE_ACCESS);
+ ret = inode->i_op->set_acl(mnt_userns, dentry, acl, ACL_TYPE_ACCESS);
posix_acl_release(acl);
return ret;
}
@@ -628,7 +644,7 @@ posix_acl_create(struct inode *dir, umode_t *mode,
if (S_ISLNK(*mode) || !IS_POSIXACL(dir))
return 0;
- p = get_acl(dir, ACL_TYPE_DEFAULT);
+ p = get_inode_acl(dir, ACL_TYPE_DEFAULT);
if (!p || p == ERR_PTR(-EOPNOTSUPP)) {
*mode &= ~current_umask();
return 0;
@@ -710,9 +726,9 @@ EXPORT_SYMBOL(posix_acl_update_mode);
/*
* Fix up the uids and gids in posix acl extended attributes in place.
*/
-static int posix_acl_fix_xattr_common(void *value, size_t size)
+static int posix_acl_fix_xattr_common(const void *value, size_t size)
{
- struct posix_acl_xattr_header *header = value;
+ const struct posix_acl_xattr_header *header = value;
int count;
if (!header)
@@ -720,149 +736,43 @@ static int posix_acl_fix_xattr_common(void *value, size_t size)
if (size < sizeof(struct posix_acl_xattr_header))
return -EINVAL;
if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
- return -EINVAL;
+ return -EOPNOTSUPP;
count = posix_acl_xattr_count(size);
if (count < 0)
return -EINVAL;
if (count == 0)
- return -EINVAL;
+ return 0;
return count;
}
-void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns,
- const struct inode *inode,
- void *value, size_t size)
-{
- struct posix_acl_xattr_header *header = value;
- struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
- struct user_namespace *fs_userns = i_user_ns(inode);
- int count;
- vfsuid_t vfsuid;
- vfsgid_t vfsgid;
- kuid_t uid;
- kgid_t gid;
-
- if (no_idmapping(mnt_userns, i_user_ns(inode)))
- return;
-
- count = posix_acl_fix_xattr_common(value, size);
- if (count < 0)
- return;
-
- for (end = entry + count; entry != end; entry++) {
- switch (le16_to_cpu(entry->e_tag)) {
- case ACL_USER:
- uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id));
- vfsuid = make_vfsuid(mnt_userns, fs_userns, uid);
- entry->e_id = cpu_to_le32(from_kuid(&init_user_ns,
- vfsuid_into_kuid(vfsuid)));
- break;
- case ACL_GROUP:
- gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id));
- vfsgid = make_vfsgid(mnt_userns, fs_userns, gid);
- entry->e_id = cpu_to_le32(from_kgid(&init_user_ns,
- vfsgid_into_kgid(vfsgid)));
- break;
- default:
- break;
- }
- }
-}
-
-void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns,
- const struct inode *inode,
- void *value, size_t size)
-{
- struct posix_acl_xattr_header *header = value;
- struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
- struct user_namespace *fs_userns = i_user_ns(inode);
- int count;
- vfsuid_t vfsuid;
- vfsgid_t vfsgid;
- kuid_t uid;
- kgid_t gid;
-
- if (no_idmapping(mnt_userns, i_user_ns(inode)))
- return;
-
- count = posix_acl_fix_xattr_common(value, size);
- if (count < 0)
- return;
-
- for (end = entry + count; entry != end; entry++) {
- switch (le16_to_cpu(entry->e_tag)) {
- case ACL_USER:
- uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id));
- vfsuid = VFSUIDT_INIT(uid);
- uid = from_vfsuid(mnt_userns, fs_userns, vfsuid);
- entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, uid));
- break;
- case ACL_GROUP:
- gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id));
- vfsgid = VFSGIDT_INIT(gid);
- gid = from_vfsgid(mnt_userns, fs_userns, vfsgid);
- entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, gid));
- break;
- default:
- break;
- }
- }
-}
-
-static void posix_acl_fix_xattr_userns(
- struct user_namespace *to, struct user_namespace *from,
- void *value, size_t size)
-{
- struct posix_acl_xattr_header *header = value;
- struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
- int count;
- kuid_t uid;
- kgid_t gid;
-
- count = posix_acl_fix_xattr_common(value, size);
- if (count < 0)
- return;
-
- for (end = entry + count; entry != end; entry++) {
- switch(le16_to_cpu(entry->e_tag)) {
- case ACL_USER:
- uid = make_kuid(from, le32_to_cpu(entry->e_id));
- entry->e_id = cpu_to_le32(from_kuid(to, uid));
- break;
- case ACL_GROUP:
- gid = make_kgid(from, le32_to_cpu(entry->e_id));
- entry->e_id = cpu_to_le32(from_kgid(to, gid));
- break;
- default:
- break;
- }
- }
-}
-
-void posix_acl_fix_xattr_from_user(void *value, size_t size)
-{
- struct user_namespace *user_ns = current_user_ns();
- if (user_ns == &init_user_ns)
- return;
- posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size);
-}
-
-void posix_acl_fix_xattr_to_user(void *value, size_t size)
-{
- struct user_namespace *user_ns = current_user_ns();
- if (user_ns == &init_user_ns)
- return;
- posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size);
-}
-
-/*
- * Convert from extended attribute to in-memory representation.
+/**
+ * posix_acl_from_xattr - convert POSIX ACLs from backing store to VFS format
+ * @userns: the filesystem's idmapping
+ * @value: the uapi representation of POSIX ACLs
+ * @size: the size of @void
+ *
+ * Filesystems that store POSIX ACLs in the unaltered uapi format should use
+ * posix_acl_from_xattr() when reading them from the backing store and
+ * converting them into the struct posix_acl VFS format. The helper is
+ * specifically intended to be called from the acl inode operation.
+ *
+ * The posix_acl_from_xattr() function will map the raw {g,u}id values stored
+ * in ACL_{GROUP,USER} entries into idmapping in @userns.
+ *
+ * Note that posix_acl_from_xattr() does not take idmapped mounts into account.
+ * If it did it calling it from the get acl inode operation would return POSIX
+ * ACLs mapped according to an idmapped mount which would mean that the value
+ * couldn't be cached for the filesystem. Idmapped mounts are taken into
+ * account on the fly during permission checking or right at the VFS -
+ * userspace boundary before reporting them to the user.
+ *
+ * Return: Allocated struct posix_acl on success, NULL for a valid header but
+ * without actual POSIX ACL entries, or ERR_PTR() encoded error code.
*/
-struct posix_acl *
-posix_acl_from_xattr(struct user_namespace *user_ns,
- const void *value, size_t size)
+struct posix_acl *posix_acl_from_xattr(struct user_namespace *userns,
+ const void *value, size_t size)
{
const struct posix_acl_xattr_header *header = value;
const struct posix_acl_xattr_entry *entry = (const void *)(header + 1), *end;
@@ -870,16 +780,9 @@ posix_acl_from_xattr(struct user_namespace *user_ns,
struct posix_acl *acl;
struct posix_acl_entry *acl_e;
- if (!value)
- return NULL;
- if (size < sizeof(struct posix_acl_xattr_header))
- return ERR_PTR(-EINVAL);
- if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
- return ERR_PTR(-EOPNOTSUPP);
-
- count = posix_acl_xattr_count(size);
+ count = posix_acl_fix_xattr_common(value, size);
if (count < 0)
- return ERR_PTR(-EINVAL);
+ return ERR_PTR(count);
if (count == 0)
return NULL;
@@ -900,16 +803,14 @@ posix_acl_from_xattr(struct user_namespace *user_ns,
break;
case ACL_USER:
- acl_e->e_uid =
- make_kuid(user_ns,
- le32_to_cpu(entry->e_id));
+ acl_e->e_uid = make_kuid(userns,
+ le32_to_cpu(entry->e_id));
if (!uid_valid(acl_e->e_uid))
goto fail;
break;
case ACL_GROUP:
- acl_e->e_gid =
- make_kgid(user_ns,
- le32_to_cpu(entry->e_id));
+ acl_e->e_gid = make_kgid(userns,
+ le32_to_cpu(entry->e_id));
if (!gid_valid(acl_e->e_gid))
goto fail;
break;
@@ -968,35 +869,76 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
}
EXPORT_SYMBOL (posix_acl_to_xattr);
-static int
-posix_acl_xattr_get(const struct xattr_handler *handler,
- struct dentry *unused, struct inode *inode,
- const char *name, void *value, size_t size)
-{
- struct posix_acl *acl;
- int error;
+/**
+ * vfs_posix_acl_to_xattr - convert from kernel to userspace representation
+ * @idmap: idmap of the mount
+ * @inode: inode the posix acls are set on
+ * @acl: the posix acls as represented by the vfs
+ * @buffer: the buffer into which to convert @acl
+ * @size: size of @buffer
+ *
+ * This converts @acl from the VFS representation in the filesystem idmapping
+ * to the uapi form reportable to userspace. And mount and caller idmappings
+ * are handled appropriately.
+ *
+ * Return: On success, the size of the stored uapi posix acls, on error a
+ * negative errno.
+ */
+static ssize_t vfs_posix_acl_to_xattr(struct mnt_idmap *idmap,
+ struct inode *inode,
+ const struct posix_acl *acl, void *buffer,
+ size_t size)
- if (!IS_POSIXACL(inode))
- return -EOPNOTSUPP;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
+{
+ struct posix_acl_xattr_header *ext_acl = buffer;
+ struct posix_acl_xattr_entry *ext_entry;
+ struct user_namespace *fs_userns, *caller_userns;
+ struct user_namespace *mnt_userns;
+ ssize_t real_size, n;
+ vfsuid_t vfsuid;
+ vfsgid_t vfsgid;
- acl = get_acl(inode, handler->flags);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
+ real_size = posix_acl_xattr_size(acl->a_count);
+ if (!buffer)
+ return real_size;
+ if (real_size > size)
+ return -ERANGE;
- error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
- posix_acl_release(acl);
+ ext_entry = (void *)(ext_acl + 1);
+ ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
- return error;
+ fs_userns = i_user_ns(inode);
+ caller_userns = current_user_ns();
+ mnt_userns = mnt_idmap_owner(idmap);
+ for (n=0; n < acl->a_count; n++, ext_entry++) {
+ const struct posix_acl_entry *acl_e = &acl->a_entries[n];
+ ext_entry->e_tag = cpu_to_le16(acl_e->e_tag);
+ ext_entry->e_perm = cpu_to_le16(acl_e->e_perm);
+ switch(acl_e->e_tag) {
+ case ACL_USER:
+ vfsuid = make_vfsuid(mnt_userns, fs_userns, acl_e->e_uid);
+ ext_entry->e_id = cpu_to_le32(from_kuid(
+ caller_userns, vfsuid_into_kuid(vfsuid)));
+ break;
+ case ACL_GROUP:
+ vfsgid = make_vfsgid(mnt_userns, fs_userns, acl_e->e_gid);
+ ext_entry->e_id = cpu_to_le32(from_kgid(
+ caller_userns, vfsgid_into_kgid(vfsgid)));
+ break;
+ default:
+ ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
+ break;
+ }
+ }
+ return real_size;
}
int
-set_posix_acl(struct user_namespace *mnt_userns, struct inode *inode,
+set_posix_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
int type, struct posix_acl *acl)
{
+ struct inode *inode = d_inode(dentry);
+
if (!IS_POSIXACL(inode))
return -EOPNOTSUPP;
if (!inode->i_op->set_acl)
@@ -1012,30 +954,10 @@ set_posix_acl(struct user_namespace *mnt_userns, struct inode *inode,
if (ret)
return ret;
}
- return inode->i_op->set_acl(mnt_userns, inode, acl, type);
+ return inode->i_op->set_acl(mnt_userns, dentry, acl, type);
}
EXPORT_SYMBOL(set_posix_acl);
-static int
-posix_acl_xattr_set(const struct xattr_handler *handler,
- struct user_namespace *mnt_userns,
- struct dentry *unused, struct inode *inode,
- const char *name, const void *value, size_t size,
- int flags)
-{
- struct posix_acl *acl = NULL;
- int ret;
-
- if (value) {
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- }
- ret = set_posix_acl(mnt_userns, inode, handler->flags, acl);
- posix_acl_release(acl);
- return ret;
-}
-
static bool
posix_acl_xattr_list(struct dentry *dentry)
{
@@ -1046,8 +968,6 @@ const struct xattr_handler posix_acl_access_xattr_handler = {
.name = XATTR_NAME_POSIX_ACL_ACCESS,
.flags = ACL_TYPE_ACCESS,
.list = posix_acl_xattr_list,
- .get = posix_acl_xattr_get,
- .set = posix_acl_xattr_set,
};
EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler);
@@ -1055,15 +975,14 @@ const struct xattr_handler posix_acl_default_xattr_handler = {
.name = XATTR_NAME_POSIX_ACL_DEFAULT,
.flags = ACL_TYPE_DEFAULT,
.list = posix_acl_xattr_list,
- .get = posix_acl_xattr_get,
- .set = posix_acl_xattr_set,
};
EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler);
-int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int simple_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int error;
+ struct inode *inode = d_inode(dentry);
if (type == ACL_TYPE_ACCESS) {
error = posix_acl_update_mode(mnt_userns, inode,
@@ -1073,6 +992,8 @@ int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
}
inode->i_ctime = current_time(inode);
+ if (IS_I_VERSION(inode))
+ inode_inc_iversion(inode);
set_cached_acl(inode, type, acl);
return 0;
}
@@ -1095,3 +1016,252 @@ int simple_acl_create(struct inode *dir, struct inode *inode)
posix_acl_release(acl);
return 0;
}
+
+static int vfs_set_acl_idmapped_mnt(struct user_namespace *mnt_userns,
+ struct user_namespace *fs_userns,
+ struct posix_acl *acl)
+{
+ for (int n = 0; n < acl->a_count; n++) {
+ struct posix_acl_entry *acl_e = &acl->a_entries[n];
+
+ switch (acl_e->e_tag) {
+ case ACL_USER:
+ acl_e->e_uid = from_vfsuid(mnt_userns, fs_userns,
+ VFSUIDT_INIT(acl_e->e_uid));
+ break;
+ case ACL_GROUP:
+ acl_e->e_gid = from_vfsgid(mnt_userns, fs_userns,
+ VFSGIDT_INIT(acl_e->e_gid));
+ break;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * vfs_set_acl - set posix acls
+ * @mnt_userns: user namespace of the mount
+ * @dentry: the dentry based on which to set the posix acls
+ * @acl_name: the name of the posix acl
+ * @kacl: the posix acls in the appropriate VFS format
+ *
+ * This function sets @kacl. The caller must all posix_acl_release() on @kacl
+ * afterwards.
+ *
+ * Return: On success 0, on error negative errno.
+ */
+int vfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+ const char *acl_name, struct posix_acl *kacl)
+{
+ int acl_type;
+ int error;
+ struct inode *inode = d_inode(dentry);
+ struct inode *delegated_inode = NULL;
+
+ acl_type = posix_acl_type(acl_name);
+ if (acl_type < 0)
+ return -EINVAL;
+
+ if (kacl) {
+ /*
+ * If we're on an idmapped mount translate from mount specific
+ * vfs{g,u}id_t into global filesystem k{g,u}id_t.
+ * Afterwards we can cache the POSIX ACLs filesystem wide and -
+ * if this is a filesystem with a backing store - ultimately
+ * translate them to backing store values.
+ */
+ error = vfs_set_acl_idmapped_mnt(mnt_userns, i_user_ns(inode), kacl);
+ if (error)
+ return error;
+ }
+
+retry_deleg:
+ inode_lock(inode);
+
+ /*
+ * We only care about restrictions the inode struct itself places upon
+ * us otherwise POSIX ACLs aren't subject to any VFS restrictions.
+ */
+ error = may_write_xattr(mnt_userns, inode);
+ if (error)
+ goto out_inode_unlock;
+
+ error = security_inode_set_acl(mnt_userns, dentry, acl_name, kacl);
+ if (error)
+ goto out_inode_unlock;
+
+ error = try_break_deleg(inode, &delegated_inode);
+ if (error)
+ goto out_inode_unlock;
+
+ if (inode->i_opflags & IOP_XATTR)
+ error = set_posix_acl(mnt_userns, dentry, acl_type, kacl);
+ else if (unlikely(is_bad_inode(inode)))
+ error = -EIO;
+ else
+ error = -EOPNOTSUPP;
+ if (!error) {
+ fsnotify_xattr(dentry);
+ evm_inode_post_set_acl(dentry, acl_name, kacl);
+ }
+
+out_inode_unlock:
+ inode_unlock(inode);
+
+ if (delegated_inode) {
+ error = break_deleg_wait(&delegated_inode);
+ if (!error)
+ goto retry_deleg;
+ }
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(vfs_set_acl);
+
+/**
+ * vfs_get_acl - get posix acls
+ * @mnt_userns: user namespace of the mount
+ * @dentry: the dentry based on which to retrieve the posix acls
+ * @acl_name: the name of the posix acl
+ *
+ * This function retrieves @kacl from the filesystem. The caller must all
+ * posix_acl_release() on @kacl.
+ *
+ * Return: On success POSIX ACLs in VFS format, on error negative errno.
+ */
+struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, const char *acl_name)
+{
+ struct inode *inode = d_inode(dentry);
+ struct posix_acl *acl;
+ int acl_type, error;
+
+ acl_type = posix_acl_type(acl_name);
+ if (acl_type < 0)
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * The VFS has no restrictions on reading POSIX ACLs so calling
+ * something like xattr_permission() isn't needed. Only LSMs get a say.
+ */
+ error = security_inode_get_acl(mnt_userns, dentry, acl_name);
+ if (error)
+ return ERR_PTR(error);
+
+ if (!IS_POSIXACL(inode))
+ return ERR_PTR(-EOPNOTSUPP);
+ if (S_ISLNK(inode->i_mode))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ acl = __get_acl(mnt_userns, dentry, inode, acl_type);
+ if (IS_ERR(acl))
+ return acl;
+ if (!acl)
+ return ERR_PTR(-ENODATA);
+
+ return acl;
+}
+EXPORT_SYMBOL_GPL(vfs_get_acl);
+
+/**
+ * vfs_remove_acl - remove posix acls
+ * @mnt_userns: user namespace of the mount
+ * @dentry: the dentry based on which to retrieve the posix acls
+ * @acl_name: the name of the posix acl
+ *
+ * This function removes posix acls.
+ *
+ * Return: On success 0, on error negative errno.
+ */
+int vfs_remove_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+ const char *acl_name)
+{
+ int acl_type;
+ int error;
+ struct inode *inode = d_inode(dentry);
+ struct inode *delegated_inode = NULL;
+
+ acl_type = posix_acl_type(acl_name);
+ if (acl_type < 0)
+ return -EINVAL;
+
+retry_deleg:
+ inode_lock(inode);
+
+ /*
+ * We only care about restrictions the inode struct itself places upon
+ * us otherwise POSIX ACLs aren't subject to any VFS restrictions.
+ */
+ error = may_write_xattr(mnt_userns, inode);
+ if (error)
+ goto out_inode_unlock;
+
+ error = security_inode_remove_acl(mnt_userns, dentry, acl_name);
+ if (error)
+ goto out_inode_unlock;
+
+ error = try_break_deleg(inode, &delegated_inode);
+ if (error)
+ goto out_inode_unlock;
+
+ if (inode->i_opflags & IOP_XATTR)
+ error = set_posix_acl(mnt_userns, dentry, acl_type, NULL);
+ else if (unlikely(is_bad_inode(inode)))
+ error = -EIO;
+ else
+ error = -EOPNOTSUPP;
+ if (!error) {
+ fsnotify_xattr(dentry);
+ evm_inode_post_remove_acl(mnt_userns, dentry, acl_name);
+ }
+
+out_inode_unlock:
+ inode_unlock(inode);
+
+ if (delegated_inode) {
+ error = break_deleg_wait(&delegated_inode);
+ if (!error)
+ goto retry_deleg;
+ }
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(vfs_remove_acl);
+
+int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
+ const char *acl_name, const void *kvalue, size_t size)
+{
+ int error;
+ struct posix_acl *acl = NULL;
+
+ if (size) {
+ /*
+ * Note that posix_acl_from_xattr() uses GFP_NOFS when it
+ * probably doesn't need to here.
+ */
+ acl = posix_acl_from_xattr(current_user_ns(), kvalue, size);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ }
+
+ error = vfs_set_acl(mnt_idmap_owner(idmap), dentry, acl_name, acl);
+ posix_acl_release(acl);
+ return error;
+}
+
+ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry,
+ const char *acl_name, void *kvalue, size_t size)
+{
+ ssize_t error;
+ struct posix_acl *acl;
+
+ acl = vfs_get_acl(mnt_idmap_owner(idmap), dentry, acl_name);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+
+ error = vfs_posix_acl_to_xattr(idmap, d_inode(dentry),
+ acl, kvalue, size);
+ posix_acl_release(acl);
+ return error;
+}
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index c930001056f9..32b1116ae137 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -92,6 +92,7 @@ config PROC_PAGE_MONITOR
config PROC_CHILDREN
bool "Include /proc/<pid>/task/<tid>/children file"
+ depends on PROC_FS
default n
help
Provides a fast way to retrieve first level children pids of a task. See
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 99fcbfda8e25..49283b8103c7 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -279,7 +279,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
collect_sigign_sigcatch(p, &ignored, &caught);
num_threads = get_nr_threads(p);
rcu_read_lock(); /* FIXME: is this correct? */
- qsize = get_ucounts_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING);
+ qsize = get_rlimit_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING);
rcu_read_unlock();
qlim = task_rlimit(p, RLIMIT_SIGPENDING);
unlock_task_sighand(p, &flags);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 93f7e3d971e4..9e479d7d202b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1761,7 +1761,7 @@ out:
return ERR_PTR(error);
}
-static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
+static int do_proc_readlink(const struct path *path, char __user *buffer, int buflen)
{
char *tmp = kmalloc(PATH_MAX, GFP_KERNEL);
char *pathname;
@@ -2350,6 +2350,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
GENRADIX(struct map_files_info) fa;
struct map_files_info *p;
int ret;
+ struct vma_iterator vmi;
genradix_init(&fa);
@@ -2388,7 +2389,9 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
* routine might require mmap_lock taken in might_fault().
*/
- for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+ pos = 2;
+ vma_iter_init(&vmi, mm, 0);
+ for_each_vma(vmi, vma) {
if (!vma->vm_file)
continue;
if (++pos <= ctx->pos)
@@ -2728,7 +2731,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
return -ESRCH;
length = security_getprocattr(task, PROC_I(inode)->op.lsm,
- (char*)file->f_path.dentry->d_name.name,
+ file->f_path.dentry->d_name.name,
&p);
put_task_struct(task);
if (length > 0)
@@ -3196,6 +3199,19 @@ static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace *
return 0;
}
+static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ struct mm_struct *mm;
+
+ mm = get_task_mm(task);
+ if (mm) {
+ seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items);
+ mmput(mm);
+ }
+
+ return 0;
+}
#endif /* CONFIG_KSM */
#ifdef CONFIG_STACKLEAK_METRICS
@@ -3331,6 +3347,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#endif
#ifdef CONFIG_KSM
ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages),
+ ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat),
#endif
};
@@ -3668,6 +3685,7 @@ static const struct pid_entry tid_base_stuff[] = {
#endif
#ifdef CONFIG_KSM
ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages),
+ ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat),
#endif
};
diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c
index fa762c5fbcb2..91fe1597af7b 100644
--- a/fs/proc/cmdline.c
+++ b/fs/proc/cmdline.c
@@ -3,6 +3,7 @@
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include "internal.h"
static int cmdline_proc_show(struct seq_file *m, void *v)
{
@@ -13,7 +14,10 @@ static int cmdline_proc_show(struct seq_file *m, void *v)
static int __init proc_cmdline_init(void)
{
- proc_create_single("cmdline", 0, NULL, cmdline_proc_show);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("cmdline", 0, NULL, cmdline_proc_show);
+ pde->size = saved_command_line_len + 1;
return 0;
}
fs_initcall(proc_cmdline_init);
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index dfe6ce3505ce..e0758fe7936d 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -33,7 +33,16 @@ static int show_console_dev(struct seq_file *m, void *v)
if (con->device) {
const struct tty_driver *driver;
int index;
+
+ /*
+ * Take console_lock to serialize device() callback with
+ * other console operations. For example, fg_console is
+ * modified under console_lock when switching vt.
+ */
+ console_lock();
driver = con->device(con, &index);
+ console_unlock();
+
if (driver) {
dev = MKDEV(driver->major, driver->minor_start);
dev += index;
@@ -63,7 +72,12 @@ static void *c_start(struct seq_file *m, loff_t *pos)
struct console *con;
loff_t off = 0;
- console_lock();
+ /*
+ * Hold the console_list_lock to guarantee safe traversal of the
+ * console list. SRCU cannot be used because there is no
+ * place to store the SRCU cookie.
+ */
+ console_list_lock();
for_each_console(con)
if (off++ == *pos)
break;
@@ -74,13 +88,14 @@ static void *c_start(struct seq_file *m, loff_t *pos)
static void *c_next(struct seq_file *m, void *v, loff_t *pos)
{
struct console *con = v;
+
++*pos;
- return con->next;
+ return hlist_entry_safe(con->node.next, struct console, node);
}
static void c_stop(struct seq_file *m, void *v)
{
- console_unlock();
+ console_list_unlock();
}
static const struct seq_operations consoles_op = {
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index 837971e74109..fe7bfcb7d049 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -4,6 +4,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/blkdev.h>
+#include "internal.h"
static int devinfo_show(struct seq_file *f, void *v)
{
@@ -54,7 +55,10 @@ static const struct seq_operations devinfo_ops = {
static int __init proc_devices_init(void)
{
- proc_create_seq("devices", 0, NULL, &devinfo_ops);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_seq("devices", 0, NULL, &devinfo_ops);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_devices_init);
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 913bef0d2a36..fc46d6fe080c 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -7,6 +7,7 @@
#include <linux/namei.h>
#include <linux/pid.h>
#include <linux/ptrace.h>
+#include <linux/bitmap.h>
#include <linux/security.h>
#include <linux/file.h>
#include <linux/seq_file.h>
@@ -279,6 +280,30 @@ out:
return 0;
}
+static int proc_readfd_count(struct inode *inode, loff_t *count)
+{
+ struct task_struct *p = get_proc_task(inode);
+ struct fdtable *fdt;
+
+ if (!p)
+ return -ENOENT;
+
+ task_lock(p);
+ if (p->files) {
+ rcu_read_lock();
+
+ fdt = files_fdtable(p->files);
+ *count = bitmap_weight(fdt->open_fds, fdt->max_fds);
+
+ rcu_read_unlock();
+ }
+ task_unlock(p);
+
+ put_task_struct(p);
+
+ return 0;
+}
+
static int proc_readfd(struct file *file, struct dir_context *ctx)
{
return proc_readfd_common(file, ctx, proc_fd_instantiate);
@@ -319,9 +344,29 @@ int proc_fd_permission(struct user_namespace *mnt_userns,
return rv;
}
+static int proc_fd_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+ int rv = 0;
+
+ generic_fillattr(&init_user_ns, inode, stat);
+
+ /* If it's a directory, put the number of open fds there */
+ if (S_ISDIR(inode->i_mode)) {
+ rv = proc_readfd_count(inode, &stat->size);
+ if (rv < 0)
+ return rv;
+ }
+
+ return rv;
+}
+
const struct inode_operations proc_fd_inode_operations = {
.lookup = proc_lookupfd,
.permission = proc_fd_permission,
+ .getattr = proc_fd_getattr,
.setattr = proc_setattr,
};
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 06a80f78433d..b701d0207edf 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -79,6 +79,11 @@ static inline bool pde_is_permanent(const struct proc_dir_entry *pde)
return pde->flags & PROC_ENTRY_PERMANENT;
}
+static inline void pde_make_permanent(struct proc_dir_entry *pde)
+{
+ pde->flags |= PROC_ENTRY_PERMANENT;
+}
+
extern struct kmem_cache *proc_dir_entry_cache;
void pde_free(struct proc_dir_entry *pde);
@@ -285,7 +290,7 @@ struct proc_maps_private {
struct task_struct *task;
struct mm_struct *mm;
#ifdef CONFIG_MMU
- struct vm_area_struct *tail_vma;
+ struct vma_iterator iter;
#endif
#ifdef CONFIG_NUMA
struct mempolicy *task_mempolicy;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index dff921f7ca33..71157ee35c1a 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -18,7 +18,6 @@
#include <linux/capability.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
-#include <linux/notifier.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/printk.h>
@@ -541,25 +540,17 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
fallthrough;
case KCORE_VMEMMAP:
case KCORE_TEXT:
- if (kern_addr_valid(start)) {
- /*
- * Using bounce buffer to bypass the
- * hardened user copy kernel text checks.
- */
- if (copy_from_kernel_nofault(buf, (void *)start,
- tsz)) {
- if (clear_user(buffer, tsz)) {
- ret = -EFAULT;
- goto out;
- }
- } else {
- if (copy_to_user(buffer, buf, tsz)) {
- ret = -EFAULT;
- goto out;
- }
+ /*
+ * Using bounce buffer to bypass the
+ * hardened user copy kernel text checks.
+ */
+ if (copy_from_kernel_nofault(buf, (void *)start, tsz)) {
+ if (clear_user(buffer, tsz)) {
+ ret = -EFAULT;
+ goto out;
}
} else {
- if (clear_user(buffer, tsz)) {
+ if (copy_to_user(buffer, buf, tsz)) {
ret = -EFAULT;
goto out;
}
@@ -638,10 +629,6 @@ static int __meminit kcore_callback(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block kcore_callback_nb __meminitdata = {
- .notifier_call = kcore_callback,
- .priority = 0,
-};
static struct kcore_list kcore_vmalloc;
@@ -694,7 +681,7 @@ static int __init proc_kcore_init(void)
add_modules_range();
/* Store direct-map area from physical memory map */
kcore_update_ram();
- register_hotmemory_notifier(&kcore_callback_nb);
+ hotplug_memory_notifier(kcore_callback, DEFAULT_CALLBACK_PRI);
return 0;
}
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index 592e6dc7c110..2fc92a13f9f8 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -17,8 +17,6 @@
#include <asm/io.h>
-extern wait_queue_head_t log_wait;
-
static int kmsg_open(struct inode * inode, struct file * file)
{
return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC);
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index f32878d9a39f..817981e57223 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -9,6 +9,7 @@
#include <linux/seq_file.h>
#include <linux/seqlock.h>
#include <linux/time.h>
+#include "internal.h"
static int loadavg_proc_show(struct seq_file *m, void *v)
{
@@ -27,7 +28,10 @@ static int loadavg_proc_show(struct seq_file *m, void *v)
static int __init proc_loadavg_init(void)
{
- proc_create_single("loadavg", 0, NULL, loadavg_proc_show);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("loadavg", 0, NULL, loadavg_proc_show);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_loadavg_init);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 6e89f0e2fd20..440960110a42 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -115,6 +115,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#endif
show_val_kb(m, "PageTables: ",
global_node_page_state(NR_PAGETABLE));
+ show_val_kb(m, "SecPageTables: ",
+ global_node_page_state(NR_SECONDARY_PAGETABLE));
show_val_kb(m, "NFS_Unstable: ", 0);
show_val_kb(m, "Bounce: ",
@@ -162,7 +164,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
static int __init proc_meminfo_init(void)
{
- proc_create_single("meminfo", 0, NULL, meminfo_proc_show);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("meminfo", 0, NULL, meminfo_proc_show);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_meminfo_init);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index a2873a617ae8..6249c347809a 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -91,6 +91,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
}
static const struct proc_ops kpagecount_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_lseek = mem_lseek,
.proc_read = kpagecount_read,
};
@@ -218,8 +219,9 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2);
u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1);
u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1);
-#ifdef CONFIG_64BIT
+#ifdef CONFIG_ARCH_USES_PG_ARCH_X
u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2);
+ u |= kpf_copy_bit(k, KPF_ARCH_3, PG_arch_3);
#endif
return u;
@@ -268,6 +270,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
}
static const struct proc_ops kpageflags_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_lseek = mem_lseek,
.proc_read = kpageflags_read,
};
@@ -322,6 +325,7 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
}
static const struct proc_ops kpagecgroup_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
.proc_lseek = mem_lseek,
.proc_read = kpagecgroup_read,
};
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 021e83fe831f..48f2d60bd78a 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -28,13 +28,6 @@ static const struct inode_operations proc_sys_inode_operations;
static const struct file_operations proc_sys_dir_file_operations;
static const struct inode_operations proc_sys_dir_operations;
-/* shared constants to be used in various sysctls */
-const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 };
-EXPORT_SYMBOL(sysctl_vals);
-
-const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX };
-EXPORT_SYMBOL_GPL(sysctl_long_vals);
-
/* Support for permanently empty directories */
struct ctl_table sysctl_mount_point[] = {
@@ -1246,7 +1239,7 @@ static bool get_links(struct ctl_dir *dir,
static int insert_links(struct ctl_table_header *head)
{
struct ctl_table_set *root_set = &sysctl_table_root.default_set;
- struct ctl_dir *core_parent = NULL;
+ struct ctl_dir *core_parent;
struct ctl_table_header *links;
int err;
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 12901dcf57e2..f4616083faef 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -3,6 +3,7 @@
#include <linux/kernel_stat.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include "internal.h"
/*
* /proc/softirqs ... display the number of softirqs
@@ -27,7 +28,10 @@ static int show_softirqs(struct seq_file *p, void *v)
static int __init proc_softirqs_init(void)
{
- proc_create_single("softirqs", 0, NULL, show_softirqs);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("softirqs", 0, NULL, show_softirqs);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_softirqs_init);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4e0023643f8b..e35a0398db63 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
-#include <linux/vmacache.h>
#include <linux/mm_inline.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
@@ -124,12 +123,26 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
}
#endif
+static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
+ loff_t *ppos)
+{
+ struct vm_area_struct *vma = vma_next(&priv->iter);
+
+ if (vma) {
+ *ppos = vma->vm_start;
+ } else {
+ *ppos = -2UL;
+ vma = get_gate_vma(priv->mm);
+ }
+
+ return vma;
+}
+
static void *m_start(struct seq_file *m, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
unsigned long last_addr = *ppos;
struct mm_struct *mm;
- struct vm_area_struct *vma;
/* See m_next(). Zero at the start or after lseek. */
if (last_addr == -1UL)
@@ -153,31 +166,21 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
return ERR_PTR(-EINTR);
}
+ vma_iter_init(&priv->iter, mm, last_addr);
hold_task_mempolicy(priv);
- priv->tail_vma = get_gate_vma(mm);
-
- vma = find_vma(mm, last_addr);
- if (vma)
- return vma;
+ if (last_addr == -2UL)
+ return get_gate_vma(mm);
- return priv->tail_vma;
+ return proc_get_vma(priv, ppos);
}
static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
{
- struct proc_maps_private *priv = m->private;
- struct vm_area_struct *next, *vma = v;
-
- if (vma == priv->tail_vma)
- next = NULL;
- else if (vma->vm_next)
- next = vma->vm_next;
- else
- next = priv->tail_vma;
-
- *ppos = next ? next->vm_start : -1UL;
-
- return next;
+ if (*ppos == -2UL) {
+ *ppos = -1UL;
+ return NULL;
+ }
+ return proc_get_vma(m->private, ppos);
}
static void m_stop(struct seq_file *m, void *v)
@@ -271,6 +274,7 @@ static void show_vma_header_prefix(struct seq_file *m,
static void
show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
{
+ struct anon_vma_name *anon_name = NULL;
struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
vm_flags_t flags = vma->vm_flags;
@@ -290,6 +294,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
start = vma->vm_start;
end = vma->vm_end;
show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
+ if (mm)
+ anon_name = anon_vma_name(vma);
/*
* Print the dentry name for named mappings, and a
@@ -297,7 +303,14 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
*/
if (file) {
seq_pad(m, ' ');
- seq_file_path(m, file, "\n");
+ /*
+ * If user named this anon shared memory via
+ * prctl(PR_SET_VMA ..., use the provided name.
+ */
+ if (anon_name)
+ seq_printf(m, "[anon_shmem:%s]", anon_name->name);
+ else
+ seq_file_path(m, file, "\n");
goto done;
}
@@ -309,8 +322,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
name = arch_vma_name(vma);
if (!name) {
- struct anon_vma_name *anon_name;
-
if (!mm) {
name = "[vdso]";
goto done;
@@ -327,7 +338,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
goto done;
}
- anon_name = anon_vma_name(vma);
if (anon_name) {
seq_pad(m, ' ');
seq_printf(m, "[anon:%s]", anon_name->name);
@@ -664,6 +674,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_RAND_READ)] = "rr",
[ilog2(VM_DONTCOPY)] = "dc",
[ilog2(VM_DONTEXPAND)] = "de",
+ [ilog2(VM_LOCKONFAULT)] = "lf",
[ilog2(VM_ACCOUNT)] = "ac",
[ilog2(VM_NORESERVE)] = "nr",
[ilog2(VM_HUGETLB)] = "ht",
@@ -864,7 +875,7 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss, false);
seq_printf(m, "THPeligible: %d\n",
- hugepage_vma_check(vma, vma->vm_flags, true, false));
+ hugepage_vma_check(vma, vma->vm_flags, true, false, true));
if (arch_pkeys_enabled())
seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
@@ -877,16 +888,16 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
struct mem_size_stats mss;
- struct mm_struct *mm;
+ struct mm_struct *mm = priv->mm;
struct vm_area_struct *vma;
- unsigned long last_vma_end = 0;
+ unsigned long vma_start = 0, last_vma_end = 0;
int ret = 0;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
priv->task = get_proc_task(priv->inode);
if (!priv->task)
return -ESRCH;
- mm = priv->mm;
if (!mm || !mmget_not_zero(mm)) {
ret = -ESRCH;
goto out_put_task;
@@ -899,8 +910,13 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
goto out_put_mm;
hold_task_mempolicy(priv);
+ vma = mas_find(&mas, ULONG_MAX);
+
+ if (unlikely(!vma))
+ goto empty_set;
- for (vma = priv->mm->mmap; vma;) {
+ vma_start = vma->vm_start;
+ do {
smap_gather_stats(vma, &mss, 0);
last_vma_end = vma->vm_end;
@@ -909,6 +925,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
* access it for write request.
*/
if (mmap_lock_is_contended(mm)) {
+ mas_pause(&mas);
mmap_read_unlock(mm);
ret = mmap_read_lock_killable(mm);
if (ret) {
@@ -952,7 +969,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
* contains last_vma_end.
* Iterate VMA' from last_vma_end.
*/
- vma = find_vma(mm, last_vma_end - 1);
+ vma = mas_find(&mas, ULONG_MAX);
/* Case 3 above */
if (!vma)
break;
@@ -966,11 +983,10 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
smap_gather_stats(vma, &mss, last_vma_end);
}
/* Case 2 above */
- vma = vma->vm_next;
- }
+ } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL);
- show_vma_header_prefix(m, priv->mm->mmap->vm_start,
- last_vma_end, 0, 0, 0, 0);
+empty_set:
+ show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0);
seq_pad(m, ' ');
seq_puts(m, "[rollup]\n");
@@ -1263,6 +1279,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
return -ESRCH;
mm = get_task_mm(task);
if (mm) {
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
struct mmu_notifier_range range;
struct clear_refs_private cp = {
.type = type,
@@ -1282,7 +1299,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
}
if (type == CLEAR_REFS_SOFT_DIRTY) {
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_for_each(&mas, vma, ULONG_MAX) {
if (!(vma->vm_flags & VM_SOFTDIRTY))
continue;
vma->vm_flags &= ~VM_SOFTDIRTY;
@@ -1294,8 +1311,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
0, NULL, mm, 0, -1UL);
mmu_notifier_invalidate_range_start(&range);
}
- walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops,
- &cp);
+ walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp);
if (type == CLEAR_REFS_SOFT_DIRTY) {
mmu_notifier_invalidate_range_end(&range);
flush_tlb_mm(mm);
@@ -1418,9 +1434,19 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
if (pte_swp_uffd_wp(pte))
flags |= PM_UFFD_WP;
entry = pte_to_swp_entry(pte);
- if (pm->show_pfn)
+ if (pm->show_pfn) {
+ pgoff_t offset;
+ /*
+ * For PFN swap offsets, keeping the offset field
+ * to be PFN only to be compatible with old smaps.
+ */
+ if (is_pfn_swap_entry(entry))
+ offset = swp_offset_pfn(entry);
+ else
+ offset = swp_offset(entry);
frame = swp_type(entry) |
- (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
+ (offset << MAX_SWAPFILES_SHIFT);
+ }
flags |= PM_SWAP;
migration = is_migration_entry(entry);
if (is_pfn_swap_entry(entry))
@@ -1477,7 +1503,11 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
unsigned long offset;
if (pm->show_pfn) {
- offset = swp_offset(entry) +
+ if (is_pfn_swap_entry(entry))
+ offset = swp_offset_pfn(entry);
+ else
+ offset = swp_offset(entry);
+ offset = offset +
((addr & ~PMD_MASK) >> PAGE_SHIFT);
frame = swp_type(entry) |
(offset << MAX_SWAPFILES_SHIFT);
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index a6d21fc0033c..2fd06f52b6a4 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -20,15 +20,13 @@
*/
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
struct vm_region *region;
- struct rb_node *p;
unsigned long bytes = 0, sbytes = 0, slack = 0, size;
-
- mmap_read_lock(mm);
- for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
- vma = rb_entry(p, struct vm_area_struct, vm_rb);
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma) {
bytes += kobjsize(vma);
region = vma->vm_region;
@@ -82,15 +80,13 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
unsigned long task_vsize(struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
- struct rb_node *p;
unsigned long vsize = 0;
mmap_read_lock(mm);
- for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
- vma = rb_entry(p, struct vm_area_struct, vm_rb);
+ for_each_vma(vmi, vma)
vsize += vma->vm_end - vma->vm_start;
- }
mmap_read_unlock(mm);
return vsize;
}
@@ -99,14 +95,13 @@ unsigned long task_statm(struct mm_struct *mm,
unsigned long *shared, unsigned long *text,
unsigned long *data, unsigned long *resident)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
struct vm_region *region;
- struct rb_node *p;
unsigned long size = kobjsize(mm);
mmap_read_lock(mm);
- for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
- vma = rb_entry(p, struct vm_area_struct, vm_rb);
+ for_each_vma(vmi, vma) {
size += kobjsize(vma);
region = vma->vm_region;
if (region) {
@@ -190,17 +185,19 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
*/
static int show_map(struct seq_file *m, void *_p)
{
- struct rb_node *p = _p;
-
- return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb));
+ return nommu_vma_show(m, _p);
}
static void *m_start(struct seq_file *m, loff_t *pos)
{
struct proc_maps_private *priv = m->private;
struct mm_struct *mm;
- struct rb_node *p;
- loff_t n = *pos;
+ struct vm_area_struct *vma;
+ unsigned long addr = *pos;
+
+ /* See m_next(). Zero at the start or after lseek. */
+ if (addr == -1UL)
+ return NULL;
/* pin the task and mm whilst we play with them */
priv->task = get_proc_task(priv->inode);
@@ -216,10 +213,10 @@ static void *m_start(struct seq_file *m, loff_t *pos)
return ERR_PTR(-EINTR);
}
- /* start from the Nth VMA */
- for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
- if (n-- == 0)
- return p;
+ /* start the next element from addr */
+ vma = find_vma(mm, addr);
+ if (vma)
+ return vma;
mmap_read_unlock(mm);
mmput(mm);
@@ -242,10 +239,10 @@ static void m_stop(struct seq_file *m, void *_vml)
static void *m_next(struct seq_file *m, void *_p, loff_t *pos)
{
- struct rb_node *p = _p;
+ struct vm_area_struct *vma = _p;
- (*pos)++;
- return p ? rb_next(p) : NULL;
+ *pos = vma->vm_end;
+ return find_vma(vma->vm_mm, vma->vm_end);
}
static const struct seq_operations proc_pid_maps_ops = {
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index deb99bc9b7e6..b5343d209381 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -7,6 +7,7 @@
#include <linux/time.h>
#include <linux/time_namespace.h>
#include <linux/kernel_stat.h>
+#include "internal.h"
static int uptime_proc_show(struct seq_file *m, void *v)
{
@@ -39,7 +40,10 @@ static int uptime_proc_show(struct seq_file *m, void *v)
static int __init proc_uptime_init(void)
{
- proc_create_single("uptime", 0, NULL, uptime_proc_show);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("uptime", 0, NULL, uptime_proc_show);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_uptime_init);
diff --git a/fs/proc/version.c b/fs/proc/version.c
index b449f186577f..02e3c3cd4a9a 100644
--- a/fs/proc/version.c
+++ b/fs/proc/version.c
@@ -5,6 +5,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/utsname.h>
+#include "internal.h"
static int version_proc_show(struct seq_file *m, void *v)
{
@@ -17,7 +18,10 @@ static int version_proc_show(struct seq_file *m, void *v)
static int __init proc_version_init(void)
{
- proc_create_single("version", 0, NULL, version_proc_show);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("version", 0, NULL, version_proc_show);
+ pde_make_permanent(pde);
return 0;
}
fs_initcall(proc_version_init);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index f2aa86c421f2..09a81e4b1273 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -199,7 +199,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
struct kvec kvec = { .iov_base = buf, .iov_len = count };
struct iov_iter iter;
- iov_iter_kvec(&iter, READ, &kvec, 1, count);
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count);
return read_from_oldmem(&iter, count, ppos, false);
}
@@ -212,7 +212,7 @@ ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
struct kvec kvec = { .iov_base = buf, .iov_len = count };
struct iov_iter iter;
- iov_iter_kvec(&iter, READ, &kvec, 1, count);
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count);
return read_from_oldmem(&iter, count, ppos,
cc_platform_has(CC_ATTR_MEM_ENCRYPT));
@@ -437,7 +437,7 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf)
offset = (loff_t) index << PAGE_SHIFT;
kvec.iov_base = page_address(page);
kvec.iov_len = PAGE_SIZE;
- iov_iter_kvec(&iter, READ, &kvec, 1, PAGE_SIZE);
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, PAGE_SIZE);
rc = __read_vmcore(&iter, &offset);
if (rc < 0) {
@@ -1567,6 +1567,7 @@ static int __init vmcore_init(void)
return rc;
rc = parse_crash_elf_headers();
if (rc) {
+ elfcorehdr_free(elfcorehdr_addr);
pr_warn("Kdump: vmcore not initialized\n");
return rc;
}
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index b2fd3c20e7c2..cbc0b468c1ab 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -28,14 +28,11 @@
#include <linux/crypto.h>
#include <linux/string.h>
#include <linux/timer.h>
-#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/jiffies.h>
#include <linux/workqueue.h>
-#include <crypto/acompress.h>
-
#include "internal.h"
/*
@@ -92,9 +89,13 @@ static char *compress =
module_param(compress, charp, 0444);
MODULE_PARM_DESC(compress, "compression to use");
+/* How much of the kernel log to snapshot */
+unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES;
+module_param(kmsg_bytes, ulong, 0444);
+MODULE_PARM_DESC(kmsg_bytes, "amount of kernel log to snapshot (in bytes)");
+
/* Compression parameters */
-static struct crypto_acomp *tfm;
-static struct acomp_req *creq;
+static struct crypto_comp *tfm;
struct pstore_zbackend {
int (*zbufsize)(size_t size);
@@ -104,9 +105,6 @@ struct pstore_zbackend {
static char *big_oops_buf;
static size_t big_oops_buf_sz;
-/* How much of the console log to snapshot */
-unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES;
-
void pstore_set_kmsg_bytes(int bytes)
{
kmsg_bytes = bytes;
@@ -272,21 +270,12 @@ static const struct pstore_zbackend zbackends[] = {
static int pstore_compress(const void *in, void *out,
unsigned int inlen, unsigned int outlen)
{
- struct scatterlist src, dst;
int ret;
if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS))
return -EINVAL;
- sg_init_table(&src, 1);
- sg_set_buf(&src, in, inlen);
-
- sg_init_table(&dst, 1);
- sg_set_buf(&dst, out, outlen);
-
- acomp_request_set_params(creq, &src, &dst, inlen, outlen);
-
- ret = crypto_acomp_compress(creq);
+ ret = crypto_comp_compress(tfm, in, inlen, out, &outlen);
if (ret) {
pr_err("crypto_comp_compress failed, ret = %d!\n", ret);
return ret;
@@ -297,7 +286,7 @@ static int pstore_compress(const void *in, void *out,
static void allocate_buf_for_compression(void)
{
- struct crypto_acomp *acomp;
+ struct crypto_comp *ctx;
int size;
char *buf;
@@ -309,7 +298,7 @@ static void allocate_buf_for_compression(void)
if (!psinfo || tfm)
return;
- if (!crypto_has_acomp(zbackend->name, 0, CRYPTO_ALG_ASYNC)) {
+ if (!crypto_has_comp(zbackend->name, 0, 0)) {
pr_err("Unknown compression: %s\n", zbackend->name);
return;
}
@@ -328,24 +317,16 @@ static void allocate_buf_for_compression(void)
return;
}
- acomp = crypto_alloc_acomp(zbackend->name, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR_OR_NULL(acomp)) {
+ ctx = crypto_alloc_comp(zbackend->name, 0, 0);
+ if (IS_ERR_OR_NULL(ctx)) {
kfree(buf);
pr_err("crypto_alloc_comp('%s') failed: %ld\n", zbackend->name,
- PTR_ERR(acomp));
- return;
- }
-
- creq = acomp_request_alloc(acomp);
- if (!creq) {
- crypto_free_acomp(acomp);
- kfree(buf);
- pr_err("acomp_request_alloc('%s') failed\n", zbackend->name);
+ PTR_ERR(ctx));
return;
}
/* A non-NULL big_oops_buf indicates compression is available. */
- tfm = acomp;
+ tfm = ctx;
big_oops_buf_sz = size;
big_oops_buf = buf;
@@ -355,8 +336,7 @@ static void allocate_buf_for_compression(void)
static void free_buf_for_compression(void)
{
if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && tfm) {
- acomp_request_free(creq);
- crypto_free_acomp(tfm);
+ crypto_free_comp(tfm);
tfm = NULL;
}
kfree(big_oops_buf);
@@ -413,6 +393,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
const char *why;
unsigned int part = 1;
unsigned long flags = 0;
+ int saved_ret = 0;
int ret;
why = kmsg_dump_reason_str(reason);
@@ -483,12 +464,21 @@ static void pstore_dump(struct kmsg_dumper *dumper,
if (ret == 0 && reason == KMSG_DUMP_OOPS) {
pstore_new_entry = 1;
pstore_timer_kick();
+ } else {
+ /* Preserve only the first non-zero returned value. */
+ if (!saved_ret)
+ saved_ret = ret;
}
total += record.size;
part++;
}
spin_unlock_irqrestore(&psinfo->buf_lock, flags);
+
+ if (saved_ret) {
+ pr_err_once("backend (%s) writing error (%d)\n", psinfo->name,
+ saved_ret);
+ }
}
static struct kmsg_dumper pstore_dumper = {
@@ -584,8 +574,9 @@ out:
int pstore_register(struct pstore_info *psi)
{
if (backend && strcmp(backend, psi->name)) {
- pr_warn("ignoring unexpected backend '%s'\n", psi->name);
- return -EPERM;
+ pr_warn("backend '%s' already in use: ignoring '%s'\n",
+ backend, psi->name);
+ return -EBUSY;
}
/* Sanity check flags. */
@@ -684,6 +675,8 @@ void pstore_unregister(struct pstore_info *psi)
psinfo = NULL;
kfree(backend);
backend = NULL;
+
+ pr_info("Unregistered %s as persistent store backend\n", psi->name);
mutex_unlock(&psinfo_lock);
}
EXPORT_SYMBOL_GPL(pstore_unregister);
@@ -693,8 +686,6 @@ static void decompress_record(struct pstore_record *record)
int ret;
int unzipped_len;
char *unzipped, *workspace;
- struct acomp_req *dreq;
- struct scatterlist src, dst;
if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !record->compressed)
return;
@@ -718,30 +709,16 @@ static void decompress_record(struct pstore_record *record)
if (!workspace)
return;
- dreq = acomp_request_alloc(tfm);
- if (!dreq) {
- kfree(workspace);
- return;
- }
-
- sg_init_table(&src, 1);
- sg_set_buf(&src, record->buf, record->size);
-
- sg_init_table(&dst, 1);
- sg_set_buf(&dst, workspace, unzipped_len);
-
- acomp_request_set_params(dreq, &src, &dst, record->size, unzipped_len);
-
/* After decompression "unzipped_len" is almost certainly smaller. */
- ret = crypto_acomp_decompress(dreq);
+ ret = crypto_comp_decompress(tfm, record->buf, record->size,
+ workspace, &unzipped_len);
if (ret) {
- pr_err("crypto_acomp_decompress failed, ret = %d!\n", ret);
+ pr_err("crypto_comp_decompress failed, ret = %d!\n", ret);
kfree(workspace);
return;
}
/* Append ECC notice to decompressed buffer. */
- unzipped_len = dreq->dlen;
memcpy(workspace + unzipped_len, record->buf + record->size,
record->ecc_notice_size);
@@ -749,7 +726,6 @@ static void decompress_record(struct pstore_record *record)
unzipped = kmemdup(workspace, unzipped_len + record->ecc_notice_size,
GFP_KERNEL);
kfree(workspace);
- acomp_request_free(dreq);
if (!unzipped)
return;
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
index d8542ec2f38c..b31c9c72d90b 100644
--- a/fs/pstore/pmsg.c
+++ b/fs/pstore/pmsg.c
@@ -46,7 +46,7 @@ static int pmsg_major;
#undef pr_fmt
#define pr_fmt(fmt) PMSG_NAME ": " fmt
-static char *pmsg_devnode(struct device *dev, umode_t *mode)
+static char *pmsg_devnode(const struct device *dev, umode_t *mode)
{
if (mode)
*mode = 0220;
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index fefe3d391d3a..9a5052431fd3 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -18,10 +18,11 @@
#include <linux/platform_device.h>
#include <linux/slab.h>
#include <linux/compiler.h>
-#include <linux/pstore_ram.h>
#include <linux/of.h>
#include <linux/of_address.h>
+
#include "internal.h"
+#include "ram_internal.h"
#define RAMOOPS_KERNMSG_HDR "===="
#define MIN_MEM_SIZE 4096UL
@@ -451,20 +452,28 @@ static void ramoops_free_przs(struct ramoops_context *cxt)
{
int i;
+ /* Free pmsg PRZ */
+ persistent_ram_free(&cxt->mprz);
+
+ /* Free console PRZ */
+ persistent_ram_free(&cxt->cprz);
+
/* Free dump PRZs */
if (cxt->dprzs) {
for (i = 0; i < cxt->max_dump_cnt; i++)
- persistent_ram_free(cxt->dprzs[i]);
+ persistent_ram_free(&cxt->dprzs[i]);
kfree(cxt->dprzs);
+ cxt->dprzs = NULL;
cxt->max_dump_cnt = 0;
}
/* Free ftrace PRZs */
if (cxt->fprzs) {
for (i = 0; i < cxt->max_ftrace_cnt; i++)
- persistent_ram_free(cxt->fprzs[i]);
+ persistent_ram_free(&cxt->fprzs[i]);
kfree(cxt->fprzs);
+ cxt->fprzs = NULL;
cxt->max_ftrace_cnt = 0;
}
}
@@ -548,9 +557,10 @@ static int ramoops_init_przs(const char *name,
while (i > 0) {
i--;
- persistent_ram_free(prz_ar[i]);
+ persistent_ram_free(&prz_ar[i]);
}
kfree(prz_ar);
+ prz_ar = NULL;
goto fail;
}
*paddr += zone_sz;
@@ -735,6 +745,7 @@ static int ramoops_probe(struct platform_device *pdev)
/* Make sure we didn't get bogus platform data pointer. */
if (!pdata) {
pr_err("NULL platform data\n");
+ err = -EINVAL;
goto fail_out;
}
@@ -742,6 +753,7 @@ static int ramoops_probe(struct platform_device *pdev)
!pdata->ftrace_size && !pdata->pmsg_size)) {
pr_err("The memory size and the record/console size must be "
"non-zero\n");
+ err = -EINVAL;
goto fail_out;
}
@@ -772,12 +784,17 @@ static int ramoops_probe(struct platform_device *pdev)
dump_mem_sz, cxt->record_size,
&cxt->max_dump_cnt, 0, 0);
if (err)
- goto fail_out;
+ goto fail_init;
err = ramoops_init_prz("console", dev, cxt, &cxt->cprz, &paddr,
cxt->console_size, 0);
if (err)
- goto fail_init_cprz;
+ goto fail_init;
+
+ err = ramoops_init_prz("pmsg", dev, cxt, &cxt->mprz, &paddr,
+ cxt->pmsg_size, 0);
+ if (err)
+ goto fail_init;
cxt->max_ftrace_cnt = (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)
? nr_cpu_ids
@@ -788,12 +805,7 @@ static int ramoops_probe(struct platform_device *pdev)
(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)
? PRZ_FLAG_NO_LOCK : 0);
if (err)
- goto fail_init_fprz;
-
- err = ramoops_init_prz("pmsg", dev, cxt, &cxt->mprz, &paddr,
- cxt->pmsg_size, 0);
- if (err)
- goto fail_init_mprz;
+ goto fail_init;
cxt->pstore.data = cxt;
/*
@@ -857,11 +869,7 @@ fail_buf:
kfree(cxt->pstore.buf);
fail_clear:
cxt->pstore.bufsize = 0;
- persistent_ram_free(cxt->mprz);
-fail_init_mprz:
-fail_init_fprz:
- persistent_ram_free(cxt->cprz);
-fail_init_cprz:
+fail_init:
ramoops_free_przs(cxt);
fail_out:
return err;
@@ -876,8 +884,6 @@ static int ramoops_remove(struct platform_device *pdev)
kfree(cxt->pstore.buf);
cxt->pstore.bufsize = 0;
- persistent_ram_free(cxt->mprz);
- persistent_ram_free(cxt->cprz);
ramoops_free_przs(cxt);
return 0;
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index a89e33719fcf..966191d3a5ba 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -13,13 +13,14 @@
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/memblock.h>
-#include <linux/pstore_ram.h>
#include <linux/rslib.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <asm/page.h>
+#include "ram_internal.h"
+
/**
* struct persistent_ram_buffer - persistent circular RAM buffer
*
@@ -439,7 +440,11 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size,
phys_addr_t addr = page_start + i * PAGE_SIZE;
pages[i] = pfn_to_page(addr >> PAGE_SHIFT);
}
- vaddr = vmap(pages, page_count, VM_MAP, prot);
+ /*
+ * VM_IOREMAP used here to bypass this region during vread()
+ * and kmap_atomic() (i.e. kcore) to avoid __va() failures.
+ */
+ vaddr = vmap(pages, page_count, VM_MAP | VM_IOREMAP, prot);
kfree(pages);
/*
@@ -543,8 +548,14 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig,
return 0;
}
-void persistent_ram_free(struct persistent_ram_zone *prz)
+void persistent_ram_free(struct persistent_ram_zone **_prz)
{
+ struct persistent_ram_zone *prz;
+
+ if (!_prz)
+ return;
+
+ prz = *_prz;
if (!prz)
return;
@@ -568,6 +579,7 @@ void persistent_ram_free(struct persistent_ram_zone *prz)
persistent_ram_free_old(prz);
kfree(prz->label);
kfree(prz);
+ *_prz = NULL;
}
struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
@@ -604,6 +616,6 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
return prz;
err:
- persistent_ram_free(prz);
+ persistent_ram_free(&prz);
return ERR_PTR(ret);
}
diff --git a/fs/pstore/ram_internal.h b/fs/pstore/ram_internal.h
new file mode 100644
index 000000000000..5f694698351f
--- /dev/null
+++ b/fs/pstore/ram_internal.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2010 Marco Stornelli <marco.stornelli@gmail.com>
+ * Copyright (C) 2011 Kees Cook <keescook@chromium.org>
+ * Copyright (C) 2011 Google, Inc.
+ */
+
+#include <linux/pstore_ram.h>
+
+/*
+ * Choose whether access to the RAM zone requires locking or not. If a zone
+ * can be written to from different CPUs like with ftrace for example, then
+ * PRZ_FLAG_NO_LOCK is used. For all other cases, locking is required.
+ */
+#define PRZ_FLAG_NO_LOCK BIT(0)
+/*
+ * If a PRZ should only have a single-boot lifetime, this marks it as
+ * getting wiped after its contents get copied out after boot.
+ */
+#define PRZ_FLAG_ZAP_OLD BIT(1)
+
+/**
+ * struct persistent_ram_zone - Details of a persistent RAM zone (PRZ)
+ * used as a pstore backend
+ *
+ * @paddr: physical address of the mapped RAM area
+ * @size: size of mapping
+ * @label: unique name of this PRZ
+ * @type: frontend type for this PRZ
+ * @flags: holds PRZ_FLAGS_* bits
+ *
+ * @buffer_lock:
+ * locks access to @buffer "size" bytes and "start" offset
+ * @buffer:
+ * pointer to actual RAM area managed by this PRZ
+ * @buffer_size:
+ * bytes in @buffer->data (not including any trailing ECC bytes)
+ *
+ * @par_buffer:
+ * pointer into @buffer->data containing ECC bytes for @buffer->data
+ * @par_header:
+ * pointer into @buffer->data containing ECC bytes for @buffer header
+ * (i.e. all fields up to @data)
+ * @rs_decoder:
+ * RSLIB instance for doing ECC calculations
+ * @corrected_bytes:
+ * ECC corrected bytes accounting since boot
+ * @bad_blocks:
+ * ECC uncorrectable bytes accounting since boot
+ * @ecc_info:
+ * ECC configuration details
+ *
+ * @old_log:
+ * saved copy of @buffer->data prior to most recent wipe
+ * @old_log_size:
+ * bytes contained in @old_log
+ *
+ */
+struct persistent_ram_zone {
+ phys_addr_t paddr;
+ size_t size;
+ void *vaddr;
+ char *label;
+ enum pstore_type_id type;
+ u32 flags;
+
+ raw_spinlock_t buffer_lock;
+ struct persistent_ram_buffer *buffer;
+ size_t buffer_size;
+
+ char *par_buffer;
+ char *par_header;
+ struct rs_control *rs_decoder;
+ int corrected_bytes;
+ int bad_blocks;
+ struct persistent_ram_ecc_info ecc_info;
+
+ char *old_log;
+ size_t old_log_size;
+};
+
+struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
+ u32 sig, struct persistent_ram_ecc_info *ecc_info,
+ unsigned int memtype, u32 flags, char *label);
+void persistent_ram_free(struct persistent_ram_zone **_prz);
+void persistent_ram_zap(struct persistent_ram_zone *prz);
+
+int persistent_ram_write(struct persistent_ram_zone *prz, const void *s,
+ unsigned int count);
+int persistent_ram_write_user(struct persistent_ram_zone *prz,
+ const void __user *s, unsigned int count);
+
+void persistent_ram_save_old(struct persistent_ram_zone *prz);
+size_t persistent_ram_old_size(struct persistent_ram_zone *prz);
+void *persistent_ram_old(struct persistent_ram_zone *prz);
+void persistent_ram_free_old(struct persistent_ram_zone *prz);
+ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz,
+ char *str, size_t len);
diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c
index 017d0d4ad329..2770746bb7aa 100644
--- a/fs/pstore/zone.c
+++ b/fs/pstore/zone.c
@@ -761,7 +761,7 @@ static inline int notrace psz_kmsg_write_record(struct psz_context *cxt,
/* avoid destroying old data, allocate a new one */
len = zone->buffer_size + sizeof(*zone->buffer);
zone->oldbuf = zone->buffer;
- zone->buffer = kzalloc(len, GFP_KERNEL);
+ zone->buffer = kzalloc(len, GFP_ATOMIC);
if (!zone->buffer) {
zone->buffer = zone->oldbuf;
return -ENOMEM;
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index b9895afca9d1..85b2fa3b211c 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -470,10 +470,8 @@ out2:
out1:
iput(sbi->inodes);
out:
- if (bh1)
- brelse(bh1);
- if (bh2)
- brelse(bh2);
+ brelse(bh1);
+ brelse(bh2);
outnobh:
kfree(qs);
s->s_fs_info = NULL;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 0427b44bfee5..f27faf5db554 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2324,6 +2324,8 @@ static int vfs_setup_quota_inode(struct inode *inode, int type)
struct super_block *sb = inode->i_sb;
struct quota_info *dqopt = sb_dqopt(sb);
+ if (is_bad_inode(inode))
+ return -EUCLEAN;
if (!S_ISREG(inode->i_mode))
return -EACCES;
if (IS_RDONLY(inode))
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 5f2405994280..0f1493e0f6d0 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -71,6 +71,40 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
return ret;
}
+static inline int do_check_range(struct super_block *sb, const char *val_name,
+ uint val, uint min_val, uint max_val)
+{
+ if (val < min_val || val > max_val) {
+ quota_error(sb, "Getting %s %u out of range %u-%u",
+ val_name, val, min_val, max_val);
+ return -EUCLEAN;
+ }
+
+ return 0;
+}
+
+static int check_dquot_block_header(struct qtree_mem_dqinfo *info,
+ struct qt_disk_dqdbheader *dh)
+{
+ int err = 0;
+
+ err = do_check_range(info->dqi_sb, "dqdh_next_free",
+ le32_to_cpu(dh->dqdh_next_free), 0,
+ info->dqi_blocks - 1);
+ if (err)
+ return err;
+ err = do_check_range(info->dqi_sb, "dqdh_prev_free",
+ le32_to_cpu(dh->dqdh_prev_free), 0,
+ info->dqi_blocks - 1);
+ if (err)
+ return err;
+ err = do_check_range(info->dqi_sb, "dqdh_entries",
+ le16_to_cpu(dh->dqdh_entries), 0,
+ qtree_dqstr_in_blk(info));
+
+ return err;
+}
+
/* Remove empty block from list and return it */
static int get_free_dqblk(struct qtree_mem_dqinfo *info)
{
@@ -85,6 +119,9 @@ static int get_free_dqblk(struct qtree_mem_dqinfo *info)
ret = read_blk(info, blk, buf);
if (ret < 0)
goto out_buf;
+ ret = check_dquot_block_header(info, dh);
+ if (ret)
+ goto out_buf;
info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
}
else {
@@ -232,6 +269,9 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
*err = read_blk(info, blk, buf);
if (*err < 0)
goto out_buf;
+ *err = check_dquot_block_header(info, dh);
+ if (*err)
+ goto out_buf;
} else {
blk = get_free_dqblk(info);
if ((int)blk < 0) {
@@ -313,6 +353,10 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
}
ref = (__le32 *)buf;
newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+ ret = do_check_range(dquot->dq_sb, "block", newblk, 0,
+ info->dqi_blocks - 1);
+ if (ret)
+ goto out_buf;
if (!newblk)
newson = 1;
if (depth == info->dqi_qtree_depth - 1) {
@@ -424,6 +468,9 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
goto out_buf;
}
dh = (struct qt_disk_dqdbheader *)buf;
+ ret = check_dquot_block_header(info, dh);
+ if (ret)
+ goto out_buf;
le16_add_cpu(&dh->dqdh_entries, -1);
if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */
ret = remove_free_dqentry(info, buf, blk);
@@ -480,12 +527,10 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
goto out_buf;
}
newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
- if (newblk < QT_TREEOFF || newblk >= info->dqi_blocks) {
- quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)",
- newblk, info->dqi_blocks);
- ret = -EUCLEAN;
+ ret = do_check_range(dquot->dq_sb, "block", newblk, QT_TREEOFF,
+ info->dqi_blocks - 1);
+ if (ret)
goto out_buf;
- }
if (depth == info->dqi_qtree_depth - 1) {
ret = free_dqentry(info, dquot, newblk);
@@ -586,12 +631,10 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
if (!blk) /* No reference? */
goto out_buf;
- if (blk < QT_TREEOFF || blk >= info->dqi_blocks) {
- quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)",
- blk, info->dqi_blocks);
- ret = -EUCLEAN;
+ ret = do_check_range(dquot->dq_sb, "block", blk, QT_TREEOFF,
+ info->dqi_blocks - 1);
+ if (ret)
goto out_buf;
- }
if (depth < info->dqi_qtree_depth - 1)
ret = find_tree_dqentry(info, dquot, blk, depth+1);
@@ -705,15 +748,21 @@ static int find_next_id(struct qtree_mem_dqinfo *info, qid_t *id,
goto out_buf;
}
for (i = __get_index(info, *id, depth); i < epb; i++) {
- if (ref[i] == cpu_to_le32(0)) {
+ uint blk_no = le32_to_cpu(ref[i]);
+
+ if (blk_no == 0) {
*id += level_inc;
continue;
}
+ ret = do_check_range(info->dqi_sb, "block", blk_no, 0,
+ info->dqi_blocks - 1);
+ if (ret)
+ goto out_buf;
if (depth == info->dqi_qtree_depth - 1) {
ret = 0;
goto out_buf;
}
- ret = find_next_id(info, id, le32_to_cpu(ref[i]), depth + 1);
+ ret = find_next_id(info, id, blk_no, depth + 1);
if (ret != -ENOENT)
break;
}
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index ba3525ccc27e..cb240eac5036 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -203,9 +203,9 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
- unsigned long maxpages, lpages, nr, loop, ret;
+ unsigned long maxpages, lpages, nr_folios, loop, ret, nr_pages, pfn;
struct inode *inode = file_inode(file);
- struct page **pages = NULL, **ptr, *page;
+ struct folio_batch fbatch;
loff_t isize;
/* the mapping mustn't extend beyond the EOF */
@@ -221,31 +221,39 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
goto out;
/* gang-find the pages */
- pages = kcalloc(lpages, sizeof(struct page *), GFP_KERNEL);
- if (!pages)
- goto out_free;
-
- nr = find_get_pages_contig(inode->i_mapping, pgoff, lpages, pages);
- if (nr != lpages)
- goto out_free_pages; /* leave if some pages were missing */
+ folio_batch_init(&fbatch);
+ nr_pages = 0;
+repeat:
+ nr_folios = filemap_get_folios_contig(inode->i_mapping, &pgoff,
+ ULONG_MAX, &fbatch);
+ if (!nr_folios) {
+ ret = -ENOSYS;
+ return ret;
+ }
+ if (ret == -ENOSYS) {
+ ret = (unsigned long) folio_address(fbatch.folios[0]);
+ pfn = folio_pfn(fbatch.folios[0]);
+ }
/* check the pages for physical adjacency */
- ptr = pages;
- page = *ptr++;
- page++;
- for (loop = lpages; loop > 1; loop--)
- if (*ptr++ != page++)
- goto out_free_pages;
+ for (loop = 0; loop < nr_folios; loop++) {
+ if (pfn + nr_pages != folio_pfn(fbatch.folios[loop])) {
+ ret = -ENOSYS;
+ goto out_free; /* leave if not physical adjacent */
+ }
+ nr_pages += folio_nr_pages(fbatch.folios[loop]);
+ if (nr_pages >= lpages)
+ goto out_free; /* successfully found desired pages*/
+ }
+ if (nr_pages < lpages) {
+ folio_batch_release(&fbatch);
+ goto repeat; /* loop if pages are missing */
+ }
/* okay - all conditions fulfilled */
- ret = (unsigned long) page_address(pages[0]);
-out_free_pages:
- ptr = pages;
- for (loop = nr; loop > 0; loop--)
- put_page(*ptr++);
out_free:
- kfree(pages);
+ folio_batch_release(&fbatch);
out:
return ret;
}
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index bc66d0173e33..b3257e852820 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -146,15 +146,15 @@ static int ramfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
}
static int ramfs_tmpfile(struct user_namespace *mnt_userns,
- struct inode *dir, struct dentry *dentry, umode_t mode)
+ struct inode *dir, struct file *file, umode_t mode)
{
struct inode *inode;
inode = ramfs_get_inode(dir->i_sb, dir, mode, 0);
if (!inode)
return -ENOSPC;
- d_tmpfile(dentry, inode);
- return 0;
+ d_tmpfile(file, inode);
+ return finish_open_simple(file, 0);
}
static const struct inode_operations ramfs_dir_inode_operations = {
diff --git a/fs/read_write.c b/fs/read_write.c
index 1a261dcf1778..7a2ff6157eda 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -384,7 +384,7 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = (ppos ? *ppos : 0);
- iov_iter_ubuf(&iter, READ, buf, len);
+ iov_iter_ubuf(&iter, ITER_DEST, buf, len);
ret = call_read_iter(filp, &kiocb, &iter);
BUG_ON(ret == -EIOCBQUEUED);
@@ -424,7 +424,7 @@ ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
init_sync_kiocb(&kiocb, file);
kiocb.ki_pos = pos ? *pos : 0;
- iov_iter_kvec(&iter, READ, &iov, 1, iov.iov_len);
+ iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len);
ret = file->f_op->read_iter(&kiocb, &iter);
if (ret > 0) {
if (pos)
@@ -486,7 +486,7 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = (ppos ? *ppos : 0);
- iov_iter_ubuf(&iter, WRITE, (void __user *)buf, len);
+ iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len);
ret = call_write_iter(filp, &kiocb, &iter);
BUG_ON(ret == -EIOCBQUEUED);
@@ -496,14 +496,9 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
}
/* caller is responsible for file_start_write/file_end_write */
-ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
+ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos)
{
- struct kvec iov = {
- .iov_base = (void *)buf,
- .iov_len = min_t(size_t, count, MAX_RW_COUNT),
- };
struct kiocb kiocb;
- struct iov_iter iter;
ssize_t ret;
if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
@@ -519,8 +514,7 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
init_sync_kiocb(&kiocb, file);
kiocb.ki_pos = pos ? *pos : 0;
- iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
- ret = file->f_op->write_iter(&kiocb, &iter);
+ ret = file->f_op->write_iter(&kiocb, from);
if (ret > 0) {
if (pos)
*pos = kiocb.ki_pos;
@@ -530,6 +524,18 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
inc_syscw(current);
return ret;
}
+
+/* caller is responsible for file_start_write/file_end_write */
+ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
+{
+ struct kvec iov = {
+ .iov_base = (void *)buf,
+ .iov_len = min_t(size_t, count, MAX_RW_COUNT),
+ };
+ struct iov_iter iter;
+ iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len);
+ return __kernel_write_iter(file, &iter, pos);
+}
/*
* This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
* but autofs is one of the few internal kernel users that actually
@@ -905,7 +911,7 @@ static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
struct iov_iter iter;
ssize_t ret;
- ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
+ ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
if (ret >= 0) {
ret = do_iter_read(file, &iter, pos, flags);
kfree(iov);
@@ -922,7 +928,7 @@ static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
struct iov_iter iter;
ssize_t ret;
- ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
+ ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
if (ret >= 0) {
file_start_write(file);
ret = do_iter_write(file, &iter, pos, flags);
@@ -1382,6 +1388,8 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
size_t len, unsigned int flags)
{
+ lockdep_assert(sb_write_started(file_inode(file_out)->i_sb));
+
return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
}
@@ -1418,7 +1426,9 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
* and several different sets of file_operations, but they all end up
* using the same ->copy_file_range() function pointer.
*/
- if (file_out->f_op->copy_file_range) {
+ if (flags & COPY_FILE_SPLICE) {
+ /* cross sb splice is allowed */
+ } else if (file_out->f_op->copy_file_range) {
if (file_in->f_op->copy_file_range !=
file_out->f_op->copy_file_range)
return -EXDEV;
@@ -1468,8 +1478,9 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
size_t len, unsigned int flags)
{
ssize_t ret;
+ bool splice = flags & COPY_FILE_SPLICE;
- if (flags != 0)
+ if (flags & ~COPY_FILE_SPLICE)
return -EINVAL;
ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
@@ -1495,14 +1506,14 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
* same sb using clone, but for filesystems where both clone and copy
* are supported (e.g. nfs,cifs), we only call the copy method.
*/
- if (file_out->f_op->copy_file_range) {
+ if (!splice && file_out->f_op->copy_file_range) {
ret = file_out->f_op->copy_file_range(file_in, pos_in,
file_out, pos_out,
len, flags);
goto done;
}
- if (file_in->f_op->remap_file_range &&
+ if (!splice && file_in->f_op->remap_file_range &&
file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
ret = file_in->f_op->remap_file_range(file_in, pos_in,
file_out, pos_out,
@@ -1522,6 +1533,8 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
* consistent story about which filesystems support copy_file_range()
* and which filesystems do not, that will allow userspace tools to
* make consistent desicions w.r.t using copy_file_range().
+ *
+ * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE.
*/
ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
flags);
@@ -1576,6 +1589,10 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
pos_out = f_out.file->f_pos;
}
+ ret = -EINVAL;
+ if (flags != 0)
+ goto out;
+
ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
flags);
if (ret > 0) {
diff --git a/fs/readdir.c b/fs/readdir.c
index 09e8ed7d4161..9c53edb60c03 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -140,7 +140,7 @@ struct readdir_callback {
int result;
};
-static int fillonedir(struct dir_context *ctx, const char *name, int namlen,
+static bool fillonedir(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct readdir_callback *buf =
@@ -149,14 +149,14 @@ static int fillonedir(struct dir_context *ctx, const char *name, int namlen,
unsigned long d_ino;
if (buf->result)
- return -EINVAL;
+ return false;
buf->result = verify_dirent_name(name, namlen);
- if (buf->result < 0)
- return buf->result;
+ if (buf->result)
+ return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
buf->result = -EOVERFLOW;
- return -EOVERFLOW;
+ return false;
}
buf->result++;
dirent = buf->dirent;
@@ -169,12 +169,12 @@ static int fillonedir(struct dir_context *ctx, const char *name, int namlen,
unsafe_put_user(namlen, &dirent->d_namlen, efault_end);
unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end);
user_write_access_end();
- return 0;
+ return true;
efault_end:
user_write_access_end();
efault:
buf->result = -EFAULT;
- return -EFAULT;
+ return false;
}
SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
@@ -219,7 +219,7 @@ struct getdents_callback {
int error;
};
-static int filldir(struct dir_context *ctx, const char *name, int namlen,
+static bool filldir(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct linux_dirent __user *dirent, *prev;
@@ -232,18 +232,18 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen,
buf->error = verify_dirent_name(name, namlen);
if (unlikely(buf->error))
- return buf->error;
+ return false;
buf->error = -EINVAL; /* only used if we fail.. */
if (reclen > buf->count)
- return -EINVAL;
+ return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
buf->error = -EOVERFLOW;
- return -EOVERFLOW;
+ return false;
}
prev_reclen = buf->prev_reclen;
if (prev_reclen && signal_pending(current))
- return -EINTR;
+ return false;
dirent = buf->current_dir;
prev = (void __user *) dirent - prev_reclen;
if (!user_write_access_begin(prev, reclen + prev_reclen))
@@ -260,12 +260,12 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen,
buf->current_dir = (void __user *)dirent + reclen;
buf->prev_reclen = reclen;
buf->count -= reclen;
- return 0;
+ return true;
efault_end:
user_write_access_end();
efault:
buf->error = -EFAULT;
- return -EFAULT;
+ return false;
}
SYSCALL_DEFINE3(getdents, unsigned int, fd,
@@ -307,7 +307,7 @@ struct getdents_callback64 {
int error;
};
-static int filldir64(struct dir_context *ctx, const char *name, int namlen,
+static bool filldir64(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct linux_dirent64 __user *dirent, *prev;
@@ -319,13 +319,13 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen,
buf->error = verify_dirent_name(name, namlen);
if (unlikely(buf->error))
- return buf->error;
+ return false;
buf->error = -EINVAL; /* only used if we fail.. */
if (reclen > buf->count)
- return -EINVAL;
+ return false;
prev_reclen = buf->prev_reclen;
if (prev_reclen && signal_pending(current))
- return -EINTR;
+ return false;
dirent = buf->current_dir;
prev = (void __user *)dirent - prev_reclen;
if (!user_write_access_begin(prev, reclen + prev_reclen))
@@ -342,13 +342,13 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen,
buf->prev_reclen = reclen;
buf->current_dir = (void __user *)dirent + reclen;
buf->count -= reclen;
- return 0;
+ return true;
efault_end:
user_write_access_end();
efault:
buf->error = -EFAULT;
- return -EFAULT;
+ return false;
}
SYSCALL_DEFINE3(getdents64, unsigned int, fd,
@@ -397,7 +397,7 @@ struct compat_readdir_callback {
int result;
};
-static int compat_fillonedir(struct dir_context *ctx, const char *name,
+static bool compat_fillonedir(struct dir_context *ctx, const char *name,
int namlen, loff_t offset, u64 ino,
unsigned int d_type)
{
@@ -407,14 +407,14 @@ static int compat_fillonedir(struct dir_context *ctx, const char *name,
compat_ulong_t d_ino;
if (buf->result)
- return -EINVAL;
+ return false;
buf->result = verify_dirent_name(name, namlen);
- if (buf->result < 0)
- return buf->result;
+ if (buf->result)
+ return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
buf->result = -EOVERFLOW;
- return -EOVERFLOW;
+ return false;
}
buf->result++;
dirent = buf->dirent;
@@ -427,12 +427,12 @@ static int compat_fillonedir(struct dir_context *ctx, const char *name,
unsafe_put_user(namlen, &dirent->d_namlen, efault_end);
unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end);
user_write_access_end();
- return 0;
+ return true;
efault_end:
user_write_access_end();
efault:
buf->result = -EFAULT;
- return -EFAULT;
+ return false;
}
COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
@@ -471,7 +471,7 @@ struct compat_getdents_callback {
int error;
};
-static int compat_filldir(struct dir_context *ctx, const char *name, int namlen,
+static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct compat_linux_dirent __user *dirent, *prev;
@@ -484,18 +484,18 @@ static int compat_filldir(struct dir_context *ctx, const char *name, int namlen,
buf->error = verify_dirent_name(name, namlen);
if (unlikely(buf->error))
- return buf->error;
+ return false;
buf->error = -EINVAL; /* only used if we fail.. */
if (reclen > buf->count)
- return -EINVAL;
+ return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
buf->error = -EOVERFLOW;
- return -EOVERFLOW;
+ return false;
}
prev_reclen = buf->prev_reclen;
if (prev_reclen && signal_pending(current))
- return -EINTR;
+ return false;
dirent = buf->current_dir;
prev = (void __user *) dirent - prev_reclen;
if (!user_write_access_begin(prev, reclen + prev_reclen))
@@ -511,12 +511,12 @@ static int compat_filldir(struct dir_context *ctx, const char *name, int namlen,
buf->prev_reclen = reclen;
buf->current_dir = (void __user *)dirent + reclen;
buf->count -= reclen;
- return 0;
+ return true;
efault_end:
user_write_access_end();
efault:
buf->error = -EFAULT;
- return -EFAULT;
+ return false;
}
COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h
index d9052b8ce6dd..29c503a06db4 100644
--- a/fs/reiserfs/acl.h
+++ b/fs/reiserfs/acl.h
@@ -49,9 +49,9 @@ static inline int reiserfs_acl_count(size_t size)
#ifdef CONFIG_REISERFS_FS_POSIX_ACL
struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu);
-int reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int reiserfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
-int reiserfs_acl_chmod(struct inode *inode);
+int reiserfs_acl_chmod(struct dentry *dentry);
int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
struct inode *dir, struct dentry *dentry,
struct inode *inode);
@@ -63,7 +63,7 @@ int reiserfs_cache_default_acl(struct inode *dir);
#define reiserfs_get_acl NULL
#define reiserfs_set_acl NULL
-static inline int reiserfs_acl_chmod(struct inode *inode)
+static inline int reiserfs_acl_chmod(struct dentry *dentry)
{
return 0;
}
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 6e228bfbe7ef..467d13da198f 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -256,7 +256,7 @@ const struct inode_operations reiserfs_file_inode_operations = {
.setattr = reiserfs_setattr,
.listxattr = reiserfs_listxattr,
.permission = reiserfs_permission,
- .get_acl = reiserfs_get_acl,
+ .get_inode_acl = reiserfs_get_acl,
.set_acl = reiserfs_set_acl,
.fileattr_get = reiserfs_fileattr_get,
.fileattr_set = reiserfs_fileattr_set,
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index b9580a6515ee..c7d1fa526dea 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3404,7 +3404,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (!error && reiserfs_posixacl(inode->i_sb)) {
if (attr->ia_valid & ATTR_MODE)
- error = reiserfs_acl_chmod(inode);
+ error = reiserfs_acl_chmod(dentry);
}
out:
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 94addfcefede..9f62da7471c9 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -868,7 +868,7 @@ loop_next:
*/
if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) {
spin_unlock(lock);
- ll_rw_block(REQ_OP_WRITE, 1, &bh);
+ write_dirty_buffer(bh, 0);
spin_lock(lock);
}
put_bh(bh);
@@ -1054,7 +1054,7 @@ static int flush_commit_list(struct super_block *s,
if (tbh) {
if (buffer_dirty(tbh)) {
depth = reiserfs_write_unlock_nested(s);
- ll_rw_block(REQ_OP_WRITE, 1, &tbh);
+ write_dirty_buffer(tbh, 0);
reiserfs_write_lock_nested(s, depth);
}
put_bh(tbh) ;
@@ -2240,7 +2240,7 @@ abort_replay:
}
}
/* read in the log blocks, memcpy to the corresponding real block */
- ll_rw_block(REQ_OP_READ, get_desc_trans_len(desc), log_blocks);
+ bh_read_batch(get_desc_trans_len(desc), log_blocks);
for (i = 0; i < get_desc_trans_len(desc); i++) {
wait_on_buffer(log_blocks[i]);
@@ -2342,10 +2342,11 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev,
} else
bhlist[j++] = bh;
}
- ll_rw_block(REQ_OP_READ, j, bhlist);
+ bh = bhlist[0];
+ bh_read_nowait(bh, 0);
+ bh_readahead_batch(j - 1, &bhlist[1], 0);
for (i = 1; i < j; i++)
brelse(bhlist[i]);
- bh = bhlist[0];
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return bh;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 3d7a35d6a18b..0b8aa99749f1 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -696,6 +696,7 @@ static int reiserfs_create(struct user_namespace *mnt_userns, struct inode *dir,
out_failed:
reiserfs_write_unlock(dir->i_sb);
+ reiserfs_security_free(&security);
return retval;
}
@@ -779,6 +780,7 @@ static int reiserfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
out_failed:
reiserfs_write_unlock(dir->i_sb);
+ reiserfs_security_free(&security);
return retval;
}
@@ -878,6 +880,7 @@ static int reiserfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
retval = journal_end(&th);
out_failed:
reiserfs_write_unlock(dir->i_sb);
+ reiserfs_security_free(&security);
return retval;
}
@@ -1194,6 +1197,7 @@ static int reiserfs_symlink(struct user_namespace *mnt_userns,
retval = journal_end(&th);
out_failed:
reiserfs_write_unlock(parent_dir->i_sb);
+ reiserfs_security_free(&security);
return retval;
}
@@ -1659,7 +1663,7 @@ const struct inode_operations reiserfs_dir_inode_operations = {
.setattr = reiserfs_setattr,
.listxattr = reiserfs_listxattr,
.permission = reiserfs_permission,
- .get_acl = reiserfs_get_acl,
+ .get_inode_acl = reiserfs_get_acl,
.set_acl = reiserfs_set_acl,
.fileattr_get = reiserfs_fileattr_get,
.fileattr_set = reiserfs_fileattr_set,
@@ -1683,6 +1687,6 @@ const struct inode_operations reiserfs_special_inode_operations = {
.setattr = reiserfs_setattr,
.listxattr = reiserfs_listxattr,
.permission = reiserfs_permission,
- .get_acl = reiserfs_get_acl,
+ .get_inode_acl = reiserfs_get_acl,
.set_acl = reiserfs_set_acl,
};
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 30319dc33c18..84a194b77f19 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -456,7 +456,7 @@ static int print_internal(struct buffer_head *bh, int first, int last)
to = B_NR_ITEMS(bh);
} else {
from = first;
- to = last < B_NR_ITEMS(bh) ? last : B_NR_ITEMS(bh);
+ to = min_t(int, last, B_NR_ITEMS(bh));
}
reiserfs_printk("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh);
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 4a7cb16e9345..3dba8acf4e83 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -411,7 +411,7 @@ int reiserfs_proc_info_init(struct super_block *sb)
char *s;
/* Some block devices use /'s */
- strlcpy(b, sb->s_id, BDEVNAME_SIZE);
+ strscpy(b, sb->s_id, BDEVNAME_SIZE);
s = strchr(b, '/');
if (s)
*s = '!';
@@ -441,7 +441,7 @@ int reiserfs_proc_info_done(struct super_block *sb)
char *s;
/* Some block devices use /'s */
- strlcpy(b, sb->s_id, BDEVNAME_SIZE);
+ strscpy(b, sb->s_id, BDEVNAME_SIZE);
s = strchr(b, '/');
if (s)
*s = '!';
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index 8096c74c38ac..7b498a0d060b 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -97,7 +97,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
* using the copy_size var below allows this code to work for
* both shrinking and expanding the FS.
*/
- copy_size = bmap_nr_new < bmap_nr ? bmap_nr_new : bmap_nr;
+ copy_size = min(bmap_nr_new, bmap_nr);
copy_size =
copy_size * sizeof(struct reiserfs_list_bitmap_node *);
for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 9a293609a022..84c12a1947b2 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -579,7 +579,7 @@ static int search_by_key_reada(struct super_block *s,
if (!buffer_uptodate(bh[j])) {
if (depth == -1)
depth = reiserfs_write_unlock_nested(s);
- ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, bh + j);
+ bh_readahead(bh[j], REQ_RAHEAD);
}
brelse(bh[j]);
}
@@ -685,7 +685,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key,
if (!buffer_uptodate(bh) && depth == -1)
depth = reiserfs_write_unlock_nested(sb);
- ll_rw_block(REQ_OP_READ, 1, &bh);
+ bh_read_nowait(bh, 0);
wait_on_buffer(bh);
if (depth != -1)
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index c88cd2ce0665..929acce6e731 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1702,9 +1702,7 @@ static int read_super_block(struct super_block *s, int offset)
/* after journal replay, reread all bitmap and super blocks */
static int reread_meta_blocks(struct super_block *s)
{
- ll_rw_block(REQ_OP_READ, 1, &SB_BUFFER_WITH_SB(s));
- wait_on_buffer(SB_BUFFER_WITH_SB(s));
- if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
+ if (bh_read(SB_BUFFER_WITH_SB(s), 0) < 0) {
reiserfs_warning(s, "reiserfs-2504", "error reading the super");
return 1;
}
@@ -2504,9 +2502,7 @@ static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data,
len = i_size - off;
toread = len;
while (toread > 0) {
- tocopy =
- sb->s_blocksize - offset <
- toread ? sb->s_blocksize - offset : toread;
+ tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread);
tmp_bh.b_state = 0;
/*
* Quota files are without tails so we can safely
@@ -2554,8 +2550,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
return -EIO;
}
while (towrite > 0) {
- tocopy = sb->s_blocksize - offset < towrite ?
- sb->s_blocksize - offset : towrite;
+ tocopy = min_t(unsigned long, sb->s_blocksize - offset, towrite);
tmp_bh.b_state = 0;
reiserfs_write_lock(sb);
err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE);
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 436641369283..8b2d52443f41 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -189,7 +189,7 @@ struct reiserfs_dentry_buf {
struct dentry *dentries[8];
};
-static int
+static bool
fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
loff_t offset, u64 ino, unsigned int d_type)
{
@@ -200,16 +200,16 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
WARN_ON_ONCE(!inode_is_locked(d_inode(dbuf->xadir)));
if (dbuf->count == ARRAY_SIZE(dbuf->dentries))
- return -ENOSPC;
+ return false;
if (name[0] == '.' && (namelen < 2 ||
(namelen == 2 && name[1] == '.')))
- return 0;
+ return true;
dentry = lookup_one_len(name, dbuf->xadir, namelen);
if (IS_ERR(dentry)) {
dbuf->err = PTR_ERR(dentry);
- return PTR_ERR(dentry);
+ return false;
} else if (d_really_is_negative(dentry)) {
/* A directory entry exists, but no file? */
reiserfs_error(dentry->d_sb, "xattr-20003",
@@ -218,11 +218,11 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
dentry, dbuf->xadir);
dput(dentry);
dbuf->err = -EIO;
- return -EIO;
+ return false;
}
dbuf->dentries[dbuf->count++] = dentry;
- return 0;
+ return true;
}
static void
@@ -797,7 +797,7 @@ struct listxattr_buf {
struct dentry *dentry;
};
-static int listxattr_filler(struct dir_context *ctx, const char *name,
+static bool listxattr_filler(struct dir_context *ctx, const char *name,
int namelen, loff_t offset, u64 ino,
unsigned int d_type)
{
@@ -813,19 +813,19 @@ static int listxattr_filler(struct dir_context *ctx, const char *name,
name);
if (!handler /* Unsupported xattr name */ ||
(handler->list && !handler->list(b->dentry)))
- return 0;
+ return true;
size = namelen + 1;
if (b->buf) {
if (b->pos + size > b->size) {
b->pos = -ERANGE;
- return -ERANGE;
+ return false;
}
memcpy(b->buf + b->pos, name, namelen);
b->buf[b->pos + namelen] = 0;
}
b->pos += size;
}
- return 0;
+ return true;
}
/*
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index d6fcddc46f5b..93fe414fed18 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -18,7 +18,7 @@ static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th,
int
-reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+reiserfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int error, error2;
@@ -26,6 +26,7 @@ reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
size_t jcreate_blocks;
int size = acl ? posix_acl_xattr_size(acl->a_count) : 0;
int update_mode = 0;
+ struct inode *inode = d_inode(dentry);
umode_t mode = inode->i_mode;
/*
@@ -371,7 +372,7 @@ int reiserfs_cache_default_acl(struct inode *inode)
if (IS_PRIVATE(inode))
return 0;
- acl = get_acl(inode, ACL_TYPE_DEFAULT);
+ acl = get_inode_acl(inode, ACL_TYPE_DEFAULT);
if (acl && !IS_ERR(acl)) {
int size = reiserfs_acl_size(acl->a_count);
@@ -396,13 +397,15 @@ int reiserfs_cache_default_acl(struct inode *inode)
/*
* Called under i_mutex
*/
-int reiserfs_acl_chmod(struct inode *inode)
+int reiserfs_acl_chmod(struct dentry *dentry)
{
+ struct inode *inode = d_inode(dentry);
+
if (IS_PRIVATE(inode))
return 0;
if (get_inode_sd_version(inode) == STAT_DATA_V1 ||
!reiserfs_posixacl(inode->i_sb))
return 0;
- return posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
+ return posix_acl_chmod(&init_user_ns, dentry, inode->i_mode);
}
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 8965c8e5e172..857a65b05726 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -50,6 +50,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
int error;
sec->name = NULL;
+ sec->value = NULL;
/* Don't add selinux attributes on xattrs - they'll never get used */
if (IS_PRIVATE(dir))
@@ -95,7 +96,6 @@ int reiserfs_security_write(struct reiserfs_transaction_handle *th,
void reiserfs_security_free(struct reiserfs_security_handle *sec)
{
- kfree(sec->name);
kfree(sec->value);
sec->name = NULL;
sec->value = NULL;
diff --git a/fs/remap_range.c b/fs/remap_range.c
index 654912d06862..41f60477bb41 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -304,7 +304,7 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
/* Check that we don't violate system file offset limits. */
ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
remap_flags);
- if (ret)
+ if (ret || *len == 0)
return ret;
/* Wait for the completion of any pending IOs on both files */
@@ -328,9 +328,6 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (remap_flags & REMAP_FILE_DEDUP) {
bool is_same = false;
- if (*len == 0)
- return 0;
-
if (!IS_DAX(inode_in))
ret = vfs_dedupe_file_range_compare(file_in, pos_in,
file_out, pos_out, *len, &is_same);
@@ -348,7 +345,7 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
remap_flags);
- if (ret)
+ if (ret || *len == 0)
return ret;
/* If can't alter the file contents, we're done. */
@@ -429,7 +426,7 @@ static bool allow_file_dedupe(struct file *file)
return true;
if (file->f_mode & FMODE_WRITE)
return true;
- if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)))
+ if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), current_fsuid()))
return true;
if (!inode_permission(mnt_userns, inode, MAY_WRITE))
return true;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 9456a2032224..f5fdaf3b1572 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -156,7 +156,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
ssize_t ret;
init_sync_kiocb(&kiocb, file);
- iov_iter_init(&iter, READ, &iov, 1, size);
+ iov_iter_init(&iter, ITER_DEST, &iov, 1, size);
kiocb.ki_pos = *ppos;
ret = seq_read_iter(&kiocb, &iter);
diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h
index 2cab413fffee..7d605db3bb3b 100644
--- a/fs/smbfs_common/smb2pdu.h
+++ b/fs/smbfs_common/smb2pdu.h
@@ -1101,7 +1101,11 @@ struct smb2_change_notify_rsp {
#define SMB2_CREATE_REQUEST_LEASE "RqLs"
#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q"
#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C"
-#define SMB2_CREATE_TAG_POSIX "\x93\xAD\x25\x50\x9C\xB4\x11\xE7\xB4\x23\x83\xDE\x96\x8B\xCD\x7C"
+#define SMB2_CREATE_TAG_POSIX "\x93\xAD\x25\x50\x9C\xB4\x11\xE7\xB4\x23\x83\xDE\x96\x8B\xCD\x7C"
+#define SMB2_CREATE_APP_INSTANCE_ID "\x45\xBC\xA6\x6A\xEF\xA7\xF7\x4A\x90\x08\xFA\x46\x2E\x14\x4D\x74"
+#define SMB2_CREATE_APP_INSTANCE_VERSION "\xB9\x82\xD0\xB7\x3B\x56\x07\x4F\xA0\x7B\x52\x4A\x81\x16\xA0\x10"
+#define SVHDX_OPEN_DEVICE_CONTEXT "\x9C\xCB\xCF\x9E\x04\xC1\xE6\x43\x98\x0E\x15\x8D\xA1\xF6\xEC\x83"
+#define SMB2_CREATE_TAG_AAPL "AAPL"
/* Flag (SMB3 open response) values */
#define SMB2_CREATE_FLAG_REPARSEPOINT 0x01
diff --git a/fs/splice.c b/fs/splice.c
index 0878b852b355..5969b7a1d353 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -303,7 +303,7 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
struct kiocb kiocb;
int ret;
- iov_iter_pipe(&to, READ, pipe, len);
+ iov_iter_pipe(&to, ITER_DEST, pipe, len);
init_sync_kiocb(&kiocb, in);
kiocb.ki_pos = *ppos;
ret = call_read_iter(in, &kiocb, &to);
@@ -682,7 +682,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
n++;
}
- iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left);
+ iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left);
ret = vfs_iter_write(out, &from, &sd.pos, 0);
if (ret <= 0)
break;
@@ -1263,9 +1263,9 @@ static int vmsplice_type(struct fd f, int *type)
if (!f.file)
return -EBADF;
if (f.file->f_mode & FMODE_WRITE) {
- *type = WRITE;
+ *type = ITER_SOURCE;
} else if (f.file->f_mode & FMODE_READ) {
- *type = READ;
+ *type = ITER_DEST;
} else {
fdput(f);
return -EBADF;
@@ -1314,7 +1314,7 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
if (!iov_iter_count(&iter))
error = 0;
- else if (iov_iter_rw(&iter) == WRITE)
+ else if (type == ITER_SOURCE)
error = vmsplice_to_pipe(f.file, &iter, flags);
else
error = vmsplice_to_user(f.file, &iter, flags);
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 916e78fabcaa..60fc98bdf421 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -54,9 +54,35 @@ config SQUASHFS_FILE_DIRECT
endchoice
+config SQUASHFS_DECOMP_SINGLE
+ depends on SQUASHFS
+ def_bool n
+
+config SQUASHFS_DECOMP_MULTI
+ depends on SQUASHFS
+ def_bool n
+
+config SQUASHFS_DECOMP_MULTI_PERCPU
+ depends on SQUASHFS
+ def_bool n
+
+config SQUASHFS_CHOICE_DECOMP_BY_MOUNT
+ bool "Select the parallel decompression mode during mount"
+ depends on SQUASHFS
+ default n
+ select SQUASHFS_DECOMP_SINGLE
+ select SQUASHFS_DECOMP_MULTI
+ select SQUASHFS_DECOMP_MULTI_PERCPU
+ select SQUASHFS_MOUNT_DECOMP_THREADS
+ help
+ Compile all parallel decompression modes and specify the
+ decompression mode by setting "threads=" during mount.
+ default Decompressor parallelisation is SQUASHFS_DECOMP_SINGLE
+
choice
- prompt "Decompressor parallelisation options"
+ prompt "Select decompression parallel mode at compile time"
depends on SQUASHFS
+ depends on !SQUASHFS_CHOICE_DECOMP_BY_MOUNT
help
Squashfs now supports three parallelisation options for
decompression. Each one exhibits various trade-offs between
@@ -64,15 +90,17 @@ choice
If in doubt, select "Single threaded compression"
-config SQUASHFS_DECOMP_SINGLE
+config SQUASHFS_COMPILE_DECOMP_SINGLE
bool "Single threaded compression"
+ select SQUASHFS_DECOMP_SINGLE
help
Traditionally Squashfs has used single-threaded decompression.
Only one block (data or metadata) can be decompressed at any
one time. This limits CPU and memory usage to a minimum.
-config SQUASHFS_DECOMP_MULTI
+config SQUASHFS_COMPILE_DECOMP_MULTI
bool "Use multiple decompressors for parallel I/O"
+ select SQUASHFS_DECOMP_MULTI
help
By default Squashfs uses a single decompressor but it gives
poor performance on parallel I/O workloads when using multiple CPU
@@ -85,8 +113,9 @@ config SQUASHFS_DECOMP_MULTI
decompressors per core. It dynamically allocates decompressors
on a demand basis.
-config SQUASHFS_DECOMP_MULTI_PERCPU
+config SQUASHFS_COMPILE_DECOMP_MULTI_PERCPU
bool "Use percpu multiple decompressors for parallel I/O"
+ select SQUASHFS_DECOMP_MULTI_PERCPU
help
By default Squashfs uses a single decompressor but it gives
poor performance on parallel I/O workloads when using multiple CPU
@@ -95,9 +124,21 @@ config SQUASHFS_DECOMP_MULTI_PERCPU
This decompressor implementation uses a maximum of one
decompressor per core. It uses percpu variables to ensure
decompression is load-balanced across the cores.
-
endchoice
+config SQUASHFS_MOUNT_DECOMP_THREADS
+ bool "Add the mount parameter 'threads=' for squashfs"
+ depends on SQUASHFS
+ depends on SQUASHFS_DECOMP_MULTI
+ default n
+ help
+ Use threads= to set the decompression parallel mode and the number of threads.
+ If SQUASHFS_CHOICE_DECOMP_BY_MOUNT=y
+ threads=<single|multi|percpu|1|2|3|...>
+ else
+ threads=<2|3|...>
+ The upper limit is num_online_cpus() * 2.
+
config SQUASHFS_XATTR
bool "Squashfs XATTR support"
depends on SQUASHFS
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 833aca92301f..bed3bb8b27fa 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -216,7 +216,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
res = -EIO;
goto out_free_bio;
}
- res = squashfs_decompress(msblk, bio, offset, length, output);
+ res = msblk->thread_ops->decompress(msblk, bio, offset, length, output);
} else {
res = copy_bio_to_actor(bio, output, offset, length);
}
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index d57bef91ab08..8893cb9b4198 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -134,7 +134,7 @@ void *squashfs_decompressor_setup(struct super_block *sb, unsigned short flags)
if (IS_ERR(comp_opts))
return comp_opts;
- stream = squashfs_decompressor_create(msblk, comp_opts);
+ stream = msblk->thread_ops->create(msblk, comp_opts);
if (IS_ERR(stream))
kfree(comp_opts);
diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c
index db9f12a3ea05..416c53eedbd1 100644
--- a/fs/squashfs/decompressor_multi.c
+++ b/fs/squashfs/decompressor_multi.c
@@ -29,12 +29,11 @@
#define MAX_DECOMPRESSOR (num_online_cpus() * 2)
-int squashfs_max_decompressors(void)
+static int squashfs_max_decompressors(void)
{
return MAX_DECOMPRESSOR;
}
-
struct squashfs_stream {
void *comp_opts;
struct list_head strm_list;
@@ -59,7 +58,7 @@ static void put_decomp_stream(struct decomp_stream *decomp_strm,
wake_up(&stream->wait);
}
-void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
+static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
void *comp_opts)
{
struct squashfs_stream *stream;
@@ -103,7 +102,7 @@ out:
}
-void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
+static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
{
struct squashfs_stream *stream = msblk->stream;
if (stream) {
@@ -145,7 +144,7 @@ static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk,
* If there is no available decomp and already full,
* let's wait for releasing decomp from other users.
*/
- if (stream->avail_decomp >= MAX_DECOMPRESSOR)
+ if (stream->avail_decomp >= msblk->max_thread_num)
goto wait;
/* Let's allocate new decomp */
@@ -161,7 +160,7 @@ static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk,
}
stream->avail_decomp++;
- WARN_ON(stream->avail_decomp > MAX_DECOMPRESSOR);
+ WARN_ON(stream->avail_decomp > msblk->max_thread_num);
mutex_unlock(&stream->mutex);
break;
@@ -180,7 +179,7 @@ wait:
}
-int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
+static int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
int offset, int length,
struct squashfs_page_actor *output)
{
@@ -195,3 +194,10 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
msblk->decompressor->name);
return res;
}
+
+const struct squashfs_decompressor_thread_ops squashfs_decompressor_multi = {
+ .create = squashfs_decompressor_create,
+ .destroy = squashfs_decompressor_destroy,
+ .decompress = squashfs_decompress,
+ .max_decompressors = squashfs_max_decompressors,
+};
diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c
index b881b9283b7f..1dfadf76ed9a 100644
--- a/fs/squashfs/decompressor_multi_percpu.c
+++ b/fs/squashfs/decompressor_multi_percpu.c
@@ -25,7 +25,7 @@ struct squashfs_stream {
local_lock_t lock;
};
-void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
+static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
void *comp_opts)
{
struct squashfs_stream *stream;
@@ -59,7 +59,7 @@ out:
return ERR_PTR(err);
}
-void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
+static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
{
struct squashfs_stream __percpu *percpu =
(struct squashfs_stream __percpu *) msblk->stream;
@@ -75,19 +75,21 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
}
}
-int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
+static int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
int offset, int length, struct squashfs_page_actor *output)
{
struct squashfs_stream *stream;
+ struct squashfs_stream __percpu *percpu =
+ (struct squashfs_stream __percpu *) msblk->stream;
int res;
- local_lock(&msblk->stream->lock);
- stream = this_cpu_ptr(msblk->stream);
+ local_lock(&percpu->lock);
+ stream = this_cpu_ptr(percpu);
res = msblk->decompressor->decompress(msblk, stream->stream, bio,
offset, length, output);
- local_unlock(&msblk->stream->lock);
+ local_unlock(&percpu->lock);
if (res < 0)
ERROR("%s decompression failed, data probably corrupt\n",
@@ -96,7 +98,14 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
return res;
}
-int squashfs_max_decompressors(void)
+static int squashfs_max_decompressors(void)
{
return num_possible_cpus();
}
+
+const struct squashfs_decompressor_thread_ops squashfs_decompressor_percpu = {
+ .create = squashfs_decompressor_create,
+ .destroy = squashfs_decompressor_destroy,
+ .decompress = squashfs_decompress,
+ .max_decompressors = squashfs_max_decompressors,
+};
diff --git a/fs/squashfs/decompressor_single.c b/fs/squashfs/decompressor_single.c
index 4eb3d083d45e..6f161887710b 100644
--- a/fs/squashfs/decompressor_single.c
+++ b/fs/squashfs/decompressor_single.c
@@ -24,7 +24,7 @@ struct squashfs_stream {
struct mutex mutex;
};
-void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
+static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
void *comp_opts)
{
struct squashfs_stream *stream;
@@ -49,7 +49,7 @@ out:
return ERR_PTR(err);
}
-void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
+static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
{
struct squashfs_stream *stream = msblk->stream;
@@ -59,7 +59,7 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
}
}
-int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
+static int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
int offset, int length,
struct squashfs_page_actor *output)
{
@@ -78,7 +78,14 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
return res;
}
-int squashfs_max_decompressors(void)
+static int squashfs_max_decompressors(void)
{
return 1;
}
+
+const struct squashfs_decompressor_thread_ops squashfs_decompressor_single = {
+ .create = squashfs_decompressor_create,
+ .destroy = squashfs_decompressor_destroy,
+ .decompress = squashfs_decompress,
+ .max_decompressors = squashfs_max_decompressors,
+};
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index e56510964b22..8ba8c4c50770 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -506,8 +506,9 @@ static int squashfs_readahead_fragment(struct page **page,
squashfs_i(inode)->fragment_size);
struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
unsigned int n, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
+ int error = buffer->error;
- if (buffer->error)
+ if (error)
goto out;
expected += squashfs_i(inode)->fragment_offset;
@@ -529,7 +530,7 @@ static int squashfs_readahead_fragment(struct page **page,
out:
squashfs_cache_put(buffer);
- return buffer->error;
+ return error;
}
static void squashfs_readahead(struct readahead_control *ractl)
@@ -557,6 +558,13 @@ static void squashfs_readahead(struct readahead_control *ractl)
int res, bsize;
u64 block = 0;
unsigned int expected;
+ struct page *last_page;
+
+ expected = start >> msblk->block_log == file_end ?
+ (i_size_read(inode) & (msblk->block_size - 1)) :
+ msblk->block_size;
+
+ max_pages = (expected + PAGE_SIZE - 1) >> PAGE_SHIFT;
nr_pages = __readahead_batch(ractl, pages, max_pages);
if (!nr_pages)
@@ -566,13 +574,10 @@ static void squashfs_readahead(struct readahead_control *ractl)
goto skip_pages;
index = pages[0]->index >> shift;
+
if ((pages[nr_pages - 1]->index >> shift) != index)
goto skip_pages;
- expected = index == file_end ?
- (i_size_read(inode) & (msblk->block_size - 1)) :
- msblk->block_size;
-
if (index == file_end && squashfs_i(inode)->fragment_block !=
SQUASHFS_INVALID_BLK) {
res = squashfs_readahead_fragment(pages, nr_pages,
@@ -593,15 +598,15 @@ static void squashfs_readahead(struct readahead_control *ractl)
res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor);
- squashfs_page_actor_free(actor);
+ last_page = squashfs_page_actor_free(actor);
if (res == expected) {
int bytes;
/* Last page (if present) may have trailing bytes not filled */
bytes = res % PAGE_SIZE;
- if (pages[nr_pages - 1]->index == file_end && bytes)
- memzero_page(pages[nr_pages - 1], bytes,
+ if (index == file_end && bytes && last_page)
+ memzero_page(last_page, bytes,
PAGE_SIZE - bytes);
for (i = 0; i < nr_pages; i++) {
diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c
index 54b93bf4a25c..81af6c4ca115 100644
--- a/fs/squashfs/page_actor.c
+++ b/fs/squashfs/page_actor.c
@@ -71,11 +71,13 @@ static void *handle_next_page(struct squashfs_page_actor *actor)
(actor->next_index != actor->page[actor->next_page]->index)) {
actor->next_index++;
actor->returned_pages++;
+ actor->last_page = NULL;
return actor->alloc_buffer ? actor->tmp_buffer : ERR_PTR(-ENOMEM);
}
actor->next_index++;
actor->returned_pages++;
+ actor->last_page = actor->page[actor->next_page];
return actor->pageaddr = kmap_local_page(actor->page[actor->next_page++]);
}
@@ -125,6 +127,7 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_
actor->returned_pages = 0;
actor->next_index = page[0]->index & ~((1 << (msblk->block_log - PAGE_SHIFT)) - 1);
actor->pageaddr = NULL;
+ actor->last_page = NULL;
actor->alloc_buffer = msblk->decompressor->alloc_buffer;
actor->squashfs_first_page = direct_first_page;
actor->squashfs_next_page = direct_next_page;
diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h
index 95ffbb543d91..97d4983559b1 100644
--- a/fs/squashfs/page_actor.h
+++ b/fs/squashfs/page_actor.h
@@ -16,6 +16,7 @@ struct squashfs_page_actor {
void *(*squashfs_first_page)(struct squashfs_page_actor *);
void *(*squashfs_next_page)(struct squashfs_page_actor *);
void (*squashfs_finish_page)(struct squashfs_page_actor *);
+ struct page *last_page;
int pages;
int length;
int next_page;
@@ -29,10 +30,13 @@ extern struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
extern struct squashfs_page_actor *squashfs_page_actor_init_special(
struct squashfs_sb_info *msblk,
struct page **page, int pages, int length);
-static inline void squashfs_page_actor_free(struct squashfs_page_actor *actor)
+static inline struct page *squashfs_page_actor_free(struct squashfs_page_actor *actor)
{
+ struct page *last_page = actor->last_page;
+
kfree(actor->tmp_buffer);
kfree(actor);
+ return last_page;
}
static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
{
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 9783e01c8100..a6164fdf9435 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -38,11 +38,24 @@ extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int);
extern void *squashfs_decompressor_setup(struct super_block *, unsigned short);
/* decompressor_xxx.c */
-extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *);
-extern void squashfs_decompressor_destroy(struct squashfs_sb_info *);
-extern int squashfs_decompress(struct squashfs_sb_info *, struct bio *,
- int, int, struct squashfs_page_actor *);
-extern int squashfs_max_decompressors(void);
+
+struct squashfs_decompressor_thread_ops {
+ void * (*create)(struct squashfs_sb_info *msblk, void *comp_opts);
+ void (*destroy)(struct squashfs_sb_info *msblk);
+ int (*decompress)(struct squashfs_sb_info *msblk, struct bio *bio,
+ int offset, int length, struct squashfs_page_actor *output);
+ int (*max_decompressors)(void);
+};
+
+#ifdef CONFIG_SQUASHFS_DECOMP_SINGLE
+extern const struct squashfs_decompressor_thread_ops squashfs_decompressor_single;
+#endif
+#ifdef CONFIG_SQUASHFS_DECOMP_MULTI
+extern const struct squashfs_decompressor_thread_ops squashfs_decompressor_multi;
+#endif
+#ifdef CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU
+extern const struct squashfs_decompressor_thread_ops squashfs_decompressor_percpu;
+#endif
/* export.c */
extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, u64,
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 1e90c2575f9b..659082e9e51d 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -53,7 +53,7 @@ struct squashfs_sb_info {
__le64 *xattr_id_table;
struct mutex meta_index_mutex;
struct meta_index *meta_index;
- struct squashfs_stream *stream;
+ void *stream;
__le64 *inode_lookup_table;
u64 inode_table;
u64 directory_table;
@@ -66,5 +66,7 @@ struct squashfs_sb_info {
int xattr_ids;
unsigned int ids;
bool panic_on_errors;
+ const struct squashfs_decompressor_thread_ops *thread_ops;
+ int max_thread_num;
};
#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 32565dafa7f3..e090fae48e68 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -47,10 +47,13 @@ enum Opt_errors {
enum squashfs_param {
Opt_errors,
+ Opt_threads,
};
struct squashfs_mount_opts {
enum Opt_errors errors;
+ const struct squashfs_decompressor_thread_ops *thread_ops;
+ int thread_num;
};
static const struct constant_table squashfs_param_errors[] = {
@@ -61,9 +64,66 @@ static const struct constant_table squashfs_param_errors[] = {
static const struct fs_parameter_spec squashfs_fs_parameters[] = {
fsparam_enum("errors", Opt_errors, squashfs_param_errors),
+ fsparam_string("threads", Opt_threads),
{}
};
+
+static int squashfs_parse_param_threads_str(const char *str, struct squashfs_mount_opts *opts)
+{
+#ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT
+ if (strcmp(str, "single") == 0) {
+ opts->thread_ops = &squashfs_decompressor_single;
+ return 0;
+ }
+ if (strcmp(str, "multi") == 0) {
+ opts->thread_ops = &squashfs_decompressor_multi;
+ return 0;
+ }
+ if (strcmp(str, "percpu") == 0) {
+ opts->thread_ops = &squashfs_decompressor_percpu;
+ return 0;
+ }
+#endif
+ return -EINVAL;
+}
+
+static int squashfs_parse_param_threads_num(const char *str, struct squashfs_mount_opts *opts)
+{
+#ifdef CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS
+ int ret;
+ unsigned long num;
+
+ ret = kstrtoul(str, 0, &num);
+ if (ret != 0)
+ return -EINVAL;
+ if (num > 1) {
+ opts->thread_ops = &squashfs_decompressor_multi;
+ if (num > opts->thread_ops->max_decompressors())
+ return -EINVAL;
+ opts->thread_num = (int)num;
+ return 0;
+ }
+#ifdef CONFIG_SQUASHFS_DECOMP_SINGLE
+ if (num == 1) {
+ opts->thread_ops = &squashfs_decompressor_single;
+ opts->thread_num = 1;
+ return 0;
+ }
+#endif
+#endif /* !CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS */
+ return -EINVAL;
+}
+
+static int squashfs_parse_param_threads(const char *str, struct squashfs_mount_opts *opts)
+{
+ int ret = squashfs_parse_param_threads_str(str, opts);
+
+ if (ret == 0)
+ return ret;
+ return squashfs_parse_param_threads_num(str, opts);
+}
+
static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct squashfs_mount_opts *opts = fc->fs_private;
@@ -78,6 +138,10 @@ static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *para
case Opt_errors:
opts->errors = result.uint_32;
break;
+ case Opt_threads:
+ if (squashfs_parse_param_threads(param->string, opts) != 0)
+ return -EINVAL;
+ break;
default:
return -EINVAL;
}
@@ -133,6 +197,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
return -ENOMEM;
}
msblk = sb->s_fs_info;
+ msblk->thread_ops = opts->thread_ops;
msblk->panic_on_errors = (opts->errors == Opt_errors_panic);
@@ -168,6 +233,12 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
goto failed_mount;
}
+ if (opts->thread_num == 0) {
+ msblk->max_thread_num = msblk->thread_ops->max_decompressors();
+ } else {
+ msblk->max_thread_num = opts->thread_num;
+ }
+
/* Check the MAJOR & MINOR versions and lookup compression type */
msblk->decompressor = supported_squashfs_filesystem(
fc,
@@ -252,7 +323,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
/* Allocate read_page block */
msblk->read_page = squashfs_cache_init("data",
- squashfs_max_decompressors(), msblk->block_size);
+ msblk->max_thread_num, msblk->block_size);
if (msblk->read_page == NULL) {
errorf(fc, "Failed to allocate read_page block");
goto failed_mount;
@@ -383,7 +454,7 @@ failed_mount:
squashfs_cache_delete(msblk->block_cache);
squashfs_cache_delete(msblk->fragment_cache);
squashfs_cache_delete(msblk->read_page);
- squashfs_decompressor_destroy(msblk);
+ msblk->thread_ops->destroy(msblk);
kfree(msblk->inode_lookup_table);
kfree(msblk->fragment_index);
kfree(msblk->id_table);
@@ -435,6 +506,19 @@ static int squashfs_show_options(struct seq_file *s, struct dentry *root)
else
seq_puts(s, ",errors=continue");
+#ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT
+ if (msblk->thread_ops == &squashfs_decompressor_single) {
+ seq_puts(s, ",threads=single");
+ return 0;
+ }
+ if (msblk->thread_ops == &squashfs_decompressor_percpu) {
+ seq_puts(s, ",threads=percpu");
+ return 0;
+ }
+#endif
+#ifdef CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS
+ seq_printf(s, ",threads=%d", msblk->max_thread_num);
+#endif
return 0;
}
@@ -446,6 +530,16 @@ static int squashfs_init_fs_context(struct fs_context *fc)
if (!opts)
return -ENOMEM;
+#ifdef CONFIG_SQUASHFS_DECOMP_SINGLE
+ opts->thread_ops = &squashfs_decompressor_single;
+#elif defined(CONFIG_SQUASHFS_DECOMP_MULTI)
+ opts->thread_ops = &squashfs_decompressor_multi;
+#elif defined(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU)
+ opts->thread_ops = &squashfs_decompressor_percpu;
+#else
+#error "fail: unknown squashfs decompression thread mode?"
+#endif
+ opts->thread_num = 0;
fc->fs_private = opts;
fc->ops = &squashfs_context_ops;
return 0;
@@ -478,7 +572,7 @@ static void squashfs_put_super(struct super_block *sb)
squashfs_cache_delete(sbi->block_cache);
squashfs_cache_delete(sbi->fragment_cache);
squashfs_cache_delete(sbi->read_page);
- squashfs_decompressor_destroy(sbi);
+ sbi->thread_ops->destroy(sbi);
kfree(sbi->id_table);
kfree(sbi->fragment_index);
kfree(sbi->meta_index);
@@ -568,7 +662,7 @@ static struct file_system_type squashfs_fs_type = {
.init_fs_context = squashfs_init_fs_context,
.parameters = squashfs_fs_parameters,
.kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("squashfs");
diff --git a/fs/stat.c b/fs/stat.c
index 9ced8860e0f3..d6cc74ca8486 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -5,6 +5,7 @@
* Copyright (C) 1991, 1992 Linus Torvalds
*/
+#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/errno.h>
@@ -43,12 +44,15 @@
void generic_fillattr(struct user_namespace *mnt_userns, struct inode *inode,
struct kstat *stat)
{
+ vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+
stat->dev = inode->i_sb->s_dev;
stat->ino = inode->i_ino;
stat->mode = inode->i_mode;
stat->nlink = inode->i_nlink;
- stat->uid = i_uid_into_mnt(mnt_userns, inode);
- stat->gid = i_gid_into_mnt(mnt_userns, inode);
+ stat->uid = vfsuid_into_kuid(vfsuid);
+ stat->gid = vfsgid_into_kgid(vfsgid);
stat->rdev = inode->i_rdev;
stat->size = i_size_read(inode);
stat->atime = inode->i_atime;
@@ -230,11 +234,22 @@ retry:
goto out;
error = vfs_getattr(&path, stat, request_mask, flags);
+
stat->mnt_id = real_mount(path.mnt)->mnt_id;
stat->result_mask |= STATX_MNT_ID;
+
if (path.mnt->mnt_root == path.dentry)
stat->attributes |= STATX_ATTR_MOUNT_ROOT;
stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT;
+
+ /* Handle STATX_DIOALIGN for block devices. */
+ if (request_mask & STATX_DIOALIGN) {
+ struct inode *inode = d_backing_inode(path.dentry);
+
+ if (S_ISBLK(inode->i_mode))
+ bdev_statx_dioalign(inode, stat);
+ }
+
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
@@ -611,6 +626,8 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
tmp.stx_dev_major = MAJOR(stat->dev);
tmp.stx_dev_minor = MINOR(stat->dev);
tmp.stx_mnt_id = stat->mnt_id;
+ tmp.stx_dio_mem_align = stat->dio_mem_align;
+ tmp.stx_dio_offset_align = stat->dio_offset_align;
return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
diff --git a/fs/super.c b/fs/super.c
index 734ed584a946..12c08cb20405 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -291,7 +291,7 @@ static void __put_super(struct super_block *s)
WARN_ON(s->s_inode_lru.node);
WARN_ON(!list_empty(&s->s_mounts));
security_sb_free(s);
- fscrypt_sb_free(s);
+ fscrypt_destroy_keyring(s);
put_user_ns(s->s_user_ns);
kfree(s->s_subtype);
call_rcu(&s->rcu, destroy_super_rcu);
@@ -480,6 +480,7 @@ void generic_shutdown_super(struct super_block *sb)
evict_inodes(sb);
/* only nonzero refcount inodes can have marks */
fsnotify_sb_delete(sb);
+ fscrypt_destroy_keyring(sb);
security_sb_delete(sb);
if (sb->s_dio_done_wq) {
@@ -1111,55 +1112,14 @@ static int test_single_super(struct super_block *s, struct fs_context *fc)
return 1;
}
-/**
- * vfs_get_super - Get a superblock with a search key set in s_fs_info.
- * @fc: The filesystem context holding the parameters
- * @keying: How to distinguish superblocks
- * @fill_super: Helper to initialise a new superblock
- *
- * Search for a superblock and create a new one if not found. The search
- * criterion is controlled by @keying. If the search fails, a new superblock
- * is created and @fill_super() is called to initialise it.
- *
- * @keying can take one of a number of values:
- *
- * (1) vfs_get_single_super - Only one superblock of this type may exist on the
- * system. This is typically used for special system filesystems.
- *
- * (2) vfs_get_keyed_super - Multiple superblocks may exist, but they must have
- * distinct keys (where the key is in s_fs_info). Searching for the same
- * key again will turn up the superblock for that key.
- *
- * (3) vfs_get_independent_super - Multiple superblocks may exist and are
- * unkeyed. Each call will get a new superblock.
- *
- * A permissions check is made by sget_fc() unless we're getting a superblock
- * for a kernel-internal mount or a submount.
- */
-int vfs_get_super(struct fs_context *fc,
- enum vfs_get_super_keying keying,
- int (*fill_super)(struct super_block *sb,
- struct fs_context *fc))
+static int vfs_get_super(struct fs_context *fc, bool reconf,
+ int (*test)(struct super_block *, struct fs_context *),
+ int (*fill_super)(struct super_block *sb,
+ struct fs_context *fc))
{
- int (*test)(struct super_block *, struct fs_context *);
struct super_block *sb;
int err;
- switch (keying) {
- case vfs_get_single_super:
- case vfs_get_single_reconf_super:
- test = test_single_super;
- break;
- case vfs_get_keyed_super:
- test = test_keyed_super;
- break;
- case vfs_get_independent_super:
- test = NULL;
- break;
- default:
- BUG();
- }
-
sb = sget_fc(fc, test, set_anon_super_fc);
if (IS_ERR(sb))
return PTR_ERR(sb);
@@ -1173,7 +1133,7 @@ int vfs_get_super(struct fs_context *fc,
fc->root = dget(sb->s_root);
} else {
fc->root = dget(sb->s_root);
- if (keying == vfs_get_single_reconf_super) {
+ if (reconf) {
err = reconfigure_super(fc);
if (err < 0) {
dput(fc->root);
@@ -1189,13 +1149,12 @@ error:
deactivate_locked_super(sb);
return err;
}
-EXPORT_SYMBOL(vfs_get_super);
int get_tree_nodev(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc))
{
- return vfs_get_super(fc, vfs_get_independent_super, fill_super);
+ return vfs_get_super(fc, false, NULL, fill_super);
}
EXPORT_SYMBOL(get_tree_nodev);
@@ -1203,7 +1162,7 @@ int get_tree_single(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc))
{
- return vfs_get_super(fc, vfs_get_single_super, fill_super);
+ return vfs_get_super(fc, false, test_single_super, fill_super);
}
EXPORT_SYMBOL(get_tree_single);
@@ -1211,7 +1170,7 @@ int get_tree_single_reconf(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc))
{
- return vfs_get_super(fc, vfs_get_single_reconf_super, fill_super);
+ return vfs_get_super(fc, true, test_single_super, fill_super);
}
EXPORT_SYMBOL(get_tree_single_reconf);
@@ -1221,7 +1180,7 @@ int get_tree_keyed(struct fs_context *fc,
void *key)
{
fc->s_fs_info = key;
- return vfs_get_super(fc, vfs_get_keyed_super, fill_super);
+ return vfs_get_super(fc, false, test_keyed_super, fill_super);
}
EXPORT_SYMBOL(get_tree_keyed);
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index d4ec9bb97de9..3b8567564e7e 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -438,7 +438,7 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size)
res += blocks;
direct = 1;
}
- return blocks;
+ return res;
}
int sysv_getattr(struct user_namespace *mnt_userns, const struct path *path,
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index c57b46a352d8..3125e76376ee 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -24,6 +24,17 @@ static bool ubifs_crypt_empty_dir(struct inode *inode)
return ubifs_check_dir_empty(inode) == 0;
}
+/**
+ * ubifs_encrypt - Encrypt data.
+ * @inode: inode which refers to the data node
+ * @dn: data node to encrypt
+ * @in_len: length of data to be compressed
+ * @out_len: allocated memory size for the data area of @dn
+ * @block: logical block number of the block
+ *
+ * This function encrypt a possibly-compressed data in the data node.
+ * The encrypted data length will store in @out_len.
+ */
int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn,
unsigned int in_len, unsigned int *out_len, int block)
{
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index fc718f6178f2..9c9d3f0e36a4 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2467,7 +2467,7 @@ error_dump:
static inline int chance(unsigned int n, unsigned int out_of)
{
- return !!((prandom_u32() % out_of) + 1 <= n);
+ return !!(get_random_u32_below(out_of) + 1 <= n);
}
@@ -2485,13 +2485,13 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write)
if (chance(1, 2)) {
d->pc_delay = 1;
/* Fail within 1 minute */
- delay = prandom_u32() % 60000;
+ delay = get_random_u32_below(60000);
d->pc_timeout = jiffies;
d->pc_timeout += msecs_to_jiffies(delay);
ubifs_warn(c, "failing after %lums", delay);
} else {
d->pc_delay = 2;
- delay = prandom_u32() % 10000;
+ delay = get_random_u32_below(10000);
/* Fail within 10000 operations */
d->pc_cnt_max = delay;
ubifs_warn(c, "failing after %lu calls", delay);
@@ -2571,7 +2571,7 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf,
unsigned int from, to, ffs = chance(1, 2);
unsigned char *p = (void *)buf;
- from = prandom_u32() % len;
+ from = get_random_u32_below(len);
/* Corruption span max to end of write unit */
to = min(len, ALIGN(from + 1, c->max_write_size));
@@ -2581,7 +2581,7 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf,
if (ffs)
memset(p + from, 0xFF, to - from);
else
- prandom_bytes(p + from, to - from);
+ get_random_bytes(p + from, to - from);
return to;
}
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 86151889548e..0f29cf201136 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -68,13 +68,14 @@ static int inherit_flags(const struct inode *dir, umode_t mode)
* @c: UBIFS file-system description object
* @dir: parent directory inode
* @mode: inode mode flags
+ * @is_xattr: whether the inode is xattr inode
*
* This function finds an unused inode number, allocates new inode and
* initializes it. Returns new inode in case of success and an error code in
* case of failure.
*/
struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
- umode_t mode)
+ umode_t mode, bool is_xattr)
{
int err;
struct inode *inode;
@@ -99,10 +100,12 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
current_time(inode);
inode->i_mapping->nrpages = 0;
- err = fscrypt_prepare_new_inode(dir, inode, &encrypted);
- if (err) {
- ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err);
- goto out_iput;
+ if (!is_xattr) {
+ err = fscrypt_prepare_new_inode(dir, inode, &encrypted);
+ if (err) {
+ ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err);
+ goto out_iput;
+ }
}
switch (mode & S_IFMT) {
@@ -309,7 +312,7 @@ static int ubifs_create(struct user_namespace *mnt_userns, struct inode *dir,
sz_change = CALC_DENT_SIZE(fname_len(&nm));
- inode = ubifs_new_inode(c, dir, mode);
+ inode = ubifs_new_inode(c, dir, mode, false);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto out_fname;
@@ -370,7 +373,7 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry)
if (err)
return ERR_PTR(err);
- inode = ubifs_new_inode(c, dir, mode);
+ inode = ubifs_new_inode(c, dir, mode, false);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto out_free;
@@ -424,8 +427,9 @@ static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
}
static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+ struct file *file, umode_t mode)
{
+ struct dentry *dentry = file->f_path.dentry;
struct inode *inode;
struct ubifs_info *c = dir->i_sb->s_fs_info;
struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
@@ -462,7 +466,7 @@ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
return err;
}
- inode = ubifs_new_inode(c, dir, mode);
+ inode = ubifs_new_inode(c, dir, mode, false);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto out_budg;
@@ -475,7 +479,7 @@ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
mutex_lock(&ui->ui_mutex);
insert_inode_hash(inode);
- d_tmpfile(dentry, inode);
+ d_tmpfile(file, inode);
ubifs_assert(c, ui->dirty);
instantiated = 1;
@@ -489,7 +493,7 @@ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
ubifs_release_budget(c, &req);
- return 0;
+ return finish_open_simple(file, 0);
out_cancel:
unlock_2_inodes(dir, inode);
@@ -872,7 +876,7 @@ out_fname:
}
/**
- * check_dir_empty - check if a directory is empty or not.
+ * ubifs_check_dir_empty - check if a directory is empty or not.
* @dir: VFS inode object of the directory to check
*
* This function checks if directory @dir is empty. Returns zero if the
@@ -1004,7 +1008,7 @@ static int ubifs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
sz_change = CALC_DENT_SIZE(fname_len(&nm));
- inode = ubifs_new_inode(c, dir, S_IFDIR | mode);
+ inode = ubifs_new_inode(c, dir, S_IFDIR | mode, false);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto out_fname;
@@ -1091,7 +1095,7 @@ static int ubifs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
sz_change = CALC_DENT_SIZE(fname_len(&nm));
- inode = ubifs_new_inode(c, dir, mode);
+ inode = ubifs_new_inode(c, dir, mode, false);
if (IS_ERR(inode)) {
kfree(dev);
err = PTR_ERR(inode);
@@ -1173,7 +1177,7 @@ static int ubifs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
sz_change = CALC_DENT_SIZE(fname_len(&nm));
- inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO);
+ inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO, false);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto out_fname;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 75dab0ae3939..d02509920baf 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -503,7 +503,7 @@ static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui)
static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent)
{
if (c->double_hash)
- dent->cookie = (__force __le32) prandom_u32();
+ dent->cookie = (__force __le32) get_random_u32();
else
dent->cookie = 0;
}
@@ -1472,23 +1472,25 @@ out_free:
* @block: data block number
* @dn: data node to re-compress
* @new_len: new length
+ * @dn_size: size of the data node @dn in memory
*
* This function is used when an inode is truncated and the last data node of
* the inode has to be re-compressed/encrypted and re-written.
*/
static int truncate_data_node(const struct ubifs_info *c, const struct inode *inode,
unsigned int block, struct ubifs_data_node *dn,
- int *new_len)
+ int *new_len, int dn_size)
{
void *buf;
- int err, dlen, compr_type, out_len, old_dlen;
+ int err, dlen, compr_type, out_len, data_size;
out_len = le32_to_cpu(dn->size);
buf = kmalloc_array(out_len, WORST_COMPR_FACTOR, GFP_NOFS);
if (!buf)
return -ENOMEM;
- dlen = old_dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
+ dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
+ data_size = dn_size - UBIFS_DATA_NODE_SZ;
compr_type = le16_to_cpu(dn->compr_type);
if (IS_ENCRYPTED(inode)) {
@@ -1508,11 +1510,11 @@ static int truncate_data_node(const struct ubifs_info *c, const struct inode *in
}
if (IS_ENCRYPTED(inode)) {
- err = ubifs_encrypt(inode, dn, out_len, &old_dlen, block);
+ err = ubifs_encrypt(inode, dn, out_len, &data_size, block);
if (err)
goto out;
- out_len = old_dlen;
+ out_len = data_size;
} else {
dn->compr_size = 0;
}
@@ -1550,6 +1552,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
struct ubifs_trun_node *trun;
struct ubifs_data_node *dn;
int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode);
+ int dn_size;
struct ubifs_inode *ui = ubifs_inode(inode);
ino_t inum = inode->i_ino;
unsigned int blk;
@@ -1562,10 +1565,13 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
ubifs_assert(c, S_ISREG(inode->i_mode));
ubifs_assert(c, mutex_is_locked(&ui->ui_mutex));
- sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ +
- UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR;
+ dn_size = COMPRESSED_DATA_NODE_BUF_SZ;
- sz += ubifs_auth_node_sz(c);
+ if (IS_ENCRYPTED(inode))
+ dn_size += UBIFS_CIPHER_BLOCK_SIZE;
+
+ sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ +
+ dn_size + ubifs_auth_node_sz(c);
ino = kmalloc(sz, GFP_NOFS);
if (!ino)
@@ -1596,15 +1602,15 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
if (dn_len <= 0 || dn_len > UBIFS_BLOCK_SIZE) {
ubifs_err(c, "bad data node (block %u, inode %lu)",
blk, inode->i_ino);
- ubifs_dump_node(c, dn, sz - UBIFS_INO_NODE_SZ -
- UBIFS_TRUN_NODE_SZ);
+ ubifs_dump_node(c, dn, dn_size);
goto out_free;
}
if (dn_len <= dlen)
dlen = 0; /* Nothing to do */
else {
- err = truncate_data_node(c, inode, blk, dn, &dlen);
+ err = truncate_data_node(c, inode, blk, dn,
+ &dlen, dn_size);
if (err)
goto out_free;
}
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index d76a19e460cd..c4d079328b92 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -1970,28 +1970,28 @@ static int dbg_populate_lsave(struct ubifs_info *c)
if (!dbg_is_chk_gen(c))
return 0;
- if (prandom_u32() & 3)
+ if (get_random_u32_below(4))
return 0;
for (i = 0; i < c->lsave_cnt; i++)
c->lsave[i] = c->main_first;
list_for_each_entry(lprops, &c->empty_list, list)
- c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum;
+ c->lsave[get_random_u32_below(c->lsave_cnt)] = lprops->lnum;
list_for_each_entry(lprops, &c->freeable_list, list)
- c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum;
+ c->lsave[get_random_u32_below(c->lsave_cnt)] = lprops->lnum;
list_for_each_entry(lprops, &c->frdi_idx_list, list)
- c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum;
+ c->lsave[get_random_u32_below(c->lsave_cnt)] = lprops->lnum;
heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
for (i = 0; i < heap->cnt; i++)
- c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum;
+ c->lsave[get_random_u32_below(c->lsave_cnt)] = heap->arr[i]->lnum;
heap = &c->lpt_heap[LPROPS_DIRTY - 1];
for (i = 0; i < heap->cnt; i++)
- c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum;
+ c->lsave[get_random_u32_below(c->lsave_cnt)] = heap->arr[i]->lnum;
heap = &c->lpt_heap[LPROPS_FREE - 1];
for (i = 0; i < heap->cnt; i++)
- c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum;
+ c->lsave[get_random_u32_below(c->lsave_cnt)] = heap->arr[i]->lnum;
return 1;
}
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 58c92c96ecef..a55e04822d16 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -700,7 +700,7 @@ static int alloc_idx_lebs(struct ubifs_info *c, int cnt)
c->ilebs[c->ileb_cnt++] = lnum;
dbg_cmt("LEB %d", lnum);
}
- if (dbg_is_chk_index(c) && !(prandom_u32() & 7))
+ if (dbg_is_chk_index(c) && !get_random_u32_below(8))
return -ENOSPC;
return 0;
}
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 7d6d2f152e03..478bbbb5382f 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -2026,7 +2026,7 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags);
/* dir.c */
struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
- umode_t mode);
+ umode_t mode, bool is_xattr);
int ubifs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags);
int ubifs_check_dir_empty(struct inode *dir);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index e4c4761aff7f..3db8486e3725 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -110,7 +110,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
if (err)
return err;
- inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO);
+ inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO, true);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto out_budg;
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index cad3772f9dbe..be640f4b2f2c 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -130,7 +130,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
brelse(tmp);
}
if (num) {
- ll_rw_block(REQ_OP_READ | REQ_RAHEAD, num, bha);
+ bh_readahead_batch(num, bha, REQ_RAHEAD);
for (i = 0; i < num; i++)
brelse(bha[i]);
}
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index a2adf6293093..16bcf2c6b8b3 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -89,7 +89,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
brelse(tmp);
}
if (num) {
- ll_rw_block(REQ_OP_READ | REQ_RAHEAD, num, bha);
+ bh_readahead_batch(num, bha, REQ_RAHEAD);
for (i = 0; i < num; i++)
brelse(bha[i]);
}
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 09aef77269fe..5c659e23e578 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -252,6 +252,7 @@ const struct file_operations udf_file_operations = {
.release = udf_release_file,
.fsync = generic_file_fsync,
.splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
.llseek = generic_file_llseek,
};
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 8d06daed549f..1d7c2a812fc1 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -182,11 +182,6 @@ static void udf_write_failed(struct address_space *mapping, loff_t to)
}
}
-static int udf_writepage(struct page *page, struct writeback_control *wbc)
-{
- return block_write_full_page(page, udf_get_block, wbc);
-}
-
static int udf_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -239,12 +234,12 @@ const struct address_space_operations udf_aops = {
.invalidate_folio = block_invalidate_folio,
.read_folio = udf_read_folio,
.readahead = udf_readahead,
- .writepage = udf_writepage,
.writepages = udf_writepages,
.write_begin = udf_write_begin,
.write_end = generic_write_end,
.direct_IO = udf_direct_IO,
.bmap = udf_bmap,
+ .migrate_folio = buffer_migrate_folio,
};
/*
@@ -439,6 +434,12 @@ static int udf_get_block(struct inode *inode, sector_t block,
iinfo->i_next_alloc_goal++;
}
+ /*
+ * Block beyond EOF and prealloc extents? Just discard preallocation
+ * as it is not useful and complicates things.
+ */
+ if (((loff_t)block) << inode->i_blkbits > iinfo->i_lenExtents)
+ udf_discard_prealloc(inode);
udf_clear_extent_cache(inode);
phys = inode_getblk(inode, block, &err, &new);
if (!phys)
@@ -488,8 +489,6 @@ static int udf_do_extend_file(struct inode *inode,
uint32_t add;
int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
struct super_block *sb = inode->i_sb;
- struct kernel_lb_addr prealloc_loc = {};
- uint32_t prealloc_len = 0;
struct udf_inode_info *iinfo;
int err;
@@ -510,19 +509,6 @@ static int udf_do_extend_file(struct inode *inode,
~(sb->s_blocksize - 1);
}
- /* Last extent are just preallocated blocks? */
- if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) ==
- EXT_NOT_RECORDED_ALLOCATED) {
- /* Save the extent so that we can reattach it to the end */
- prealloc_loc = last_ext->extLocation;
- prealloc_len = last_ext->extLength;
- /* Mark the extent as a hole */
- last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
- (last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
- last_ext->extLocation.logicalBlockNum = 0;
- last_ext->extLocation.partitionReferenceNum = 0;
- }
-
/* Can we merge with the previous extent? */
if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) ==
EXT_NOT_RECORDED_NOT_ALLOCATED) {
@@ -550,7 +536,7 @@ static int udf_do_extend_file(struct inode *inode,
* more extents, we may need to enter possible following
* empty indirect extent.
*/
- if (new_block_bytes || prealloc_len)
+ if (new_block_bytes)
udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0);
}
@@ -584,17 +570,6 @@ static int udf_do_extend_file(struct inode *inode,
}
out:
- /* Do we have some preallocated blocks saved? */
- if (prealloc_len) {
- err = udf_add_aext(inode, last_pos, &prealloc_loc,
- prealloc_len, 1);
- if (err)
- return err;
- last_ext->extLocation = prealloc_loc;
- last_ext->extLength = prealloc_len;
- count++;
- }
-
/* last_pos should point to the last written extent... */
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
last_pos->offset -= sizeof(struct short_ad);
@@ -610,13 +585,17 @@ out:
static void udf_do_extend_final_block(struct inode *inode,
struct extent_position *last_pos,
struct kernel_long_ad *last_ext,
- uint32_t final_block_len)
+ uint32_t new_elen)
{
- struct super_block *sb = inode->i_sb;
uint32_t added_bytes;
- added_bytes = final_block_len -
- (last_ext->extLength & (sb->s_blocksize - 1));
+ /*
+ * Extent already large enough? It may be already rounded up to block
+ * size...
+ */
+ if (new_elen <= (last_ext->extLength & UDF_EXTENT_LENGTH_MASK))
+ return;
+ added_bytes = (last_ext->extLength & UDF_EXTENT_LENGTH_MASK) - new_elen;
last_ext->extLength += added_bytes;
UDF_I(inode)->i_lenExtents += added_bytes;
@@ -633,12 +612,12 @@ static int udf_extend_file(struct inode *inode, loff_t newsize)
int8_t etype;
struct super_block *sb = inode->i_sb;
sector_t first_block = newsize >> sb->s_blocksize_bits, offset;
- unsigned long partial_final_block;
+ loff_t new_elen;
int adsize;
struct udf_inode_info *iinfo = UDF_I(inode);
struct kernel_long_ad extent;
int err = 0;
- int within_final_block;
+ bool within_last_ext;
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
adsize = sizeof(struct short_ad);
@@ -647,8 +626,17 @@ static int udf_extend_file(struct inode *inode, loff_t newsize)
else
BUG();
+ /*
+ * When creating hole in file, just don't bother with preserving
+ * preallocation. It likely won't be very useful anyway.
+ */
+ udf_discard_prealloc(inode);
+
etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset);
- within_final_block = (etype != -1);
+ within_last_ext = (etype != -1);
+ /* We don't expect extents past EOF... */
+ WARN_ON_ONCE(within_last_ext &&
+ elen > ((loff_t)offset + 1) << inode->i_blkbits);
if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) ||
(epos.bh && epos.offset == sizeof(struct allocExtDesc))) {
@@ -664,19 +652,17 @@ static int udf_extend_file(struct inode *inode, loff_t newsize)
extent.extLength |= etype << 30;
}
- partial_final_block = newsize & (sb->s_blocksize - 1);
+ new_elen = ((loff_t)offset << inode->i_blkbits) |
+ (newsize & (sb->s_blocksize - 1));
/* File has extent covering the new size (could happen when extending
* inside a block)?
*/
- if (within_final_block) {
+ if (within_last_ext) {
/* Extending file within the last file block */
- udf_do_extend_final_block(inode, &epos, &extent,
- partial_final_block);
+ udf_do_extend_final_block(inode, &epos, &extent, new_elen);
} else {
- loff_t add = ((loff_t)offset << sb->s_blocksize_bits) |
- partial_final_block;
- err = udf_do_extend_file(inode, &epos, &extent, add);
+ err = udf_do_extend_file(inode, &epos, &extent, new_elen);
}
if (err < 0)
@@ -777,10 +763,11 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
goto out_free;
}
- /* Are we beyond EOF? */
+ /* Are we beyond EOF and preallocated extent? */
if (etype == -1) {
int ret;
loff_t hole_len;
+
isBeyondEOF = true;
if (count) {
if (c)
@@ -1211,13 +1198,7 @@ struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block,
if (!bh)
return NULL;
- if (buffer_uptodate(bh))
- return bh;
-
- ll_rw_block(REQ_OP_READ, 1, &bh);
-
- wait_on_buffer(bh);
- if (buffer_uptodate(bh))
+ if (bh_read(bh, 0) >= 0)
return bh;
brelse(bh);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index b3d5f97f16cd..7c95c549dd64 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -240,7 +240,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
poffset - lfi);
else {
if (!copy_name) {
- copy_name = kmalloc(UDF_NAME_LEN,
+ copy_name = kmalloc(UDF_NAME_LEN_CS0,
GFP_NOFS);
if (!copy_name) {
fi = ERR_PTR(-ENOMEM);
@@ -626,7 +626,7 @@ static int udf_create(struct user_namespace *mnt_userns, struct inode *dir,
}
static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+ struct file *file, umode_t mode)
{
struct inode *inode = udf_new_inode(dir, mode);
@@ -640,9 +640,9 @@ static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
inode->i_op = &udf_file_inode_operations;
inode->i_fop = &udf_file_operations;
mark_inode_dirty(inode);
- d_tmpfile(dentry, inode);
+ d_tmpfile(file, inode);
unlock_new_inode(inode);
- return 0;
+ return finish_open_simple(file, 0);
}
static int udf_mknod(struct user_namespace *mnt_userns, struct inode *dir,
@@ -1091,8 +1091,9 @@ static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
return -EINVAL;
ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
- if (IS_ERR(ofi)) {
- retval = PTR_ERR(ofi);
+ if (!ofi || IS_ERR(ofi)) {
+ if (IS_ERR(ofi))
+ retval = PTR_ERR(ofi);
goto end_rename;
}
@@ -1101,8 +1102,7 @@ static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
brelse(ofibh.sbh);
tloc = lelb_to_cpu(ocfi.icb.extLocation);
- if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0)
- != old_inode->i_ino)
+ if (udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) != old_inode->i_ino)
goto end_rename;
nfi = udf_find_entry(new_dir, &new_dentry->d_name, &nfibh, &ncfi);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 4042d9739fb7..06eda8177b5f 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -162,7 +162,7 @@ static void udf_free_in_core_inode(struct inode *inode)
static void init_once(void *foo)
{
- struct udf_inode_info *ei = (struct udf_inode_info *)foo;
+ struct udf_inode_info *ei = foo;
ei->i_data = NULL;
inode_init_once(&ei->vfs_inode);
@@ -820,7 +820,7 @@ static int udf_find_fileset(struct super_block *sb,
struct kernel_lb_addr *fileset,
struct kernel_lb_addr *root)
{
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh;
uint16_t ident;
int ret;
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 532cda99644e..036ebd892b85 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -120,60 +120,42 @@ void udf_truncate_tail_extent(struct inode *inode)
void udf_discard_prealloc(struct inode *inode)
{
- struct extent_position epos = { NULL, 0, {0, 0} };
+ struct extent_position epos = {};
+ struct extent_position prev_epos = {};
struct kernel_lb_addr eloc;
uint32_t elen;
uint64_t lbcount = 0;
int8_t etype = -1, netype;
- int adsize;
struct udf_inode_info *iinfo = UDF_I(inode);
+ int bsize = 1 << inode->i_blkbits;
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB ||
- inode->i_size == iinfo->i_lenExtents)
+ ALIGN(inode->i_size, bsize) == ALIGN(iinfo->i_lenExtents, bsize))
return;
- if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
- adsize = sizeof(struct short_ad);
- else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
- adsize = sizeof(struct long_ad);
- else
- adsize = 0;
-
epos.block = iinfo->i_location;
/* Find the last extent in the file */
- while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) {
- etype = netype;
+ while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 0)) != -1) {
+ brelse(prev_epos.bh);
+ prev_epos = epos;
+ if (prev_epos.bh)
+ get_bh(prev_epos.bh);
+
+ etype = udf_next_aext(inode, &epos, &eloc, &elen, 1);
lbcount += elen;
}
if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
- epos.offset -= adsize;
lbcount -= elen;
- extent_trunc(inode, &epos, &eloc, etype, elen, 0);
- if (!epos.bh) {
- iinfo->i_lenAlloc =
- epos.offset -
- udf_file_entry_alloc_offset(inode);
- mark_inode_dirty(inode);
- } else {
- struct allocExtDesc *aed =
- (struct allocExtDesc *)(epos.bh->b_data);
- aed->lengthAllocDescs =
- cpu_to_le32(epos.offset -
- sizeof(struct allocExtDesc));
- if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
- UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
- udf_update_tag(epos.bh->b_data, epos.offset);
- else
- udf_update_tag(epos.bh->b_data,
- sizeof(struct allocExtDesc));
- mark_buffer_dirty_inode(epos.bh, inode);
- }
+ udf_delete_aext(inode, prev_epos);
+ udf_free_blocks(inode->i_sb, inode, &eloc, 0,
+ DIV_ROUND_UP(elen, 1 << inode->i_blkbits));
}
/* This inode entry is in-memory only and thus we don't have to mark
* the inode dirty */
iinfo->i_lenExtents = lbcount;
brelse(epos.bh);
+ brelse(prev_epos.bh);
}
static void udf_update_alloc_ext_desc(struct inode *inode,
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 4fa620543d30..291b56dd011e 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -6,7 +6,11 @@
#include <linux/bitops.h>
#include <linux/magic.h>
-#define UDF_MAX_READ_VERSION 0x0250
+/*
+ * Even UDF 2.6 media should have version <= 0x250 but apparently there are
+ * some broken filesystems with version set to 0x260. Accommodate those.
+ */
+#define UDF_MAX_READ_VERSION 0x0260
#define UDF_MAX_WRITE_VERSION 0x0201
#define UDF_FLAG_USE_EXTENDED_FE 0
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index bd810d8239f2..2436e3f82147 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -295,14 +295,10 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
if (!buffer_mapped(bh))
map_bh(bh, inode->i_sb, oldb + pos);
- if (!buffer_uptodate(bh)) {
- ll_rw_block(REQ_OP_READ, 1, &bh);
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
- ufs_error(inode->i_sb, __func__,
- "read of block failed\n");
- break;
- }
+ if (bh_read(bh, 0) < 0) {
+ ufs_error(inode->i_sb, __func__,
+ "read of block failed\n");
+ break;
}
UFSD(" change from %llu to %llu, pos %u\n",
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 175de70e3adf..98ac37e34e3d 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -30,6 +30,7 @@
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/swapops.h>
+#include <linux/miscdevice.h>
int sysctl_unprivileged_userfaultfd __read_mostly;
@@ -415,13 +416,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
if (ctx->features & UFFD_FEATURE_SIGBUS)
goto out;
- if ((vmf->flags & FAULT_FLAG_USER) == 0 &&
- ctx->flags & UFFD_USER_MODE_ONLY) {
- printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
- "sysctl knob to 1 if kernel faults must be handled "
- "without obtaining CAP_SYS_PTRACE capability\n");
+ if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
goto out;
- }
/*
* If it's already released don't get it. This avoids to loop
@@ -615,14 +611,16 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
if (release_new_ctx) {
struct vm_area_struct *vma;
struct mm_struct *mm = release_new_ctx->mm;
+ VMA_ITERATOR(vmi, mm, 0);
/* the various vma->vm_userfaultfd_ctx still points to it */
mmap_write_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next)
+ for_each_vma(vmi, vma) {
if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
vma->vm_flags &= ~__VM_UFFD_FLAGS;
}
+ }
mmap_write_unlock(mm);
userfaultfd_ctx_put(release_new_ctx);
@@ -803,11 +801,13 @@ static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
return false;
}
-int userfaultfd_unmap_prep(struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct list_head *unmaps)
+int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct list_head *unmaps)
{
- for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
+ VMA_ITERATOR(vmi, mm, start);
+ struct vm_area_struct *vma;
+
+ for_each_vma_range(vmi, vma, end) {
struct userfaultfd_unmap_ctx *unmap_ctx;
struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
@@ -857,6 +857,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
/* len == 0 means wake all */
struct userfaultfd_wake_range range = { .len = 0, };
unsigned long new_flags;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
WRITE_ONCE(ctx->released, true);
@@ -873,7 +874,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
*/
mmap_write_lock(mm);
prev = NULL;
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_for_each(&mas, vma, ULONG_MAX) {
cond_resched();
BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
!!(vma->vm_flags & __VM_UFFD_FLAGS));
@@ -887,10 +888,13 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
NULL_VM_UFFD_CTX, anon_vma_name(vma));
- if (prev)
+ if (prev) {
+ mas_pause(&mas);
vma = prev;
- else
+ } else {
prev = vma;
+ }
+
vma->vm_flags = new_flags;
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
}
@@ -991,7 +995,7 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *new,
int fd;
fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new,
- O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
+ O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
if (fd < 0)
return fd;
@@ -1272,6 +1276,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
bool found;
bool basic_ioctls;
unsigned long start, end, vma_end;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1314,7 +1319,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
goto out;
mmap_write_lock(mm);
- vma = find_vma_prev(mm, start, &prev);
+ mas_set(&mas, start);
+ vma = mas_find(&mas, ULONG_MAX);
if (!vma)
goto out_unlock;
@@ -1339,7 +1345,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
*/
found = false;
basic_ioctls = false;
- for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
+ for (cur = vma; cur; cur = mas_next(&mas, end - 1)) {
cond_resched();
BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
@@ -1399,8 +1405,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
}
BUG_ON(!found);
- if (vma->vm_start < start)
- prev = vma;
+ mas_set(&mas, start);
+ prev = mas_prev(&mas, 0);
+ if (prev != vma)
+ mas_next(&mas, ULONG_MAX);
ret = 0;
do {
@@ -1430,6 +1438,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
((struct vm_userfaultfd_ctx){ ctx }),
anon_vma_name(vma));
if (prev) {
+ /* vma_merge() invalidated the mas */
+ mas_pause(&mas);
vma = prev;
goto next;
}
@@ -1437,11 +1447,15 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
ret = split_vma(mm, vma, start, 1);
if (ret)
break;
+ /* split_vma() invalidated the mas */
+ mas_pause(&mas);
}
if (vma->vm_end > end) {
ret = split_vma(mm, vma, end, 0);
if (ret)
break;
+ /* split_vma() invalidated the mas */
+ mas_pause(&mas);
}
next:
/*
@@ -1458,8 +1472,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
skip:
prev = vma;
start = vma->vm_end;
- vma = vma->vm_next;
- } while (vma && vma->vm_start < end);
+ vma = mas_next(&mas, end - 1);
+ } while (vma);
out_unlock:
mmap_write_unlock(mm);
mmput(mm);
@@ -1503,6 +1517,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
bool found;
unsigned long start, end, vma_end;
const void __user *buf = (void __user *)arg;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
ret = -EFAULT;
if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
@@ -1521,7 +1536,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
goto out;
mmap_write_lock(mm);
- vma = find_vma_prev(mm, start, &prev);
+ mas_set(&mas, start);
+ vma = mas_find(&mas, ULONG_MAX);
if (!vma)
goto out_unlock;
@@ -1546,7 +1562,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
*/
found = false;
ret = -EINVAL;
- for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
+ for (cur = vma; cur; cur = mas_next(&mas, end - 1)) {
cond_resched();
BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
@@ -1566,8 +1582,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
}
BUG_ON(!found);
- if (vma->vm_start < start)
- prev = vma;
+ mas_set(&mas, start);
+ prev = mas_prev(&mas, 0);
+ if (prev != vma)
+ mas_next(&mas, ULONG_MAX);
ret = 0;
do {
@@ -1612,17 +1630,20 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
NULL_VM_UFFD_CTX, anon_vma_name(vma));
if (prev) {
vma = prev;
+ mas_pause(&mas);
goto next;
}
if (vma->vm_start < start) {
ret = split_vma(mm, vma, start, 1);
if (ret)
break;
+ mas_pause(&mas);
}
if (vma->vm_end > end) {
ret = split_vma(mm, vma, end, 0);
if (ret)
break;
+ mas_pause(&mas);
}
next:
/*
@@ -1636,8 +1657,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
skip:
prev = vma;
start = vma->vm_end;
- vma = vma->vm_next;
- } while (vma && vma->vm_start < end);
+ vma = mas_next(&mas, end - 1);
+ } while (vma);
out_unlock:
mmap_write_unlock(mm);
mmput(mm);
@@ -2056,20 +2077,11 @@ static void init_once_userfaultfd_ctx(void *mem)
seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
}
-SYSCALL_DEFINE1(userfaultfd, int, flags)
+static int new_userfaultfd(int flags)
{
struct userfaultfd_ctx *ctx;
int fd;
- if (!sysctl_unprivileged_userfaultfd &&
- (flags & UFFD_USER_MODE_ONLY) == 0 &&
- !capable(CAP_SYS_PTRACE)) {
- printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
- "sysctl knob to 1 if kernel faults must be handled "
- "without obtaining CAP_SYS_PTRACE capability\n");
- return -EPERM;
- }
-
BUG_ON(!current->mm);
/* Check the UFFD_* constants for consistency. */
@@ -2094,7 +2106,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
mmgrab(ctx->mm);
fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx,
- O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
+ O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
if (fd < 0) {
mmdrop(ctx->mm);
kmem_cache_free(userfaultfd_ctx_cachep, ctx);
@@ -2102,8 +2114,60 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
return fd;
}
+static inline bool userfaultfd_syscall_allowed(int flags)
+{
+ /* Userspace-only page faults are always allowed */
+ if (flags & UFFD_USER_MODE_ONLY)
+ return true;
+
+ /*
+ * The user is requesting a userfaultfd which can handle kernel faults.
+ * Privileged users are always allowed to do this.
+ */
+ if (capable(CAP_SYS_PTRACE))
+ return true;
+
+ /* Otherwise, access to kernel fault handling is sysctl controlled. */
+ return sysctl_unprivileged_userfaultfd;
+}
+
+SYSCALL_DEFINE1(userfaultfd, int, flags)
+{
+ if (!userfaultfd_syscall_allowed(flags))
+ return -EPERM;
+
+ return new_userfaultfd(flags);
+}
+
+static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
+{
+ if (cmd != USERFAULTFD_IOC_NEW)
+ return -EINVAL;
+
+ return new_userfaultfd(flags);
+}
+
+static const struct file_operations userfaultfd_dev_fops = {
+ .unlocked_ioctl = userfaultfd_dev_ioctl,
+ .compat_ioctl = userfaultfd_dev_ioctl,
+ .owner = THIS_MODULE,
+ .llseek = noop_llseek,
+};
+
+static struct miscdevice userfaultfd_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "userfaultfd",
+ .fops = &userfaultfd_dev_fops
+};
+
static int __init userfaultfd_init(void)
{
+ int ret;
+
+ ret = misc_register(&userfaultfd_misc);
+ if (ret)
+ return ret;
+
userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
sizeof(struct userfaultfd_ctx),
0,
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index 629785c95007..c7fcb855e068 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -32,6 +32,11 @@ struct fsverity_hash_alg {
unsigned int digest_size; /* digest size in bytes, e.g. 32 for SHA-256 */
unsigned int block_size; /* block size in bytes, e.g. 64 for SHA-256 */
mempool_t req_pool; /* mempool with a preallocated hash request */
+ /*
+ * The HASH_ALGO_* constant for this algorithm. This is different from
+ * FS_VERITY_HASH_ALG_*, which uses a different numbering scheme.
+ */
+ enum hash_algo algo_id;
};
/* Merkle tree parameters: hash algorithm, initial hash state, and topology */
@@ -70,8 +75,6 @@ struct fsverity_info {
const struct inode *inode;
};
-/* Arbitrary limit to bound the kmalloc() size. Can be changed. */
-#define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384
#define FS_VERITY_MAX_SIGNATURE_SIZE (FS_VERITY_MAX_DESCRIPTOR_SIZE - \
sizeof(struct fsverity_descriptor))
diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c
index 71d0fccb6d4c..6f8170cf4ae7 100644
--- a/fs/verity/hash_algs.c
+++ b/fs/verity/hash_algs.c
@@ -16,11 +16,13 @@ struct fsverity_hash_alg fsverity_hash_algs[] = {
.name = "sha256",
.digest_size = SHA256_DIGEST_SIZE,
.block_size = SHA256_BLOCK_SIZE,
+ .algo_id = HASH_ALGO_SHA256,
},
[FS_VERITY_HASH_ALG_SHA512] = {
.name = "sha512",
.digest_size = SHA512_DIGEST_SIZE,
.block_size = SHA512_BLOCK_SIZE,
+ .algo_id = HASH_ALGO_SHA512,
},
};
@@ -324,5 +326,9 @@ void __init fsverity_check_hash_algs(void)
*/
BUG_ON(!is_power_of_2(alg->digest_size));
BUG_ON(!is_power_of_2(alg->block_size));
+
+ /* Verify that there is a valid mapping to HASH_ALGO_*. */
+ BUG_ON(alg->algo_id == 0);
+ BUG_ON(alg->digest_size != hash_digest_size[alg->algo_id]);
}
}
diff --git a/fs/verity/measure.c b/fs/verity/measure.c
index e99c00350c28..5c79ea1b2468 100644
--- a/fs/verity/measure.c
+++ b/fs/verity/measure.c
@@ -65,8 +65,7 @@ EXPORT_SYMBOL_GPL(fsverity_ioctl_measure);
* @alg: (out) pointer to the hash algorithm enumeration
*
* Return the file hash algorithm and digest of an fsverity protected file.
- * Assumption: before calling fsverity_get_digest(), the file must have been
- * opened.
+ * Assumption: before calling this, the file must have been opened.
*
* Return: 0 on success, -errno on failure
*/
@@ -76,27 +75,13 @@ int fsverity_get_digest(struct inode *inode,
{
const struct fsverity_info *vi;
const struct fsverity_hash_alg *hash_alg;
- int i;
vi = fsverity_get_info(inode);
if (!vi)
return -ENODATA; /* not a verity file */
hash_alg = vi->tree_params.hash_alg;
- memset(digest, 0, FS_VERITY_MAX_DIGEST_SIZE);
-
- /* convert the verity hash algorithm name to a hash_algo_name enum */
- i = match_string(hash_algo_name, HASH_ALGO__LAST, hash_alg->name);
- if (i < 0)
- return -EINVAL;
- *alg = i;
-
- if (WARN_ON_ONCE(hash_alg->digest_size != hash_digest_size[*alg]))
- return -EINVAL;
memcpy(digest, vi->file_digest, hash_alg->digest_size);
-
- pr_debug("file digest %s:%*phN\n", hash_algo_name[*alg],
- hash_digest_size[*alg], digest);
-
+ *alg = hash_alg->algo_id;
return 0;
}
diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c
index 6ee849dc7bc1..2aefc5565152 100644
--- a/fs/verity/read_metadata.c
+++ b/fs/verity/read_metadata.c
@@ -53,14 +53,14 @@ static int fsverity_read_merkle_tree(struct inode *inode,
break;
}
- virt = kmap(page);
+ virt = kmap_local_page(page);
if (copy_to_user(buf, virt + offs_in_page, bytes_to_copy)) {
- kunmap(page);
+ kunmap_local(virt);
put_page(page);
err = -EFAULT;
break;
}
- kunmap(page);
+ kunmap_local(virt);
put_page(page);
retval += bytes_to_copy;
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index 14e2fb49cff5..961ba248021f 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -39,16 +39,6 @@ static void hash_at_level(const struct merkle_tree_params *params,
(params->log_blocksize - params->log_arity);
}
-/* Extract a hash from a hash page */
-static void extract_hash(struct page *hpage, unsigned int hoffset,
- unsigned int hsize, u8 *out)
-{
- void *virt = kmap_atomic(hpage);
-
- memcpy(out, virt + hoffset, hsize);
- kunmap_atomic(virt);
-}
-
static inline int cmp_hashes(const struct fsverity_info *vi,
const u8 *want_hash, const u8 *real_hash,
pgoff_t index, int level)
@@ -129,7 +119,7 @@ static bool verify_page(struct inode *inode, const struct fsverity_info *vi,
}
if (PageChecked(hpage)) {
- extract_hash(hpage, hoffset, hsize, _want_hash);
+ memcpy_from_page(_want_hash, hpage, hoffset, hsize);
want_hash = _want_hash;
put_page(hpage);
pr_debug_ratelimited("Hash page already checked, want %s:%*phN\n",
@@ -158,7 +148,7 @@ descend:
if (err)
goto out;
SetPageChecked(hpage);
- extract_hash(hpage, hoffset, hsize, _want_hash);
+ memcpy_from_page(_want_hash, hpage, hoffset, hsize);
want_hash = _want_hash;
put_page(hpage);
pr_debug("Verified hash page at level %d, now want %s:%*phN\n",
@@ -210,9 +200,8 @@ EXPORT_SYMBOL_GPL(fsverity_verify_page);
* @bio: the bio to verify
*
* Verify a set of pages that have just been read from a verity file. The pages
- * must be pagecache pages that are still locked and not yet uptodate. Pages
- * that fail verification are set to the Error state. Verification is skipped
- * for pages already in the Error state, e.g. due to fscrypt decryption failure.
+ * must be pagecache pages that are still locked and not yet uptodate. If a
+ * page fails verification, then bio->bi_status is set to an error status.
*
* This is a helper function for use by the ->readahead() method of filesystems
* that issue bios to read data directly into the page cache. Filesystems that
@@ -254,9 +243,10 @@ void fsverity_verify_bio(struct bio *bio)
unsigned long level0_ra_pages =
min(max_ra_pages, params->level0_blocks - level0_index);
- if (!PageError(page) &&
- !verify_page(inode, vi, req, page, level0_ra_pages))
- SetPageError(page);
+ if (!verify_page(inode, vi, req, page, level0_ra_pages)) {
+ bio->bi_status = BLK_STS_IOERR;
+ break;
+ }
}
fsverity_free_hash_request(params->hash_alg, req);
diff --git a/fs/xattr.c b/fs/xattr.c
index a1f4998bc6be..adab9a70b536 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -80,6 +80,31 @@ xattr_resolve_name(struct inode *inode, const char **name)
return ERR_PTR(-EOPNOTSUPP);
}
+/**
+ * may_write_xattr - check whether inode allows writing xattr
+ * @mnt_userns: User namespace of the mount the inode was found from
+ * @inode: the inode on which to set an xattr
+ *
+ * Check whether the inode allows writing xattrs. Specifically, we can never
+ * set or remove an extended attribute on a read-only filesystem or on an
+ * immutable / append-only inode.
+ *
+ * We also need to ensure that the inode has a mapping in the mount to
+ * not risk writing back invalid i_{g,u}id values.
+ *
+ * Return: On success zero is returned. On error a negative errno is returned.
+ */
+int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode)
+{
+ if (IS_IMMUTABLE(inode))
+ return -EPERM;
+ if (IS_APPEND(inode))
+ return -EPERM;
+ if (HAS_UNMAPPED_ID(mnt_userns, inode))
+ return -EPERM;
+ return 0;
+}
+
/*
* Check permissions for extended attribute access. This is a bit complicated
* because different namespaces have very different rules.
@@ -88,20 +113,12 @@ static int
xattr_permission(struct user_namespace *mnt_userns, struct inode *inode,
const char *name, int mask)
{
- /*
- * We can never set or remove an extended attribute on a read-only
- * filesystem or on an immutable / append-only inode.
- */
if (mask & MAY_WRITE) {
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- return -EPERM;
- /*
- * Updating an xattr will likely cause i_uid and i_gid
- * to be writen back improperly if their true value is
- * unknown to the vfs.
- */
- if (HAS_UNMAPPED_ID(mnt_userns, inode))
- return -EPERM;
+ int ret;
+
+ ret = may_write_xattr(mnt_userns, inode);
+ if (ret)
+ return ret;
}
/*
@@ -172,6 +189,9 @@ __vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
{
const struct xattr_handler *handler;
+ if (is_posix_acl_xattr(name))
+ return -EOPNOTSUPP;
+
handler = xattr_resolve_name(inode, &name);
if (IS_ERR(handler))
return PTR_ERR(handler);
@@ -282,15 +302,9 @@ out:
}
EXPORT_SYMBOL_GPL(__vfs_setxattr_locked);
-static inline bool is_posix_acl_xattr(const char *name)
-{
- return (strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
- (strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0);
-}
-
int
vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
- const char *name, void *value, size_t size, int flags)
+ const char *name, const void *value, size_t size, int flags)
{
struct inode *inode = dentry->d_inode;
struct inode *delegated_inode = NULL;
@@ -298,16 +312,12 @@ vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
int error;
if (size && strcmp(name, XATTR_NAME_CAPS) == 0) {
- error = cap_convert_nscap(mnt_userns, dentry,
- (const void **)&value, size);
+ error = cap_convert_nscap(mnt_userns, dentry, &value, size);
if (error < 0)
return error;
size = error;
}
- if (size && is_posix_acl_xattr(name))
- posix_acl_setxattr_idmapped_mnt(mnt_userns, inode, value, size);
-
retry_deleg:
inode_lock(inode);
error = __vfs_setxattr_locked(mnt_userns, dentry, name, value, size,
@@ -358,11 +368,12 @@ out_noalloc:
* vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr
*
* Allocate memory, if not already allocated, or re-allocate correct size,
- * before retrieving the extended attribute.
+ * before retrieving the extended attribute. The xattr value buffer should
+ * always be freed by the caller, even on error.
*
* Returns the result of alloc, if failed, or the getxattr operation.
*/
-ssize_t
+int
vfs_getxattr_alloc(struct user_namespace *mnt_userns, struct dentry *dentry,
const char *name, char **xattr_value, size_t xattr_size,
gfp_t flags)
@@ -403,6 +414,9 @@ __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name,
{
const struct xattr_handler *handler;
+ if (is_posix_acl_xattr(name))
+ return -EOPNOTSUPP;
+
handler = xattr_resolve_name(inode, &name);
if (IS_ERR(handler))
return PTR_ERR(handler);
@@ -441,10 +455,7 @@ vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return ret;
}
nolsm:
- error = __vfs_getxattr(dentry, inode, name, value, size);
- if (error > 0 && is_posix_acl_xattr(name))
- posix_acl_getxattr_idmapped_mnt(mnt_userns, inode, value, size);
- return error;
+ return __vfs_getxattr(dentry, inode, name, value, size);
}
EXPORT_SYMBOL_GPL(vfs_getxattr);
@@ -475,6 +486,9 @@ __vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry,
struct inode *inode = d_inode(dentry);
const struct xattr_handler *handler;
+ if (is_posix_acl_xattr(name))
+ return -EOPNOTSUPP;
+
handler = xattr_resolve_name(inode, &name);
if (IS_ERR(handler))
return PTR_ERR(handler);
@@ -584,25 +598,19 @@ int setxattr_copy(const char __user *name, struct xattr_ctx *ctx)
return error;
}
-static void setxattr_convert(struct user_namespace *mnt_userns,
- struct dentry *d, struct xattr_ctx *ctx)
-{
- if (ctx->size &&
- ((strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
- (strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)))
- posix_acl_fix_xattr_from_user(ctx->kvalue, ctx->size);
-}
-
-int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct xattr_ctx *ctx)
{
- setxattr_convert(mnt_userns, dentry, ctx);
- return vfs_setxattr(mnt_userns, dentry, ctx->kname->name,
+ if (is_posix_acl_xattr(ctx->kname->name))
+ return do_set_acl(idmap, dentry, ctx->kname->name,
+ ctx->kvalue, ctx->size);
+
+ return vfs_setxattr(mnt_idmap_owner(idmap), dentry, ctx->kname->name,
ctx->kvalue, ctx->size, ctx->flags);
}
static long
-setxattr(struct user_namespace *mnt_userns, struct dentry *d,
+setxattr(struct mnt_idmap *idmap, struct dentry *d,
const char __user *name, const void __user *value, size_t size,
int flags)
{
@@ -620,7 +628,7 @@ setxattr(struct user_namespace *mnt_userns, struct dentry *d,
if (error)
return error;
- error = do_setxattr(mnt_userns, d, &ctx);
+ error = do_setxattr(idmap, d, &ctx);
kvfree(ctx.kvalue);
return error;
@@ -639,7 +647,7 @@ retry:
return error;
error = mnt_want_write(path.mnt);
if (!error) {
- error = setxattr(mnt_user_ns(path.mnt), path.dentry, name,
+ error = setxattr(mnt_idmap(path.mnt), path.dentry, name,
value, size, flags);
mnt_drop_write(path.mnt);
}
@@ -676,7 +684,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
audit_file(f.file);
error = mnt_want_write_file(f.file);
if (!error) {
- error = setxattr(file_mnt_user_ns(f.file),
+ error = setxattr(file_mnt_idmap(f.file),
f.file->f_path.dentry, name,
value, size, flags);
mnt_drop_write_file(f.file);
@@ -689,7 +697,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
* Extended attribute GET operations
*/
ssize_t
-do_getxattr(struct user_namespace *mnt_userns, struct dentry *d,
+do_getxattr(struct mnt_idmap *idmap, struct dentry *d,
struct xattr_ctx *ctx)
{
ssize_t error;
@@ -703,11 +711,12 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d,
return -ENOMEM;
}
- error = vfs_getxattr(mnt_userns, d, kname, ctx->kvalue, ctx->size);
+ if (is_posix_acl_xattr(ctx->kname->name))
+ error = do_get_acl(idmap, d, kname, ctx->kvalue, ctx->size);
+ else
+ error = vfs_getxattr(mnt_idmap_owner(idmap), d, kname,
+ ctx->kvalue, ctx->size);
if (error > 0) {
- if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
- (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
- posix_acl_fix_xattr_to_user(ctx->kvalue, error);
if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error))
error = -EFAULT;
} else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) {
@@ -720,7 +729,7 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d,
}
static ssize_t
-getxattr(struct user_namespace *mnt_userns, struct dentry *d,
+getxattr(struct mnt_idmap *idmap, struct dentry *d,
const char __user *name, void __user *value, size_t size)
{
ssize_t error;
@@ -739,7 +748,7 @@ getxattr(struct user_namespace *mnt_userns, struct dentry *d,
if (error < 0)
return error;
- error = do_getxattr(mnt_userns, d, &ctx);
+ error = do_getxattr(idmap, d, &ctx);
kvfree(ctx.kvalue);
return error;
@@ -755,7 +764,7 @@ retry:
error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
return error;
- error = getxattr(mnt_user_ns(path.mnt), path.dentry, name, value, size);
+ error = getxattr(mnt_idmap(path.mnt), path.dentry, name, value, size);
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
@@ -785,7 +794,7 @@ SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
if (!f.file)
return error;
audit_file(f.file);
- error = getxattr(file_mnt_user_ns(f.file), f.file->f_path.dentry,
+ error = getxattr(file_mnt_idmap(f.file), f.file->f_path.dentry,
name, value, size);
fdput(f);
return error;
@@ -870,7 +879,7 @@ SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
* Extended attribute REMOVE operations
*/
static long
-removexattr(struct user_namespace *mnt_userns, struct dentry *d,
+removexattr(struct mnt_idmap *idmap, struct dentry *d,
const char __user *name)
{
int error;
@@ -882,7 +891,10 @@ removexattr(struct user_namespace *mnt_userns, struct dentry *d,
if (error < 0)
return error;
- return vfs_removexattr(mnt_userns, d, kname);
+ if (is_posix_acl_xattr(kname))
+ return vfs_remove_acl(mnt_idmap_owner(idmap), d, kname);
+
+ return vfs_removexattr(mnt_idmap_owner(idmap), d, kname);
}
static int path_removexattr(const char __user *pathname,
@@ -896,7 +908,7 @@ retry:
return error;
error = mnt_want_write(path.mnt);
if (!error) {
- error = removexattr(mnt_user_ns(path.mnt), path.dentry, name);
+ error = removexattr(mnt_idmap(path.mnt), path.dentry, name);
mnt_drop_write(path.mnt);
}
path_put(&path);
@@ -929,7 +941,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
audit_file(f.file);
error = mnt_want_write_file(f.file);
if (!error) {
- error = removexattr(file_mnt_user_ns(f.file),
+ error = removexattr(file_mnt_idmap(f.file),
f.file->f_path.dentry, name);
mnt_drop_write_file(f.file);
}
@@ -999,8 +1011,29 @@ const char *xattr_full_name(const struct xattr_handler *handler,
}
EXPORT_SYMBOL(xattr_full_name);
-/*
- * Allocate new xattr and copy in the value; but leave the name to callers.
+/**
+ * free_simple_xattr - free an xattr object
+ * @xattr: the xattr object
+ *
+ * Free the xattr object. Can handle @xattr being NULL.
+ */
+static inline void free_simple_xattr(struct simple_xattr *xattr)
+{
+ if (xattr)
+ kfree(xattr->name);
+ kvfree(xattr);
+}
+
+/**
+ * simple_xattr_alloc - allocate new xattr object
+ * @value: value of the xattr object
+ * @size: size of @value
+ *
+ * Allocate a new xattr object and initialize respective members. The caller is
+ * responsible for handling the name of the xattr.
+ *
+ * Return: On success a new xattr object is returned. On failure NULL is
+ * returned.
*/
struct simple_xattr *simple_xattr_alloc(const void *value, size_t size)
{
@@ -1021,20 +1054,69 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size)
return new_xattr;
}
-/*
- * xattr GET operation for in-memory/pseudo filesystems
+/**
+ * rbtree_simple_xattr_cmp - compare xattr name with current rbtree xattr entry
+ * @key: xattr name
+ * @node: current node
+ *
+ * Compare the xattr name with the xattr name attached to @node in the rbtree.
+ *
+ * Return: Negative value if continuing left, positive if continuing right, 0
+ * if the xattr attached to @node matches @key.
+ */
+static int rbtree_simple_xattr_cmp(const void *key, const struct rb_node *node)
+{
+ const char *xattr_name = key;
+ const struct simple_xattr *xattr;
+
+ xattr = rb_entry(node, struct simple_xattr, rb_node);
+ return strcmp(xattr->name, xattr_name);
+}
+
+/**
+ * rbtree_simple_xattr_node_cmp - compare two xattr rbtree nodes
+ * @new_node: new node
+ * @node: current node
+ *
+ * Compare the xattr attached to @new_node with the xattr attached to @node.
+ *
+ * Return: Negative value if continuing left, positive if continuing right, 0
+ * if the xattr attached to @new_node matches the xattr attached to @node.
+ */
+static int rbtree_simple_xattr_node_cmp(struct rb_node *new_node,
+ const struct rb_node *node)
+{
+ struct simple_xattr *xattr;
+ xattr = rb_entry(new_node, struct simple_xattr, rb_node);
+ return rbtree_simple_xattr_cmp(xattr->name, node);
+}
+
+/**
+ * simple_xattr_get - get an xattr object
+ * @xattrs: the header of the xattr object
+ * @name: the name of the xattr to retrieve
+ * @buffer: the buffer to store the value into
+ * @size: the size of @buffer
+ *
+ * Try to find and retrieve the xattr object associated with @name.
+ * If @buffer is provided store the value of @xattr in @buffer
+ * otherwise just return the length. The size of @buffer is limited
+ * to XATTR_SIZE_MAX which currently is 65536.
+ *
+ * Return: On success the length of the xattr value is returned. On error a
+ * negative error code is returned.
*/
int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
void *buffer, size_t size)
{
- struct simple_xattr *xattr;
+ struct simple_xattr *xattr = NULL;
+ struct rb_node *rbp;
int ret = -ENODATA;
- spin_lock(&xattrs->lock);
- list_for_each_entry(xattr, &xattrs->head, list) {
- if (strcmp(name, xattr->name))
- continue;
-
+ read_lock(&xattrs->lock);
+ rbp = rb_find(name, &xattrs->rb_root, rbtree_simple_xattr_cmp);
+ if (rbp) {
+ xattr = rb_entry(rbp, struct simple_xattr, rb_node);
ret = xattr->size;
if (buffer) {
if (size < xattr->size)
@@ -1042,34 +1124,44 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
else
memcpy(buffer, xattr->value, xattr->size);
}
- break;
}
- spin_unlock(&xattrs->lock);
+ read_unlock(&xattrs->lock);
return ret;
}
/**
- * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems
- * @xattrs: target simple_xattr list
- * @name: name of the extended attribute
- * @value: value of the xattr. If %NULL, will remove the attribute.
- * @size: size of the new xattr
- * @flags: %XATTR_{CREATE|REPLACE}
- * @removed_size: returns size of the removed xattr, -1 if none removed
+ * simple_xattr_set - set an xattr object
+ * @xattrs: the header of the xattr object
+ * @name: the name of the xattr to retrieve
+ * @value: the value to store along the xattr
+ * @size: the size of @value
+ * @flags: the flags determining how to set the xattr
+ * @removed_size: the size of the removed xattr
+ *
+ * Set a new xattr object.
+ * If @value is passed a new xattr object will be allocated. If XATTR_REPLACE
+ * is specified in @flags a matching xattr object for @name must already exist.
+ * If it does it will be replaced with the new xattr object. If it doesn't we
+ * fail. If XATTR_CREATE is specified and a matching xattr does already exist
+ * we fail. If it doesn't we create a new xattr. If @flags is zero we simply
+ * insert the new xattr replacing any existing one.
*
- * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails
- * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist;
- * otherwise, fails with -ENODATA.
+ * If @value is empty and a matching xattr object is found we delete it if
+ * XATTR_REPLACE is specified in @flags or @flags is zero.
*
- * Returns 0 on success, -errno on failure.
+ * If @value is empty and no matching xattr object for @name is found we do
+ * nothing if XATTR_CREATE is specified in @flags or @flags is zero. For
+ * XATTR_REPLACE we fail as mentioned above.
+ *
+ * Return: On success zero and on error a negative error code is returned.
*/
int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
const void *value, size_t size, int flags,
ssize_t *removed_size)
{
- struct simple_xattr *xattr;
- struct simple_xattr *new_xattr = NULL;
- int err = 0;
+ struct simple_xattr *xattr = NULL, *new_xattr = NULL;
+ struct rb_node *parent = NULL, **rbp;
+ int err = 0, ret;
if (removed_size)
*removed_size = -1;
@@ -1082,42 +1174,68 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
new_xattr->name = kstrdup(name, GFP_KERNEL);
if (!new_xattr->name) {
- kvfree(new_xattr);
+ free_simple_xattr(new_xattr);
return -ENOMEM;
}
}
- spin_lock(&xattrs->lock);
- list_for_each_entry(xattr, &xattrs->head, list) {
- if (!strcmp(name, xattr->name)) {
- if (flags & XATTR_CREATE) {
- xattr = new_xattr;
- err = -EEXIST;
- } else if (new_xattr) {
- list_replace(&xattr->list, &new_xattr->list);
- if (removed_size)
- *removed_size = xattr->size;
- } else {
- list_del(&xattr->list);
- if (removed_size)
- *removed_size = xattr->size;
- }
- goto out;
- }
- }
- if (flags & XATTR_REPLACE) {
- xattr = new_xattr;
- err = -ENODATA;
- } else {
- list_add(&new_xattr->list, &xattrs->head);
- xattr = NULL;
+ write_lock(&xattrs->lock);
+ rbp = &xattrs->rb_root.rb_node;
+ while (*rbp) {
+ parent = *rbp;
+ ret = rbtree_simple_xattr_cmp(name, *rbp);
+ if (ret < 0)
+ rbp = &(*rbp)->rb_left;
+ else if (ret > 0)
+ rbp = &(*rbp)->rb_right;
+ else
+ xattr = rb_entry(*rbp, struct simple_xattr, rb_node);
+ if (xattr)
+ break;
}
-out:
- spin_unlock(&xattrs->lock);
+
if (xattr) {
- kfree(xattr->name);
- kvfree(xattr);
+ /* Fail if XATTR_CREATE is requested and the xattr exists. */
+ if (flags & XATTR_CREATE) {
+ err = -EEXIST;
+ goto out_unlock;
+ }
+
+ if (new_xattr)
+ rb_replace_node(&xattr->rb_node, &new_xattr->rb_node,
+ &xattrs->rb_root);
+ else
+ rb_erase(&xattr->rb_node, &xattrs->rb_root);
+ if (!err && removed_size)
+ *removed_size = xattr->size;
+ } else {
+ /* Fail if XATTR_REPLACE is requested but no xattr is found. */
+ if (flags & XATTR_REPLACE) {
+ err = -ENODATA;
+ goto out_unlock;
+ }
+
+ /*
+ * If XATTR_CREATE or no flags are specified together with a
+ * new value simply insert it.
+ */
+ if (new_xattr) {
+ rb_link_node(&new_xattr->rb_node, parent, rbp);
+ rb_insert_color(&new_xattr->rb_node, &xattrs->rb_root);
+ }
+
+ /*
+ * If XATTR_CREATE or no flags are specified and neither an
+ * old or new xattr exist then we don't need to do anything.
+ */
}
+
+out_unlock:
+ write_unlock(&xattrs->lock);
+ if (err)
+ free_simple_xattr(new_xattr);
+ else
+ free_simple_xattr(xattr);
return err;
}
@@ -1141,14 +1259,31 @@ static int xattr_list_one(char **buffer, ssize_t *remaining_size,
return 0;
}
-/*
- * xattr LIST operation for in-memory/pseudo filesystems
+/**
+ * simple_xattr_list - list all xattr objects
+ * @inode: inode from which to get the xattrs
+ * @xattrs: the header of the xattr object
+ * @buffer: the buffer to store all xattrs into
+ * @size: the size of @buffer
+ *
+ * List all xattrs associated with @inode. If @buffer is NULL we returned
+ * the required size of the buffer. If @buffer is provided we store the
+ * xattrs value into it provided it is big enough.
+ *
+ * Note, the number of xattr names that can be listed with listxattr(2) is
+ * limited to XATTR_LIST_MAX aka 65536 bytes. If a larger buffer is passed
+ * then vfs_listxattr() caps it to XATTR_LIST_MAX and if more xattr names
+ * are found it will return -E2BIG.
+ *
+ * Return: On success the required size or the size of the copied xattrs is
+ * returned. On error a negative error code is returned.
*/
ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
char *buffer, size_t size)
{
- bool trusted = capable(CAP_SYS_ADMIN);
+ bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
struct simple_xattr *xattr;
+ struct rb_node *rbp;
ssize_t remaining_size = size;
int err = 0;
@@ -1169,8 +1304,10 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
}
#endif
- spin_lock(&xattrs->lock);
- list_for_each_entry(xattr, &xattrs->head, list) {
+ read_lock(&xattrs->lock);
+ for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) {
+ xattr = rb_entry(rbp, struct simple_xattr, rb_node);
+
/* skip "trusted." attributes for unprivileged callers */
if (!trusted && xattr_is_trusted(xattr->name))
continue;
@@ -1179,18 +1316,76 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
if (err)
break;
}
- spin_unlock(&xattrs->lock);
+ read_unlock(&xattrs->lock);
return err ? err : size - remaining_size;
}
-/*
- * Adds an extended attribute to the list
+/**
+ * rbtree_simple_xattr_less - compare two xattr rbtree nodes
+ * @new_node: new node
+ * @node: current node
+ *
+ * Compare the xattr attached to @new_node with the xattr attached to @node.
+ * Note that this function technically tolerates duplicate entries.
+ *
+ * Return: True if insertion point in the rbtree is found.
+ */
+static bool rbtree_simple_xattr_less(struct rb_node *new_node,
+ const struct rb_node *node)
+{
+ return rbtree_simple_xattr_node_cmp(new_node, node) < 0;
+}
+
+/**
+ * simple_xattr_add - add xattr objects
+ * @xattrs: the header of the xattr object
+ * @new_xattr: the xattr object to add
+ *
+ * Add an xattr object to @xattrs. This assumes no replacement or removal
+ * of matching xattrs is wanted. Should only be called during inode
+ * initialization when a few distinct initial xattrs are supposed to be set.
+ */
+void simple_xattr_add(struct simple_xattrs *xattrs,
+ struct simple_xattr *new_xattr)
+{
+ write_lock(&xattrs->lock);
+ rb_add(&new_xattr->rb_node, &xattrs->rb_root, rbtree_simple_xattr_less);
+ write_unlock(&xattrs->lock);
+}
+
+/**
+ * simple_xattrs_init - initialize new xattr header
+ * @xattrs: header to initialize
+ *
+ * Initialize relevant fields of a an xattr header.
*/
-void simple_xattr_list_add(struct simple_xattrs *xattrs,
- struct simple_xattr *new_xattr)
+void simple_xattrs_init(struct simple_xattrs *xattrs)
{
- spin_lock(&xattrs->lock);
- list_add(&new_xattr->list, &xattrs->head);
- spin_unlock(&xattrs->lock);
+ xattrs->rb_root = RB_ROOT;
+ rwlock_init(&xattrs->lock);
+}
+
+/**
+ * simple_xattrs_free - free xattrs
+ * @xattrs: xattr header whose xattrs to destroy
+ *
+ * Destroy all xattrs in @xattr. When this is called no one can hold a
+ * reference to any of the xattrs anymore.
+ */
+void simple_xattrs_free(struct simple_xattrs *xattrs)
+{
+ struct rb_node *rbp;
+
+ rbp = rb_first(&xattrs->rb_root);
+ while (rbp) {
+ struct simple_xattr *xattr;
+ struct rb_node *rbp_next;
+
+ rbp_next = rb_next(rbp);
+ xattr = rb_entry(rbp, struct simple_xattr, rb_node);
+ rb_erase(&xattr->rb_node, &xattrs->rb_root);
+ free_simple_xattr(xattr);
+ rbp = rbp_next;
+ }
}
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 517a138faa66..191b22b9a35b 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -133,6 +133,21 @@ xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno)
return true;
}
+static inline bool
+xfs_verify_agbext(
+ struct xfs_perag *pag,
+ xfs_agblock_t agbno,
+ xfs_agblock_t len)
+{
+ if (agbno + len <= agbno)
+ return false;
+
+ if (!xfs_verify_agbno(pag, agbno))
+ return false;
+
+ return xfs_verify_agbno(pag, agbno + len - 1);
+}
+
/*
* Verify that an AG inode number pointer neither points outside the AG
* nor points at static metadata.
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index e2bdf089c0a3..989cf341779b 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -263,11 +263,7 @@ xfs_alloc_get_rec(
goto out_bad_rec;
/* check for valid extent range, including overflow */
- if (!xfs_verify_agbno(pag, *bno))
- goto out_bad_rec;
- if (*bno > *bno + *len)
- goto out_bad_rec;
- if (!xfs_verify_agbno(pag, *bno + *len - 1))
+ if (!xfs_verify_agbext(pag, *bno, *len))
goto out_bad_rec;
return 0;
@@ -1520,7 +1516,7 @@ xfs_alloc_ag_vextent_lastblock(
#ifdef DEBUG
/* Randomly don't execute the first algorithm. */
- if (prandom_u32() & 1)
+ if (get_random_u32_below(2))
return 0;
#endif
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index e56723dc9cd5..0d56a8d862e8 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -294,7 +294,7 @@ xfs_check_block(
else
thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
if (*thispa == *pp) {
- xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
+ xfs_warn(mp, "%s: thispa(%d) == pp(%d) %lld",
__func__, j, i,
(unsigned long long)be64_to_cpu(*thispa));
xfs_err(mp, "%s: ptrs are equal in node\n",
@@ -4058,7 +4058,7 @@ xfs_bmap_alloc_userdata(
* the busy list.
*/
bma->datatype = XFS_ALLOC_NOBUSY;
- if (whichfork == XFS_DATA_FORK) {
+ if (whichfork == XFS_DATA_FORK || whichfork == XFS_COW_FORK) {
bma->datatype |= XFS_ALLOC_USERDATA;
if (bma->offset == 0)
bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
@@ -4551,7 +4551,8 @@ xfs_bmapi_convert_delalloc(
* the extent. Just return the real extent at this offset.
*/
if (!isnullstartblock(bma.got.br_startblock)) {
- xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
+ xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
+ xfs_iomap_inode_sequence(ip, flags));
*seq = READ_ONCE(ifp->if_seq);
goto out_trans_cancel;
}
@@ -4599,7 +4600,8 @@ xfs_bmapi_convert_delalloc(
XFS_STATS_INC(mp, xs_xstrat_quick);
ASSERT(!isnullstartblock(bma.got.br_startblock));
- xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
+ xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
+ xfs_iomap_inode_sequence(ip, flags));
*seq = READ_ONCE(ifp->if_seq);
if (whichfork == XFS_COW_FORK)
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index eef27858a013..29c4b4ccb909 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -556,7 +556,6 @@ xfs_btree_islastblock(
struct xfs_buf *bp;
block = xfs_btree_get_block(cur, level, &bp);
- ASSERT(block && xfs_btree_check_block(cur, block, level, bp) == 0);
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index e7201dc68f43..e576560b46e9 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -2192,8 +2192,8 @@ xfs_da_grow_inode_int(
*/
mapp = kmem_alloc(sizeof(*mapp) * count, 0);
for (b = *bno, mapi = 0; b < *bno + count; ) {
- nmap = min(XFS_BMAP_MAX_NMAP, count);
c = (int)(*bno + count - b);
+ nmap = min(XFS_BMAP_MAX_NMAP, c);
error = xfs_bmapi_write(tp, dp, b, c,
xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
args->total, &mapp[mapi], &nmap);
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 76eedc2756b3..92bac3373f1f 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -261,7 +261,7 @@ xfs_dir_createname(
{
struct xfs_da_args *args;
int rval;
- int v; /* type-checking value */
+ bool v;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
@@ -357,7 +357,7 @@ xfs_dir_lookup(
{
struct xfs_da_args *args;
int rval;
- int v; /* type-checking value */
+ bool v;
int lock_mode;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
@@ -435,7 +435,7 @@ xfs_dir_removename(
{
struct xfs_da_args *args;
int rval;
- int v; /* type-checking value */
+ bool v;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
XFS_STATS_INC(dp->i_mount, xs_dir_remove);
@@ -493,7 +493,7 @@ xfs_dir_replace(
{
struct xfs_da_args *args;
int rval;
- int v; /* type-checking value */
+ bool v;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
@@ -610,19 +610,23 @@ xfs_dir2_grow_inode(
int
xfs_dir2_isblock(
struct xfs_da_args *args,
- int *vp) /* out: 1 is block, 0 is not block */
+ bool *isblock)
{
- xfs_fileoff_t last; /* last file offset */
- int rval;
+ struct xfs_mount *mp = args->dp->i_mount;
+ xfs_fileoff_t eof;
+ int error;
- if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
- return rval;
- rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize;
- if (XFS_IS_CORRUPT(args->dp->i_mount,
- rval != 0 &&
- args->dp->i_disk_size != args->geo->blksize))
+ error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ *isblock = false;
+ if (XFS_FSB_TO_B(mp, eof) != args->geo->blksize)
+ return 0;
+
+ *isblock = true;
+ if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize))
return -EFSCORRUPTED;
- *vp = rval;
return 0;
}
@@ -632,14 +636,20 @@ xfs_dir2_isblock(
int
xfs_dir2_isleaf(
struct xfs_da_args *args,
- int *vp) /* out: 1 is block, 0 is not block */
+ bool *isleaf)
{
- xfs_fileoff_t last; /* last file offset */
- int rval;
+ xfs_fileoff_t eof;
+ int error;
- if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
- return rval;
- *vp = last == args->geo->leafblk + args->geo->fsbcount;
+ error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ *isleaf = false;
+ if (eof != args->geo->leafblk + args->geo->fsbcount)
+ return 0;
+
+ *isleaf = true;
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index b6df3c34b26a..dd39f17dd9a9 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -61,8 +61,8 @@ extern int xfs_dir2_sf_to_block(struct xfs_da_args *args);
/*
* Interface routines used by userspace utilities
*/
-extern int xfs_dir2_isblock(struct xfs_da_args *args, int *r);
-extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r);
+extern int xfs_dir2_isblock(struct xfs_da_args *args, bool *isblock);
+extern int xfs_dir2_isleaf(struct xfs_da_args *args, bool *isleaf);
extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
struct xfs_buf *bp);
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index d9b66306a9a7..cb9e950a911d 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -146,6 +146,8 @@ xfs_dir3_leaf_check_int(
xfs_dir2_leaf_tail_t *ltp;
int stale;
int i;
+ bool isleaf1 = (hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
+ hdr->magic == XFS_DIR3_LEAF1_MAGIC);
ltp = xfs_dir2_leaf_tail_p(geo, leaf);
@@ -158,8 +160,7 @@ xfs_dir3_leaf_check_int(
return __this_address;
/* Leaves and bests don't overlap in leaf format. */
- if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
- hdr->magic == XFS_DIR3_LEAF1_MAGIC) &&
+ if (isleaf1 &&
(char *)&hdr->ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp))
return __this_address;
@@ -175,6 +176,10 @@ xfs_dir3_leaf_check_int(
}
if (hdr->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
stale++;
+ if (isleaf1 && xfs_dir2_dataptr_to_db(geo,
+ be32_to_cpu(hdr->ents[i].address)) >=
+ be32_to_cpu(ltp->bestcount))
+ return __this_address;
}
if (hdr->stale != stale)
return __this_address;
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 003812fd7d35..8cd37e6e9d38 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -865,7 +865,6 @@ xfs_dir2_sf_lookup(
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
int i; /* entry index */
- int error;
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
enum xfs_dacmp cmp; /* comparison result */
@@ -929,8 +928,7 @@ xfs_dir2_sf_lookup(
if (!ci_sfep)
return -ENOENT;
/* otherwise process the CI match as required by the caller */
- error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen);
- return error;
+ return xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen);
}
/*
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index 5362908164b0..01a9e86b3037 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -40,13 +40,12 @@
#define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25
#define XFS_ERRTAG_BMAP_FINISH_ONE 26
#define XFS_ERRTAG_AG_RESV_CRITICAL 27
+
/*
- * DEBUG mode instrumentation to test and/or trigger delayed allocation
- * block killing in the event of failed writes. When enabled, all
- * buffered writes are silenty dropped and handled as if they failed.
- * All delalloc blocks in the range of the write (including pre-existing
- * delalloc blocks!) are tossed as part of the write failure error
- * handling sequence.
+ * Drop-writes support removed because write error handling cannot trash
+ * pre-existing delalloc extents in any useful way anymore. We retain the
+ * definition so that we can reject it as an invalid value in
+ * xfs_errortag_valid().
*/
#define XFS_ERRTAG_DROP_WRITES 28
#define XFS_ERRTAG_LOG_BAD_CRC 29
@@ -62,7 +61,9 @@
#define XFS_ERRTAG_LARP 39
#define XFS_ERRTAG_DA_LEAF_SPLIT 40
#define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41
-#define XFS_ERRTAG_MAX 42
+#define XFS_ERRTAG_WB_DELAY_MS 42
+#define XFS_ERRTAG_WRITE_DELAY_MS 43
+#define XFS_ERRTAG_MAX 44
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -95,7 +96,6 @@
#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1
#define XFS_RANDOM_BMAP_FINISH_ONE 1
#define XFS_RANDOM_AG_RESV_CRITICAL 4
-#define XFS_RANDOM_DROP_WRITES 1
#define XFS_RANDOM_LOG_BAD_CRC 1
#define XFS_RANDOM_LOG_ITEM_PIN 1
#define XFS_RANDOM_BUF_LRU_REF 2
@@ -109,5 +109,7 @@
#define XFS_RANDOM_LARP 1
#define XFS_RANDOM_DA_LEAF_SPLIT 1
#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1
+#define XFS_RANDOM_WB_DELAY_MS 3000
+#define XFS_RANDOM_WRITE_DELAY_MS 3000
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index b55bdfa9c8a8..371dc07233e0 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1564,20 +1564,6 @@ struct xfs_rmap_rec {
#define RMAPBT_UNUSED_OFFSET_BITLEN 7
#define RMAPBT_OFFSET_BITLEN 54
-#define XFS_RMAP_ATTR_FORK (1 << 0)
-#define XFS_RMAP_BMBT_BLOCK (1 << 1)
-#define XFS_RMAP_UNWRITTEN (1 << 2)
-#define XFS_RMAP_KEY_FLAGS (XFS_RMAP_ATTR_FORK | \
- XFS_RMAP_BMBT_BLOCK)
-#define XFS_RMAP_REC_FLAGS (XFS_RMAP_UNWRITTEN)
-struct xfs_rmap_irec {
- xfs_agblock_t rm_startblock; /* extent start block */
- xfs_extlen_t rm_blockcount; /* extent length */
- uint64_t rm_owner; /* extent owner */
- uint64_t rm_offset; /* offset within the owner */
- unsigned int rm_flags; /* state flags */
-};
-
/*
* Key structure
*
@@ -1626,7 +1612,7 @@ unsigned int xfs_refc_block(struct xfs_mount *mp);
* on the startblock. This speeds up mount time deletion of stale
* staging extents because they're all at the right side of the tree.
*/
-#define XFS_REFC_COW_START ((xfs_agblock_t)(1U << 31))
+#define XFS_REFC_COWFLAG (1U << 31)
#define REFCNTBT_COWFLAG_BITLEN 1
#define REFCNTBT_AGBLOCK_BITLEN 31
@@ -1640,12 +1626,6 @@ struct xfs_refcount_key {
__be32 rc_startblock; /* starting block number */
};
-struct xfs_refcount_irec {
- xfs_agblock_t rc_startblock; /* starting block number */
- xfs_extlen_t rc_blockcount; /* count of free blocks */
- xfs_nlink_t rc_refcount; /* number of inodes linked here */
-};
-
#define MAXREFCOUNT ((xfs_nlink_t)~0U)
#define MAXREFCEXTLEN ((xfs_extlen_t)~0U)
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 6cdfd64bc56b..5118dedf9267 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -636,7 +636,7 @@ xfs_ialloc_ag_alloc(
/* randomly do sparse inode allocations */
if (xfs_has_sparseinodes(tp->t_mountp) &&
igeo->ialloc_min_blks < igeo->ialloc_blks)
- do_sparse = prandom_u32() & 1;
+ do_sparse = get_random_u32_below(2);
#endif
/*
@@ -805,7 +805,7 @@ sparse_alloc:
* number from being easily guessable.
*/
error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno,
- args.agbno, args.len, prandom_u32());
+ args.agbno, args.len, get_random_u32());
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 9327a4f39206..6b21760184d9 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -78,7 +78,7 @@ xfs_iformat_local(
*/
if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
xfs_warn(ip->i_mount,
- "corrupt inode %Lu (bad size %d for local fork, size = %zd).",
+ "corrupt inode %llu (bad size %d for local fork, size = %zd).",
(unsigned long long) ip->i_ino, size,
XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
@@ -192,7 +192,7 @@ xfs_iformat_btree(
XFS_DFORK_SIZE(dip, mp, whichfork) ||
ifp->if_nextents > ip->i_nblocks) ||
level == 0 || level > XFS_BM_MAXLEVELS(mp, whichfork)) {
- xfs_warn(mp, "corrupt inode %Lu (btree).",
+ xfs_warn(mp, "corrupt inode %llu (btree).",
(unsigned long long) ip->i_ino);
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iformat_btree", dfp, size,
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index b351b9dc6561..f13e0809dc63 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -613,25 +613,49 @@ typedef struct xfs_efi_log_format {
uint16_t efi_size; /* size of this item */
uint32_t efi_nextents; /* # extents to free */
uint64_t efi_id; /* efi identifier */
- xfs_extent_t efi_extents[1]; /* array of extents to free */
+ xfs_extent_t efi_extents[]; /* array of extents to free */
} xfs_efi_log_format_t;
+static inline size_t
+xfs_efi_log_format_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efi_log_format) +
+ nr * sizeof(struct xfs_extent);
+}
+
typedef struct xfs_efi_log_format_32 {
uint16_t efi_type; /* efi log item type */
uint16_t efi_size; /* size of this item */
uint32_t efi_nextents; /* # extents to free */
uint64_t efi_id; /* efi identifier */
- xfs_extent_32_t efi_extents[1]; /* array of extents to free */
+ xfs_extent_32_t efi_extents[]; /* array of extents to free */
} __attribute__((packed)) xfs_efi_log_format_32_t;
+static inline size_t
+xfs_efi_log_format32_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efi_log_format_32) +
+ nr * sizeof(struct xfs_extent_32);
+}
+
typedef struct xfs_efi_log_format_64 {
uint16_t efi_type; /* efi log item type */
uint16_t efi_size; /* size of this item */
uint32_t efi_nextents; /* # extents to free */
uint64_t efi_id; /* efi identifier */
- xfs_extent_64_t efi_extents[1]; /* array of extents to free */
+ xfs_extent_64_t efi_extents[]; /* array of extents to free */
} xfs_efi_log_format_64_t;
+static inline size_t
+xfs_efi_log_format64_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efi_log_format_64) +
+ nr * sizeof(struct xfs_extent_64);
+}
+
/*
* This is the structure used to lay out an efd log item in the
* log. The efd_extents array is a variable size array whose
@@ -642,25 +666,49 @@ typedef struct xfs_efd_log_format {
uint16_t efd_size; /* size of this item */
uint32_t efd_nextents; /* # of extents freed */
uint64_t efd_efi_id; /* id of corresponding efi */
- xfs_extent_t efd_extents[1]; /* array of extents freed */
+ xfs_extent_t efd_extents[]; /* array of extents freed */
} xfs_efd_log_format_t;
+static inline size_t
+xfs_efd_log_format_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efd_log_format) +
+ nr * sizeof(struct xfs_extent);
+}
+
typedef struct xfs_efd_log_format_32 {
uint16_t efd_type; /* efd log item type */
uint16_t efd_size; /* size of this item */
uint32_t efd_nextents; /* # of extents freed */
uint64_t efd_efi_id; /* id of corresponding efi */
- xfs_extent_32_t efd_extents[1]; /* array of extents freed */
+ xfs_extent_32_t efd_extents[]; /* array of extents freed */
} __attribute__((packed)) xfs_efd_log_format_32_t;
+static inline size_t
+xfs_efd_log_format32_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efd_log_format_32) +
+ nr * sizeof(struct xfs_extent_32);
+}
+
typedef struct xfs_efd_log_format_64 {
uint16_t efd_type; /* efd log item type */
uint16_t efd_size; /* size of this item */
uint32_t efd_nextents; /* # of extents freed */
uint64_t efd_efi_id; /* id of corresponding efi */
- xfs_extent_64_t efd_extents[1]; /* array of extents freed */
+ xfs_extent_64_t efd_extents[]; /* array of extents freed */
} xfs_efd_log_format_64_t;
+static inline size_t
+xfs_efd_log_format64_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efd_log_format_64) +
+ nr * sizeof(struct xfs_extent_64);
+}
+
/*
* RUI/RUD (reverse mapping) log format definitions
*/
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 64b910caafaa..6f7ed9288fe4 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -46,13 +46,16 @@ STATIC int __xfs_refcount_cow_free(struct xfs_btree_cur *rcur,
int
xfs_refcount_lookup_le(
struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
xfs_agblock_t bno,
int *stat)
{
- trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno,
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ xfs_refcount_encode_startblock(bno, domain),
XFS_LOOKUP_LE);
cur->bc_rec.rc.rc_startblock = bno;
cur->bc_rec.rc.rc_blockcount = 0;
+ cur->bc_rec.rc.rc_domain = domain;
return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
}
@@ -63,13 +66,16 @@ xfs_refcount_lookup_le(
int
xfs_refcount_lookup_ge(
struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
xfs_agblock_t bno,
int *stat)
{
- trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno,
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ xfs_refcount_encode_startblock(bno, domain),
XFS_LOOKUP_GE);
cur->bc_rec.rc.rc_startblock = bno;
cur->bc_rec.rc.rc_blockcount = 0;
+ cur->bc_rec.rc.rc_domain = domain;
return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
}
@@ -80,13 +86,16 @@ xfs_refcount_lookup_ge(
int
xfs_refcount_lookup_eq(
struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
xfs_agblock_t bno,
int *stat)
{
- trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno,
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ xfs_refcount_encode_startblock(bno, domain),
XFS_LOOKUP_LE);
cur->bc_rec.rc.rc_startblock = bno;
cur->bc_rec.rc.rc_blockcount = 0;
+ cur->bc_rec.rc.rc_domain = domain;
return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
}
@@ -96,7 +105,17 @@ xfs_refcount_btrec_to_irec(
const union xfs_btree_rec *rec,
struct xfs_refcount_irec *irec)
{
- irec->rc_startblock = be32_to_cpu(rec->refc.rc_startblock);
+ uint32_t start;
+
+ start = be32_to_cpu(rec->refc.rc_startblock);
+ if (start & XFS_REFC_COWFLAG) {
+ start &= ~XFS_REFC_COWFLAG;
+ irec->rc_domain = XFS_REFC_DOMAIN_COW;
+ } else {
+ irec->rc_domain = XFS_REFC_DOMAIN_SHARED;
+ }
+
+ irec->rc_startblock = start;
irec->rc_blockcount = be32_to_cpu(rec->refc.rc_blockcount);
irec->rc_refcount = be32_to_cpu(rec->refc.rc_refcount);
}
@@ -114,7 +133,6 @@ xfs_refcount_get_rec(
struct xfs_perag *pag = cur->bc_ag.pag;
union xfs_btree_rec *rec;
int error;
- xfs_agblock_t realstart;
error = xfs_btree_get_rec(cur, &rec, stat);
if (error || !*stat)
@@ -124,22 +142,11 @@ xfs_refcount_get_rec(
if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN)
goto out_bad_rec;
- /* handle special COW-staging state */
- realstart = irec->rc_startblock;
- if (realstart & XFS_REFC_COW_START) {
- if (irec->rc_refcount != 1)
- goto out_bad_rec;
- realstart &= ~XFS_REFC_COW_START;
- } else if (irec->rc_refcount < 2) {
+ if (!xfs_refcount_check_domain(irec))
goto out_bad_rec;
- }
/* check for valid extent range, including overflow */
- if (!xfs_verify_agbno(pag, realstart))
- goto out_bad_rec;
- if (realstart > realstart + irec->rc_blockcount)
- goto out_bad_rec;
- if (!xfs_verify_agbno(pag, realstart + irec->rc_blockcount - 1))
+ if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount))
goto out_bad_rec;
if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT)
@@ -169,12 +176,17 @@ xfs_refcount_update(
struct xfs_refcount_irec *irec)
{
union xfs_btree_rec rec;
+ uint32_t start;
int error;
trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
- rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock);
+
+ start = xfs_refcount_encode_startblock(irec->rc_startblock,
+ irec->rc_domain);
+ rec.refc.rc_startblock = cpu_to_be32(start);
rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount);
rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount);
+
error = xfs_btree_update(cur, &rec);
if (error)
trace_xfs_refcount_update_error(cur->bc_mp,
@@ -196,9 +208,12 @@ xfs_refcount_insert(
int error;
trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
+
cur->bc_rec.rc.rc_startblock = irec->rc_startblock;
cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount;
cur->bc_rec.rc.rc_refcount = irec->rc_refcount;
+ cur->bc_rec.rc.rc_domain = irec->rc_domain;
+
error = xfs_btree_insert(cur, i);
if (error)
goto out_error;
@@ -244,7 +259,8 @@ xfs_refcount_delete(
}
if (error)
goto out_error;
- error = xfs_refcount_lookup_ge(cur, irec.rc_startblock, &found_rec);
+ error = xfs_refcount_lookup_ge(cur, irec.rc_domain, irec.rc_startblock,
+ &found_rec);
out_error:
if (error)
trace_xfs_refcount_delete_error(cur->bc_mp,
@@ -343,6 +359,7 @@ xfs_refc_next(
STATIC int
xfs_refcount_split_extent(
struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
xfs_agblock_t agbno,
bool *shape_changed)
{
@@ -351,7 +368,7 @@ xfs_refcount_split_extent(
int error;
*shape_changed = false;
- error = xfs_refcount_lookup_le(cur, agbno, &found_rec);
+ error = xfs_refcount_lookup_le(cur, domain, agbno, &found_rec);
if (error)
goto out_error;
if (!found_rec)
@@ -364,6 +381,8 @@ xfs_refcount_split_extent(
error = -EFSCORRUPTED;
goto out_error;
}
+ if (rcext.rc_domain != domain)
+ return 0;
if (rcext.rc_startblock == agbno || xfs_refc_next(&rcext) <= agbno)
return 0;
@@ -415,6 +434,9 @@ xfs_refcount_merge_center_extents(
trace_xfs_refcount_merge_center_extents(cur->bc_mp,
cur->bc_ag.pag->pag_agno, left, center, right);
+ ASSERT(left->rc_domain == center->rc_domain);
+ ASSERT(right->rc_domain == center->rc_domain);
+
/*
* Make sure the center and right extents are not in the btree.
* If the center extent was synthesized, the first delete call
@@ -423,8 +445,8 @@ xfs_refcount_merge_center_extents(
* call removes the center and the second one removes the right
* extent.
*/
- error = xfs_refcount_lookup_ge(cur, center->rc_startblock,
- &found_rec);
+ error = xfs_refcount_lookup_ge(cur, center->rc_domain,
+ center->rc_startblock, &found_rec);
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
@@ -451,8 +473,8 @@ xfs_refcount_merge_center_extents(
}
/* Enlarge the left extent. */
- error = xfs_refcount_lookup_le(cur, left->rc_startblock,
- &found_rec);
+ error = xfs_refcount_lookup_le(cur, left->rc_domain,
+ left->rc_startblock, &found_rec);
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
@@ -491,10 +513,12 @@ xfs_refcount_merge_left_extent(
trace_xfs_refcount_merge_left_extent(cur->bc_mp,
cur->bc_ag.pag->pag_agno, left, cleft);
+ ASSERT(left->rc_domain == cleft->rc_domain);
+
/* If the extent at agbno (cleft) wasn't synthesized, remove it. */
if (cleft->rc_refcount > 1) {
- error = xfs_refcount_lookup_le(cur, cleft->rc_startblock,
- &found_rec);
+ error = xfs_refcount_lookup_le(cur, cleft->rc_domain,
+ cleft->rc_startblock, &found_rec);
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
@@ -512,8 +536,8 @@ xfs_refcount_merge_left_extent(
}
/* Enlarge the left extent. */
- error = xfs_refcount_lookup_le(cur, left->rc_startblock,
- &found_rec);
+ error = xfs_refcount_lookup_le(cur, left->rc_domain,
+ left->rc_startblock, &found_rec);
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
@@ -552,13 +576,15 @@ xfs_refcount_merge_right_extent(
trace_xfs_refcount_merge_right_extent(cur->bc_mp,
cur->bc_ag.pag->pag_agno, cright, right);
+ ASSERT(right->rc_domain == cright->rc_domain);
+
/*
* If the extent ending at agbno+aglen (cright) wasn't synthesized,
* remove it.
*/
if (cright->rc_refcount > 1) {
- error = xfs_refcount_lookup_le(cur, cright->rc_startblock,
- &found_rec);
+ error = xfs_refcount_lookup_le(cur, cright->rc_domain,
+ cright->rc_startblock, &found_rec);
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
@@ -576,8 +602,8 @@ xfs_refcount_merge_right_extent(
}
/* Enlarge the right extent. */
- error = xfs_refcount_lookup_le(cur, right->rc_startblock,
- &found_rec);
+ error = xfs_refcount_lookup_le(cur, right->rc_domain,
+ right->rc_startblock, &found_rec);
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
@@ -600,8 +626,6 @@ out_error:
return error;
}
-#define XFS_FIND_RCEXT_SHARED 1
-#define XFS_FIND_RCEXT_COW 2
/*
* Find the left extent and the one after it (cleft). This function assumes
* that we've already split any extent crossing agbno.
@@ -611,16 +635,16 @@ xfs_refcount_find_left_extents(
struct xfs_btree_cur *cur,
struct xfs_refcount_irec *left,
struct xfs_refcount_irec *cleft,
+ enum xfs_refc_domain domain,
xfs_agblock_t agbno,
- xfs_extlen_t aglen,
- int flags)
+ xfs_extlen_t aglen)
{
struct xfs_refcount_irec tmp;
int error;
int found_rec;
left->rc_startblock = cleft->rc_startblock = NULLAGBLOCK;
- error = xfs_refcount_lookup_le(cur, agbno - 1, &found_rec);
+ error = xfs_refcount_lookup_le(cur, domain, agbno - 1, &found_rec);
if (error)
goto out_error;
if (!found_rec)
@@ -634,11 +658,9 @@ xfs_refcount_find_left_extents(
goto out_error;
}
- if (xfs_refc_next(&tmp) != agbno)
- return 0;
- if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2)
+ if (tmp.rc_domain != domain)
return 0;
- if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1)
+ if (xfs_refc_next(&tmp) != agbno)
return 0;
/* We have a left extent; retrieve (or invent) the next right one */
*left = tmp;
@@ -655,6 +677,9 @@ xfs_refcount_find_left_extents(
goto out_error;
}
+ if (tmp.rc_domain != domain)
+ goto not_found;
+
/* if tmp starts at the end of our range, just use that */
if (tmp.rc_startblock == agbno)
*cleft = tmp;
@@ -671,8 +696,10 @@ xfs_refcount_find_left_extents(
cleft->rc_blockcount = min(aglen,
tmp.rc_startblock - agbno);
cleft->rc_refcount = 1;
+ cleft->rc_domain = domain;
}
} else {
+not_found:
/*
* No extents, so pretend that there's one covering the whole
* range.
@@ -680,6 +707,7 @@ xfs_refcount_find_left_extents(
cleft->rc_startblock = agbno;
cleft->rc_blockcount = aglen;
cleft->rc_refcount = 1;
+ cleft->rc_domain = domain;
}
trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
left, cleft, agbno);
@@ -700,16 +728,16 @@ xfs_refcount_find_right_extents(
struct xfs_btree_cur *cur,
struct xfs_refcount_irec *right,
struct xfs_refcount_irec *cright,
+ enum xfs_refc_domain domain,
xfs_agblock_t agbno,
- xfs_extlen_t aglen,
- int flags)
+ xfs_extlen_t aglen)
{
struct xfs_refcount_irec tmp;
int error;
int found_rec;
right->rc_startblock = cright->rc_startblock = NULLAGBLOCK;
- error = xfs_refcount_lookup_ge(cur, agbno + aglen, &found_rec);
+ error = xfs_refcount_lookup_ge(cur, domain, agbno + aglen, &found_rec);
if (error)
goto out_error;
if (!found_rec)
@@ -723,11 +751,9 @@ xfs_refcount_find_right_extents(
goto out_error;
}
- if (tmp.rc_startblock != agbno + aglen)
+ if (tmp.rc_domain != domain)
return 0;
- if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2)
- return 0;
- if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1)
+ if (tmp.rc_startblock != agbno + aglen)
return 0;
/* We have a right extent; retrieve (or invent) the next left one */
*right = tmp;
@@ -744,6 +770,9 @@ xfs_refcount_find_right_extents(
goto out_error;
}
+ if (tmp.rc_domain != domain)
+ goto not_found;
+
/* if tmp ends at the end of our range, just use that */
if (xfs_refc_next(&tmp) == agbno + aglen)
*cright = tmp;
@@ -760,8 +789,10 @@ xfs_refcount_find_right_extents(
cright->rc_blockcount = right->rc_startblock -
cright->rc_startblock;
cright->rc_refcount = 1;
+ cright->rc_domain = domain;
}
} else {
+not_found:
/*
* No extents, so pretend that there's one covering the whole
* range.
@@ -769,6 +800,7 @@ xfs_refcount_find_right_extents(
cright->rc_startblock = agbno;
cright->rc_blockcount = aglen;
cright->rc_refcount = 1;
+ cright->rc_domain = domain;
}
trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
cright, right, agbno + aglen);
@@ -783,21 +815,146 @@ out_error:
/* Is this extent valid? */
static inline bool
xfs_refc_valid(
- struct xfs_refcount_irec *rc)
+ const struct xfs_refcount_irec *rc)
{
return rc->rc_startblock != NULLAGBLOCK;
}
+static inline xfs_nlink_t
+xfs_refc_merge_refcount(
+ const struct xfs_refcount_irec *irec,
+ enum xfs_refc_adjust_op adjust)
+{
+ /* Once a record hits MAXREFCOUNT, it is pinned there forever */
+ if (irec->rc_refcount == MAXREFCOUNT)
+ return MAXREFCOUNT;
+ return irec->rc_refcount + adjust;
+}
+
+static inline bool
+xfs_refc_want_merge_center(
+ const struct xfs_refcount_irec *left,
+ const struct xfs_refcount_irec *cleft,
+ const struct xfs_refcount_irec *cright,
+ const struct xfs_refcount_irec *right,
+ bool cleft_is_cright,
+ enum xfs_refc_adjust_op adjust,
+ unsigned long long *ulenp)
+{
+ unsigned long long ulen = left->rc_blockcount;
+ xfs_nlink_t new_refcount;
+
+ /*
+ * To merge with a center record, both shoulder records must be
+ * adjacent to the record we want to adjust. This is only true if
+ * find_left and find_right made all four records valid.
+ */
+ if (!xfs_refc_valid(left) || !xfs_refc_valid(right) ||
+ !xfs_refc_valid(cleft) || !xfs_refc_valid(cright))
+ return false;
+
+ /* There must only be one record for the entire range. */
+ if (!cleft_is_cright)
+ return false;
+
+ /* The shoulder record refcounts must match the new refcount. */
+ new_refcount = xfs_refc_merge_refcount(cleft, adjust);
+ if (left->rc_refcount != new_refcount)
+ return false;
+ if (right->rc_refcount != new_refcount)
+ return false;
+
+ /*
+ * The new record cannot exceed the max length. ulen is a ULL as the
+ * individual record block counts can be up to (u32 - 1) in length
+ * hence we need to catch u32 addition overflows here.
+ */
+ ulen += cleft->rc_blockcount + right->rc_blockcount;
+ if (ulen >= MAXREFCEXTLEN)
+ return false;
+
+ *ulenp = ulen;
+ return true;
+}
+
+static inline bool
+xfs_refc_want_merge_left(
+ const struct xfs_refcount_irec *left,
+ const struct xfs_refcount_irec *cleft,
+ enum xfs_refc_adjust_op adjust)
+{
+ unsigned long long ulen = left->rc_blockcount;
+ xfs_nlink_t new_refcount;
+
+ /*
+ * For a left merge, the left shoulder record must be adjacent to the
+ * start of the range. If this is true, find_left made left and cleft
+ * contain valid contents.
+ */
+ if (!xfs_refc_valid(left) || !xfs_refc_valid(cleft))
+ return false;
+
+ /* Left shoulder record refcount must match the new refcount. */
+ new_refcount = xfs_refc_merge_refcount(cleft, adjust);
+ if (left->rc_refcount != new_refcount)
+ return false;
+
+ /*
+ * The new record cannot exceed the max length. ulen is a ULL as the
+ * individual record block counts can be up to (u32 - 1) in length
+ * hence we need to catch u32 addition overflows here.
+ */
+ ulen += cleft->rc_blockcount;
+ if (ulen >= MAXREFCEXTLEN)
+ return false;
+
+ return true;
+}
+
+static inline bool
+xfs_refc_want_merge_right(
+ const struct xfs_refcount_irec *cright,
+ const struct xfs_refcount_irec *right,
+ enum xfs_refc_adjust_op adjust)
+{
+ unsigned long long ulen = right->rc_blockcount;
+ xfs_nlink_t new_refcount;
+
+ /*
+ * For a right merge, the right shoulder record must be adjacent to the
+ * end of the range. If this is true, find_right made cright and right
+ * contain valid contents.
+ */
+ if (!xfs_refc_valid(right) || !xfs_refc_valid(cright))
+ return false;
+
+ /* Right shoulder record refcount must match the new refcount. */
+ new_refcount = xfs_refc_merge_refcount(cright, adjust);
+ if (right->rc_refcount != new_refcount)
+ return false;
+
+ /*
+ * The new record cannot exceed the max length. ulen is a ULL as the
+ * individual record block counts can be up to (u32 - 1) in length
+ * hence we need to catch u32 addition overflows here.
+ */
+ ulen += cright->rc_blockcount;
+ if (ulen >= MAXREFCEXTLEN)
+ return false;
+
+ return true;
+}
+
/*
* Try to merge with any extents on the boundaries of the adjustment range.
*/
STATIC int
xfs_refcount_merge_extents(
struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
xfs_agblock_t *agbno,
xfs_extlen_t *aglen,
enum xfs_refc_adjust_op adjust,
- int flags,
bool *shape_changed)
{
struct xfs_refcount_irec left = {0}, cleft = {0};
@@ -812,12 +969,12 @@ xfs_refcount_merge_extents(
* just below (agbno + aglen) [cright], and just above (agbno + aglen)
* [right].
*/
- error = xfs_refcount_find_left_extents(cur, &left, &cleft, *agbno,
- *aglen, flags);
+ error = xfs_refcount_find_left_extents(cur, &left, &cleft, domain,
+ *agbno, *aglen);
if (error)
return error;
- error = xfs_refcount_find_right_extents(cur, &right, &cright, *agbno,
- *aglen, flags);
+ error = xfs_refcount_find_right_extents(cur, &right, &cright, domain,
+ *agbno, *aglen);
if (error)
return error;
@@ -829,23 +986,15 @@ xfs_refcount_merge_extents(
(cleft.rc_blockcount == cright.rc_blockcount);
/* Try to merge left, cleft, and right. cleft must == cright. */
- ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount +
- right.rc_blockcount;
- if (xfs_refc_valid(&left) && xfs_refc_valid(&right) &&
- xfs_refc_valid(&cleft) && xfs_refc_valid(&cright) && cequal &&
- left.rc_refcount == cleft.rc_refcount + adjust &&
- right.rc_refcount == cleft.rc_refcount + adjust &&
- ulen < MAXREFCEXTLEN) {
+ if (xfs_refc_want_merge_center(&left, &cleft, &cright, &right, cequal,
+ adjust, &ulen)) {
*shape_changed = true;
return xfs_refcount_merge_center_extents(cur, &left, &cleft,
&right, ulen, aglen);
}
/* Try to merge left and cleft. */
- ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount;
- if (xfs_refc_valid(&left) && xfs_refc_valid(&cleft) &&
- left.rc_refcount == cleft.rc_refcount + adjust &&
- ulen < MAXREFCEXTLEN) {
+ if (xfs_refc_want_merge_left(&left, &cleft, adjust)) {
*shape_changed = true;
error = xfs_refcount_merge_left_extent(cur, &left, &cleft,
agbno, aglen);
@@ -861,16 +1010,13 @@ xfs_refcount_merge_extents(
}
/* Try to merge cright and right. */
- ulen = (unsigned long long)right.rc_blockcount + cright.rc_blockcount;
- if (xfs_refc_valid(&right) && xfs_refc_valid(&cright) &&
- right.rc_refcount == cright.rc_refcount + adjust &&
- ulen < MAXREFCEXTLEN) {
+ if (xfs_refc_want_merge_right(&cright, &right, adjust)) {
*shape_changed = true;
return xfs_refcount_merge_right_extent(cur, &right, &cright,
aglen);
}
- return error;
+ return 0;
}
/*
@@ -933,7 +1079,8 @@ xfs_refcount_adjust_extents(
if (*aglen == 0)
return 0;
- error = xfs_refcount_lookup_ge(cur, *agbno, &found_rec);
+ error = xfs_refcount_lookup_ge(cur, XFS_REFC_DOMAIN_SHARED, *agbno,
+ &found_rec);
if (error)
goto out_error;
@@ -941,10 +1088,11 @@ xfs_refcount_adjust_extents(
error = xfs_refcount_get_rec(cur, &ext, &found_rec);
if (error)
goto out_error;
- if (!found_rec) {
+ if (!found_rec || ext.rc_domain != XFS_REFC_DOMAIN_SHARED) {
ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks;
ext.rc_blockcount = 0;
ext.rc_refcount = 0;
+ ext.rc_domain = XFS_REFC_DOMAIN_SHARED;
}
/*
@@ -957,6 +1105,8 @@ xfs_refcount_adjust_extents(
tmp.rc_blockcount = min(*aglen,
ext.rc_startblock - *agbno);
tmp.rc_refcount = 1 + adj;
+ tmp.rc_domain = XFS_REFC_DOMAIN_SHARED;
+
trace_xfs_refcount_modify_extent(cur->bc_mp,
cur->bc_ag.pag->pag_agno, &tmp);
@@ -986,15 +1136,30 @@ xfs_refcount_adjust_extents(
(*agbno) += tmp.rc_blockcount;
(*aglen) -= tmp.rc_blockcount;
- error = xfs_refcount_lookup_ge(cur, *agbno,
+ /* Stop if there's nothing left to modify */
+ if (*aglen == 0 || !xfs_refcount_still_have_space(cur))
+ break;
+
+ /* Move the cursor to the start of ext. */
+ error = xfs_refcount_lookup_ge(cur,
+ XFS_REFC_DOMAIN_SHARED, *agbno,
&found_rec);
if (error)
goto out_error;
}
- /* Stop if there's nothing left to modify */
- if (*aglen == 0 || !xfs_refcount_still_have_space(cur))
- break;
+ /*
+ * A previous step trimmed agbno/aglen such that the end of the
+ * range would not be in the middle of the record. If this is
+ * no longer the case, something is seriously wrong with the
+ * btree. Make sure we never feed the synthesized record into
+ * the processing loop below.
+ */
+ if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount == 0) ||
+ XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount > *aglen)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
/*
* Adjust the reference count and either update the tree
@@ -1070,13 +1235,15 @@ xfs_refcount_adjust(
/*
* Ensure that no rcextents cross the boundary of the adjustment range.
*/
- error = xfs_refcount_split_extent(cur, agbno, &shape_changed);
+ error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED,
+ agbno, &shape_changed);
if (error)
goto out_error;
if (shape_changed)
shape_changes++;
- error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed);
+ error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED,
+ agbno + aglen, &shape_changed);
if (error)
goto out_error;
if (shape_changed)
@@ -1085,8 +1252,8 @@ xfs_refcount_adjust(
/*
* Try to merge with the left or right extents of the range.
*/
- error = xfs_refcount_merge_extents(cur, new_agbno, new_aglen, adj,
- XFS_FIND_RCEXT_SHARED, &shape_changed);
+ error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_SHARED,
+ new_agbno, new_aglen, adj, &shape_changed);
if (error)
goto out_error;
if (shape_changed)
@@ -1125,6 +1292,32 @@ xfs_refcount_finish_one_cleanup(
}
/*
+ * Set up a continuation a deferred refcount operation by updating the intent.
+ * Checks to make sure we're not going to run off the end of the AG.
+ */
+static inline int
+xfs_refcount_continue_op(
+ struct xfs_btree_cur *cur,
+ xfs_fsblock_t startblock,
+ xfs_agblock_t new_agbno,
+ xfs_extlen_t new_len,
+ xfs_fsblock_t *new_fsbno)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_perag *pag = cur->bc_ag.pag;
+
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, new_len)))
+ return -EFSCORRUPTED;
+
+ *new_fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
+
+ ASSERT(xfs_verify_fsbext(mp, *new_fsbno, new_len));
+ ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, *new_fsbno));
+
+ return 0;
+}
+
+/*
* Process one of the deferred refcount operations. We pass back the
* btree cursor to maintain our lock on the btree between calls.
* This saves time and eliminates a buffer deadlock between the
@@ -1191,12 +1384,20 @@ xfs_refcount_finish_one(
case XFS_REFCOUNT_INCREASE:
error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
new_len, XFS_REFCOUNT_ADJUST_INCREASE);
- *new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
+ if (error)
+ goto out_drop;
+ if (*new_len > 0)
+ error = xfs_refcount_continue_op(rcur, startblock,
+ new_agbno, *new_len, new_fsb);
break;
case XFS_REFCOUNT_DECREASE:
error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
new_len, XFS_REFCOUNT_ADJUST_DECREASE);
- *new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
+ if (error)
+ goto out_drop;
+ if (*new_len > 0)
+ error = xfs_refcount_continue_op(rcur, startblock,
+ new_agbno, *new_len, new_fsb);
break;
case XFS_REFCOUNT_ALLOC_COW:
*new_fsb = startblock + blockcount;
@@ -1307,7 +1508,8 @@ xfs_refcount_find_shared(
*flen = 0;
/* Try to find a refcount extent that crosses the start */
- error = xfs_refcount_lookup_le(cur, agbno, &have);
+ error = xfs_refcount_lookup_le(cur, XFS_REFC_DOMAIN_SHARED, agbno,
+ &have);
if (error)
goto out_error;
if (!have) {
@@ -1325,6 +1527,8 @@ xfs_refcount_find_shared(
error = -EFSCORRUPTED;
goto out_error;
}
+ if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED)
+ goto done;
/* If the extent ends before the start, look at the next one */
if (tmp.rc_startblock + tmp.rc_blockcount <= agbno) {
@@ -1340,6 +1544,8 @@ xfs_refcount_find_shared(
error = -EFSCORRUPTED;
goto out_error;
}
+ if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED)
+ goto done;
}
/* If the extent starts after the range we want, bail out */
@@ -1371,7 +1577,8 @@ xfs_refcount_find_shared(
error = -EFSCORRUPTED;
goto out_error;
}
- if (tmp.rc_startblock >= agbno + aglen ||
+ if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED ||
+ tmp.rc_startblock >= agbno + aglen ||
tmp.rc_startblock != *fbno + *flen)
break;
*flen = min(*flen + tmp.rc_blockcount, agbno + aglen - *fbno);
@@ -1455,17 +1662,23 @@ xfs_refcount_adjust_cow_extents(
return 0;
/* Find any overlapping refcount records */
- error = xfs_refcount_lookup_ge(cur, agbno, &found_rec);
+ error = xfs_refcount_lookup_ge(cur, XFS_REFC_DOMAIN_COW, agbno,
+ &found_rec);
if (error)
goto out_error;
error = xfs_refcount_get_rec(cur, &ext, &found_rec);
if (error)
goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec &&
+ ext.rc_domain != XFS_REFC_DOMAIN_COW)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
if (!found_rec) {
- ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks +
- XFS_REFC_COW_START;
+ ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks;
ext.rc_blockcount = 0;
ext.rc_refcount = 0;
+ ext.rc_domain = XFS_REFC_DOMAIN_COW;
}
switch (adj) {
@@ -1480,6 +1693,8 @@ xfs_refcount_adjust_cow_extents(
tmp.rc_startblock = agbno;
tmp.rc_blockcount = aglen;
tmp.rc_refcount = 1;
+ tmp.rc_domain = XFS_REFC_DOMAIN_COW;
+
trace_xfs_refcount_modify_extent(cur->bc_mp,
cur->bc_ag.pag->pag_agno, &tmp);
@@ -1542,24 +1757,24 @@ xfs_refcount_adjust_cow(
bool shape_changed;
int error;
- agbno += XFS_REFC_COW_START;
-
/*
* Ensure that no rcextents cross the boundary of the adjustment range.
*/
- error = xfs_refcount_split_extent(cur, agbno, &shape_changed);
+ error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_COW,
+ agbno, &shape_changed);
if (error)
goto out_error;
- error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed);
+ error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_COW,
+ agbno + aglen, &shape_changed);
if (error)
goto out_error;
/*
* Try to merge with the left or right extents of the range.
*/
- error = xfs_refcount_merge_extents(cur, &agbno, &aglen, adj,
- XFS_FIND_RCEXT_COW, &shape_changed);
+ error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_COW, &agbno,
+ &aglen, adj, &shape_changed);
if (error)
goto out_error;
@@ -1666,10 +1881,18 @@ xfs_refcount_recover_extent(
be32_to_cpu(rec->refc.rc_refcount) != 1))
return -EFSCORRUPTED;
- rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0);
+ rr = kmalloc(sizeof(struct xfs_refcount_recovery),
+ GFP_KERNEL | __GFP_NOFAIL);
+ INIT_LIST_HEAD(&rr->rr_list);
xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
- list_add_tail(&rr->rr_list, debris);
+ if (XFS_IS_CORRUPT(cur->bc_mp,
+ rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) {
+ kfree(rr);
+ return -EFSCORRUPTED;
+ }
+
+ list_add_tail(&rr->rr_list, debris);
return 0;
}
@@ -1687,10 +1910,11 @@ xfs_refcount_recover_cow_leftovers(
union xfs_btree_irec low;
union xfs_btree_irec high;
xfs_fsblock_t fsb;
- xfs_agblock_t agbno;
int error;
- if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START)
+ /* reflink filesystems mustn't have AGs larger than 2^31-1 blocks */
+ BUILD_BUG_ON(XFS_MAX_CRC_AG_BLOCKS >= XFS_REFC_COWFLAG);
+ if (mp->m_sb.sb_agblocks > XFS_MAX_CRC_AG_BLOCKS)
return -EOPNOTSUPP;
INIT_LIST_HEAD(&debris);
@@ -1717,7 +1941,7 @@ xfs_refcount_recover_cow_leftovers(
/* Find all the leftover CoW staging extents. */
memset(&low, 0, sizeof(low));
memset(&high, 0, sizeof(high));
- low.rc.rc_startblock = XFS_REFC_COW_START;
+ low.rc.rc_domain = high.rc.rc_domain = XFS_REFC_DOMAIN_COW;
high.rc.rc_startblock = -1U;
error = xfs_btree_query_range(cur, &low, &high,
xfs_refcount_recover_extent, &debris);
@@ -1738,8 +1962,8 @@ xfs_refcount_recover_cow_leftovers(
&rr->rr_rrec);
/* Free the orphan record */
- agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START;
- fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno);
+ fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno,
+ rr->rr_rrec.rc_startblock);
xfs_refcount_free_cow_extent(tp, fsb,
rr->rr_rrec.rc_blockcount);
@@ -1751,7 +1975,7 @@ xfs_refcount_recover_cow_leftovers(
goto out_free;
list_del(&rr->rr_list);
- kmem_free(rr);
+ kfree(rr);
}
return error;
@@ -1761,7 +1985,7 @@ out_free:
/* Free the leftover list */
list_for_each_entry_safe(rr, n, &debris, rr_list) {
list_del(&rr->rr_list);
- kmem_free(rr);
+ kfree(rr);
}
return error;
}
@@ -1770,6 +1994,7 @@ out_free:
int
xfs_refcount_has_record(
struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
xfs_agblock_t bno,
xfs_extlen_t len,
bool *exists)
@@ -1781,6 +2006,7 @@ xfs_refcount_has_record(
low.rc.rc_startblock = bno;
memset(&high, 0xFF, sizeof(high));
high.rc.rc_startblock = bno + len - 1;
+ low.rc.rc_domain = high.rc.rc_domain = domain;
return xfs_btree_has_record(cur, &low, &high, exists);
}
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index e8b322de7f3d..452f30556f5a 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -14,14 +14,33 @@ struct xfs_bmbt_irec;
struct xfs_refcount_irec;
extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur,
- xfs_agblock_t bno, int *stat);
+ enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat);
extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur,
- xfs_agblock_t bno, int *stat);
+ enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat);
extern int xfs_refcount_lookup_eq(struct xfs_btree_cur *cur,
- xfs_agblock_t bno, int *stat);
+ enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat);
extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur,
struct xfs_refcount_irec *irec, int *stat);
+static inline uint32_t
+xfs_refcount_encode_startblock(
+ xfs_agblock_t startblock,
+ enum xfs_refc_domain domain)
+{
+ uint32_t start;
+
+ /*
+ * low level btree operations need to handle the generic btree range
+ * query functions (which set rc_domain == -1U), so we check that the
+ * domain is /not/ shared.
+ */
+ start = startblock & ~XFS_REFC_COWFLAG;
+ if (domain != XFS_REFC_DOMAIN_SHARED)
+ start |= XFS_REFC_COWFLAG;
+
+ return start;
+}
+
enum xfs_refcount_intent_type {
XFS_REFCOUNT_INCREASE = 1,
XFS_REFCOUNT_DECREASE,
@@ -36,6 +55,18 @@ struct xfs_refcount_intent {
xfs_fsblock_t ri_startblock;
};
+/* Check that the refcount is appropriate for the record domain. */
+static inline bool
+xfs_refcount_check_domain(
+ const struct xfs_refcount_irec *irec)
+{
+ if (irec->rc_domain == XFS_REFC_DOMAIN_COW && irec->rc_refcount != 1)
+ return false;
+ if (irec->rc_domain == XFS_REFC_DOMAIN_SHARED && irec->rc_refcount < 2)
+ return false;
+ return true;
+}
+
void xfs_refcount_increase_extent(struct xfs_trans *tp,
struct xfs_bmbt_irec *irec);
void xfs_refcount_decrease_extent(struct xfs_trans *tp,
@@ -79,7 +110,8 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
#define XFS_REFCOUNT_ITEM_OVERHEAD 32
extern int xfs_refcount_has_record(struct xfs_btree_cur *cur,
- xfs_agblock_t bno, xfs_extlen_t len, bool *exists);
+ enum xfs_refc_domain domain, xfs_agblock_t bno,
+ xfs_extlen_t len, bool *exists);
union xfs_btree_rec;
extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec,
struct xfs_refcount_irec *irec);
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 316c1ec0c3c2..e1f789866683 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -13,6 +13,7 @@
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_refcount_btree.h"
+#include "xfs_refcount.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_trace.h"
@@ -160,7 +161,12 @@ xfs_refcountbt_init_rec_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_rec *rec)
{
- rec->refc.rc_startblock = cpu_to_be32(cur->bc_rec.rc.rc_startblock);
+ const struct xfs_refcount_irec *irec = &cur->bc_rec.rc;
+ uint32_t start;
+
+ start = xfs_refcount_encode_startblock(irec->rc_startblock,
+ irec->rc_domain);
+ rec->refc.rc_startblock = cpu_to_be32(start);
rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount);
rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount);
}
@@ -182,10 +188,13 @@ xfs_refcountbt_key_diff(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
- struct xfs_refcount_irec *rec = &cur->bc_rec.rc;
const struct xfs_refcount_key *kp = &key->refc;
+ const struct xfs_refcount_irec *irec = &cur->bc_rec.rc;
+ uint32_t start;
- return (int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock;
+ start = xfs_refcount_encode_startblock(irec->rc_startblock,
+ irec->rc_domain);
+ return (int64_t)be32_to_cpu(kp->rc_startblock) - start;
}
STATIC int64_t
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 094dfc897ebc..b56aca1e7c66 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -235,13 +235,8 @@ xfs_rmap_get_rec(
goto out_bad_rec;
} else {
/* check for valid extent range, including overflow */
- if (!xfs_verify_agbno(pag, irec->rm_startblock))
- goto out_bad_rec;
- if (irec->rm_startblock >
- irec->rm_startblock + irec->rm_blockcount)
- goto out_bad_rec;
- if (!xfs_verify_agbno(pag,
- irec->rm_startblock + irec->rm_blockcount - 1))
+ if (!xfs_verify_agbext(pag, irec->rm_startblock,
+ irec->rm_blockcount))
goto out_bad_rec;
}
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index a20cade590e9..1eeecf2eb2a7 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -972,7 +972,9 @@ xfs_log_sb(
*/
if (xfs_has_lazysbcount(mp)) {
mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
- mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
+ mp->m_sb.sb_ifree = min_t(uint64_t,
+ percpu_counter_sum(&mp->m_ifree),
+ mp->m_sb.sb_icount);
mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 2c4ad6e4bb14..5b2f27cbdb80 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -422,7 +422,7 @@ xfs_calc_itruncate_reservation_minlogsize(
/*
* In renaming a files we can modify:
- * the four inodes involved: 4 * inode size
+ * the five inodes involved: 5 * inode size
* the two directory btrees: 2 * (max depth + v2) * dir block size
* the two directory bmap btrees: 2 * max depth * block size
* And the bmap_finish transaction can free dir and bmap blocks (two sets
@@ -437,7 +437,7 @@ xfs_calc_rename_reservation(
struct xfs_mount *mp)
{
return XFS_DQUOT_LOGRES(mp) +
- max((xfs_calc_inode_res(mp, 4) +
+ max((xfs_calc_inode_res(mp, 5) +
xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index a6b7d98cf68f..5ebdda7e1078 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -166,6 +166,36 @@ typedef struct xfs_bmbt_irec
xfs_exntst_t br_state; /* extent state */
} xfs_bmbt_irec_t;
+enum xfs_refc_domain {
+ XFS_REFC_DOMAIN_SHARED = 0,
+ XFS_REFC_DOMAIN_COW,
+};
+
+#define XFS_REFC_DOMAIN_STRINGS \
+ { XFS_REFC_DOMAIN_SHARED, "shared" }, \
+ { XFS_REFC_DOMAIN_COW, "cow" }
+
+struct xfs_refcount_irec {
+ xfs_agblock_t rc_startblock; /* starting block number */
+ xfs_extlen_t rc_blockcount; /* count of free blocks */
+ xfs_nlink_t rc_refcount; /* number of inodes linked here */
+ enum xfs_refc_domain rc_domain; /* shared or cow staging extent? */
+};
+
+#define XFS_RMAP_ATTR_FORK (1 << 0)
+#define XFS_RMAP_BMBT_BLOCK (1 << 1)
+#define XFS_RMAP_UNWRITTEN (1 << 2)
+#define XFS_RMAP_KEY_FLAGS (XFS_RMAP_ATTR_FORK | \
+ XFS_RMAP_BMBT_BLOCK)
+#define XFS_RMAP_REC_FLAGS (XFS_RMAP_UNWRITTEN)
+struct xfs_rmap_irec {
+ xfs_agblock_t rm_startblock; /* extent start block */
+ xfs_extlen_t rm_blockcount; /* extent length */
+ uint64_t rm_owner; /* extent owner */
+ uint64_t rm_offset; /* offset within the owner */
+ unsigned int rm_flags; /* state flags */
+};
+
/* per-AG block reservation types */
enum xfs_ag_resv_type {
XFS_AG_RESV_NONE = 0,
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index b7b838bd4ba4..4dd52b15f09c 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -609,9 +609,16 @@ out:
/* AGFL */
struct xchk_agfl_info {
- unsigned int sz_entries;
+ /* Number of AGFL entries that the AGF claims are in use. */
+ unsigned int agflcount;
+
+ /* Number of AGFL entries that we found. */
unsigned int nr_entries;
+
+ /* Buffer to hold AGFL entries for extent checking. */
xfs_agblock_t *entries;
+
+ struct xfs_buf *agfl_bp;
struct xfs_scrub *sc;
};
@@ -641,10 +648,10 @@ xchk_agfl_block(
struct xfs_scrub *sc = sai->sc;
if (xfs_verify_agbno(sc->sa.pag, agbno) &&
- sai->nr_entries < sai->sz_entries)
+ sai->nr_entries < sai->agflcount)
sai->entries[sai->nr_entries++] = agbno;
else
- xchk_block_set_corrupt(sc, sc->sa.agfl_bp);
+ xchk_block_set_corrupt(sc, sai->agfl_bp);
xchk_agfl_block_xref(sc, agbno);
@@ -696,19 +703,26 @@ int
xchk_agfl(
struct xfs_scrub *sc)
{
- struct xchk_agfl_info sai;
+ struct xchk_agfl_info sai = {
+ .sc = sc,
+ };
struct xfs_agf *agf;
xfs_agnumber_t agno = sc->sm->sm_agno;
- unsigned int agflcount;
unsigned int i;
int error;
+ /* Lock the AGF and AGI so that nobody can touch this AG. */
error = xchk_ag_read_headers(sc, agno, &sc->sa);
if (!xchk_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error))
- goto out;
+ return error;
if (!sc->sa.agf_bp)
return -EFSCORRUPTED;
- xchk_buffer_recheck(sc, sc->sa.agfl_bp);
+
+ /* Try to read the AGFL, and verify its structure if we get it. */
+ error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &sai.agfl_bp);
+ if (!xchk_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error))
+ return error;
+ xchk_buffer_recheck(sc, sai.agfl_bp);
xchk_agfl_xref(sc);
@@ -717,24 +731,21 @@ xchk_agfl(
/* Allocate buffer to ensure uniqueness of AGFL entries. */
agf = sc->sa.agf_bp->b_addr;
- agflcount = be32_to_cpu(agf->agf_flcount);
- if (agflcount > xfs_agfl_size(sc->mp)) {
+ sai.agflcount = be32_to_cpu(agf->agf_flcount);
+ if (sai.agflcount > xfs_agfl_size(sc->mp)) {
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
goto out;
}
- memset(&sai, 0, sizeof(sai));
- sai.sc = sc;
- sai.sz_entries = agflcount;
- sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount,
- KM_MAYFAIL);
+ sai.entries = kvcalloc(sai.agflcount, sizeof(xfs_agblock_t),
+ XCHK_GFP_FLAGS);
if (!sai.entries) {
error = -ENOMEM;
goto out;
}
/* Check the blocks in the AGFL. */
- error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr,
- sc->sa.agfl_bp, xchk_agfl_block, &sai);
+ error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr, sai.agfl_bp,
+ xchk_agfl_block, &sai);
if (error == -ECANCELED) {
error = 0;
goto out_free;
@@ -742,7 +753,7 @@ xchk_agfl(
if (error)
goto out_free;
- if (agflcount != sai.nr_entries) {
+ if (sai.agflcount != sai.nr_entries) {
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
goto out_free;
}
@@ -758,7 +769,7 @@ xchk_agfl(
}
out_free:
- kmem_free(sai.entries);
+ kvfree(sai.entries);
out:
return error;
}
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 1b0b4e243f77..d75d82151eeb 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -442,12 +442,18 @@ out_revert:
/* AGFL */
struct xrep_agfl {
+ /* Bitmap of alleged AGFL blocks that we're not going to add. */
+ struct xbitmap crossed;
+
/* Bitmap of other OWN_AG metadata blocks. */
struct xbitmap agmetablocks;
/* Bitmap of free space. */
struct xbitmap *freesp;
+ /* rmapbt cursor for finding crosslinked blocks */
+ struct xfs_btree_cur *rmap_cur;
+
struct xfs_scrub *sc;
};
@@ -477,6 +483,41 @@ xrep_agfl_walk_rmap(
return xbitmap_set_btcur_path(&ra->agmetablocks, cur);
}
+/* Strike out the blocks that are cross-linked according to the rmapbt. */
+STATIC int
+xrep_agfl_check_extent(
+ struct xrep_agfl *ra,
+ uint64_t start,
+ uint64_t len)
+{
+ xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(ra->sc->mp, start);
+ xfs_agblock_t last_agbno = agbno + len - 1;
+ int error;
+
+ ASSERT(XFS_FSB_TO_AGNO(ra->sc->mp, start) == ra->sc->sa.pag->pag_agno);
+
+ while (agbno <= last_agbno) {
+ bool other_owners;
+
+ error = xfs_rmap_has_other_keys(ra->rmap_cur, agbno, 1,
+ &XFS_RMAP_OINFO_AG, &other_owners);
+ if (error)
+ return error;
+
+ if (other_owners) {
+ error = xbitmap_set(&ra->crossed, agbno, 1);
+ if (error)
+ return error;
+ }
+
+ if (xchk_should_terminate(ra->sc, &error))
+ return error;
+ agbno++;
+ }
+
+ return 0;
+}
+
/*
* Map out all the non-AGFL OWN_AG space in this AG so that we can deduce
* which blocks belong to the AGFL.
@@ -496,44 +537,58 @@ xrep_agfl_collect_blocks(
struct xrep_agfl ra;
struct xfs_mount *mp = sc->mp;
struct xfs_btree_cur *cur;
+ struct xbitmap_range *br, *n;
int error;
ra.sc = sc;
ra.freesp = agfl_extents;
xbitmap_init(&ra.agmetablocks);
+ xbitmap_init(&ra.crossed);
/* Find all space used by the free space btrees & rmapbt. */
cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xfs_rmap_query_all(cur, xrep_agfl_walk_rmap, &ra);
- if (error)
- goto err;
xfs_btree_del_cursor(cur, error);
+ if (error)
+ goto out_bmp;
/* Find all blocks currently being used by the bnobt. */
cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
sc->sa.pag, XFS_BTNUM_BNO);
error = xbitmap_set_btblocks(&ra.agmetablocks, cur);
- if (error)
- goto err;
xfs_btree_del_cursor(cur, error);
+ if (error)
+ goto out_bmp;
/* Find all blocks currently being used by the cntbt. */
cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
sc->sa.pag, XFS_BTNUM_CNT);
error = xbitmap_set_btblocks(&ra.agmetablocks, cur);
- if (error)
- goto err;
-
xfs_btree_del_cursor(cur, error);
+ if (error)
+ goto out_bmp;
/*
* Drop the freesp meta blocks that are in use by btrees.
* The remaining blocks /should/ be AGFL blocks.
*/
error = xbitmap_disunion(agfl_extents, &ra.agmetablocks);
- xbitmap_destroy(&ra.agmetablocks);
if (error)
- return error;
+ goto out_bmp;
+
+ /* Strike out the blocks that are cross-linked. */
+ ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
+ for_each_xbitmap_extent(br, n, agfl_extents) {
+ error = xrep_agfl_check_extent(&ra, br->start, br->len);
+ if (error)
+ break;
+ }
+ xfs_btree_del_cursor(ra.rmap_cur, error);
+ if (error)
+ goto out_bmp;
+ error = xbitmap_disunion(agfl_extents, &ra.crossed);
+ if (error)
+ goto out_bmp;
/*
* Calculate the new AGFL size. If we found more blocks than fit in
@@ -541,11 +596,10 @@ xrep_agfl_collect_blocks(
*/
*flcount = min_t(uint64_t, xbitmap_hweight(agfl_extents),
xfs_agfl_size(mp));
- return 0;
-err:
+out_bmp:
+ xbitmap_destroy(&ra.crossed);
xbitmap_destroy(&ra.agmetablocks);
- xfs_btree_del_cursor(cur, error);
return error;
}
@@ -631,7 +685,7 @@ xrep_agfl_init_header(
if (br->len)
break;
list_del(&br->list);
- kmem_free(br);
+ kfree(br);
}
/* Write new AGFL to disk. */
@@ -697,7 +751,6 @@ xrep_agfl(
* freespace overflow to the freespace btrees.
*/
sc->sa.agf_bp = agf_bp;
- sc->sa.agfl_bp = agfl_bp;
error = xrep_roll_ag_trans(sc);
if (error)
goto err;
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index ab427b4d7fe0..3b38f4e2a537 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -100,9 +100,7 @@ xchk_allocbt_rec(
bno = be32_to_cpu(rec->alloc.ar_startblock);
len = be32_to_cpu(rec->alloc.ar_blockcount);
- if (bno + len <= bno ||
- !xfs_verify_agbno(pag, bno) ||
- !xfs_verify_agbno(pag, bno + len - 1))
+ if (!xfs_verify_agbext(pag, bno, len))
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
xchk_allocbt_xref(bs->sc, bno, len);
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index b6f0c9f3f124..31529b9bf389 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -49,7 +49,7 @@ xchk_setup_xattr_buf(
if (ab) {
if (sz <= ab->sz)
return 0;
- kmem_free(ab);
+ kvfree(ab);
sc->buf = NULL;
}
@@ -79,7 +79,8 @@ xchk_setup_xattr(
* without the inode lock held, which means we can sleep.
*/
if (sc->flags & XCHK_TRY_HARDER) {
- error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, GFP_KERNEL);
+ error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX,
+ XCHK_GFP_FLAGS);
if (error)
return error;
}
@@ -138,8 +139,7 @@ xchk_xattr_listent(
* doesn't work, we overload the seen_enough variable to convey
* the error message back to the main scrub function.
*/
- error = xchk_setup_xattr_buf(sx->sc, valuelen,
- GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ error = xchk_setup_xattr_buf(sx->sc, valuelen, XCHK_GFP_FLAGS);
if (error == -ENOMEM)
error = -EDEADLOCK;
if (error) {
@@ -324,8 +324,7 @@ xchk_xattr_block(
return 0;
/* Allocate memory for block usage checking. */
- error = xchk_setup_xattr_buf(ds->sc, 0,
- GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ error = xchk_setup_xattr_buf(ds->sc, 0, XCHK_GFP_FLAGS);
if (error == -ENOMEM)
return -EDEADLOCK;
if (error)
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index b89bf9de9b1c..a255f09e9f0a 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -10,6 +10,7 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
+#include "scrub/scrub.h"
#include "scrub/bitmap.h"
/*
@@ -25,7 +26,7 @@ xbitmap_set(
{
struct xbitmap_range *bmr;
- bmr = kmem_alloc(sizeof(struct xbitmap_range), KM_MAYFAIL);
+ bmr = kmalloc(sizeof(struct xbitmap_range), XCHK_GFP_FLAGS);
if (!bmr)
return -ENOMEM;
@@ -47,7 +48,7 @@ xbitmap_destroy(
for_each_xbitmap_extent(bmr, n, bitmap) {
list_del(&bmr->list);
- kmem_free(bmr);
+ kfree(bmr);
}
}
@@ -174,15 +175,15 @@ xbitmap_disunion(
/* Total overlap, just delete ex. */
lp = lp->next;
list_del(&br->list);
- kmem_free(br);
+ kfree(br);
break;
case 0:
/*
* Deleting from the middle: add the new right extent
* and then shrink the left extent.
*/
- new_br = kmem_alloc(sizeof(struct xbitmap_range),
- KM_MAYFAIL);
+ new_br = kmalloc(sizeof(struct xbitmap_range),
+ XCHK_GFP_FLAGS);
if (!new_br) {
error = -ENOMEM;
goto out;
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index f0b9cb6506fd..d50d0eab196a 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -90,6 +90,7 @@ out:
struct xchk_bmap_info {
struct xfs_scrub *sc;
+ struct xfs_iext_cursor icur;
xfs_fileoff_t lastoff;
bool is_rt;
bool is_shared;
@@ -146,6 +147,48 @@ xchk_bmap_get_rmap(
return has_rmap;
}
+static inline bool
+xchk_bmap_has_prev(
+ struct xchk_bmap_info *info,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_bmbt_irec got;
+ struct xfs_ifork *ifp;
+
+ ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork);
+
+ if (!xfs_iext_peek_prev_extent(ifp, &info->icur, &got))
+ return false;
+ if (got.br_startoff + got.br_blockcount != irec->br_startoff)
+ return false;
+ if (got.br_startblock + got.br_blockcount != irec->br_startblock)
+ return false;
+ if (got.br_state != irec->br_state)
+ return false;
+ return true;
+}
+
+static inline bool
+xchk_bmap_has_next(
+ struct xchk_bmap_info *info,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_bmbt_irec got;
+ struct xfs_ifork *ifp;
+
+ ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork);
+
+ if (!xfs_iext_peek_next_extent(ifp, &info->icur, &got))
+ return false;
+ if (irec->br_startoff + irec->br_blockcount != got.br_startoff)
+ return false;
+ if (irec->br_startblock + irec->br_blockcount != got.br_startblock)
+ return false;
+ if (got.br_state != irec->br_state)
+ return false;
+ return true;
+}
+
/* Make sure that we have rmapbt records for this extent. */
STATIC void
xchk_bmap_xref_rmap(
@@ -214,6 +257,34 @@ xchk_bmap_xref_rmap(
if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
+
+ /*
+ * If the rmap starts before this bmbt record, make sure there's a bmbt
+ * record for the previous offset that is contiguous with this mapping.
+ * Skip this for CoW fork extents because the refcount btree (and not
+ * the inode) is the ondisk owner for those extents.
+ */
+ if (info->whichfork != XFS_COW_FORK && rmap.rm_startblock < agbno &&
+ !xchk_bmap_has_prev(info, irec)) {
+ xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+ return;
+ }
+
+ /*
+ * If the rmap ends after this bmbt record, make sure there's a bmbt
+ * record for the next offset that is contiguous with this mapping.
+ * Skip this for CoW fork extents because the refcount btree (and not
+ * the inode) is the ondisk owner for those extents.
+ */
+ rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount;
+ if (info->whichfork != XFS_COW_FORK &&
+ rmap_end > agbno + irec->br_blockcount &&
+ !xchk_bmap_has_next(info, irec)) {
+ xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+ return;
+ }
}
/* Cross-reference a single rtdev extent record. */
@@ -264,6 +335,8 @@ xchk_bmap_iextent_xref(
case XFS_COW_FORK:
xchk_xref_is_cow_staging(info->sc, agbno,
irec->br_blockcount);
+ xchk_xref_is_not_shared(info->sc, agbno,
+ irec->br_blockcount);
break;
}
@@ -297,14 +370,13 @@ xchk_bmap_dirattr_extent(
}
/* Scrub a single extent record. */
-STATIC int
+STATIC void
xchk_bmap_iextent(
struct xfs_inode *ip,
struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec)
{
struct xfs_mount *mp = info->sc->mp;
- int error = 0;
/*
* Check for out-of-order extents. This record could have come
@@ -325,14 +397,6 @@ xchk_bmap_iextent(
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
- /*
- * Check for delalloc extents. We never iterate the ones in the
- * in-core extent scan, and we should never see these in the bmbt.
- */
- if (isnullstartblock(irec->br_startblock))
- xchk_fblock_set_corrupt(info->sc, info->whichfork,
- irec->br_startoff);
-
/* Make sure the extent points to a valid place. */
if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN)
xchk_fblock_set_corrupt(info->sc, info->whichfork,
@@ -353,15 +417,12 @@ xchk_bmap_iextent(
irec->br_startoff);
if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return 0;
+ return;
if (info->is_rt)
xchk_bmap_rt_iextent_xref(ip, info, irec);
else
xchk_bmap_iextent_xref(ip, info, irec);
-
- info->lastoff = irec->br_startoff + irec->br_blockcount;
- return error;
}
/* Scrub a bmbt record. */
@@ -599,14 +660,41 @@ xchk_bmap_check_rmaps(
for_each_perag(sc->mp, agno, pag) {
error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag);
- if (error)
- break;
- if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- break;
+ if (error ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
+ xfs_perag_put(pag);
+ return error;
+ }
}
- if (pag)
- xfs_perag_put(pag);
- return error;
+
+ return 0;
+}
+
+/* Scrub a delalloc reservation from the incore extent map tree. */
+STATIC void
+xchk_bmap_iextent_delalloc(
+ struct xfs_inode *ip,
+ struct xchk_bmap_info *info,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_mount *mp = info->sc->mp;
+
+ /*
+ * Check for out-of-order extents. This record could have come
+ * from the incore list, for which there is no ordering check.
+ */
+ if (irec->br_startoff < info->lastoff)
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount))
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ /* Make sure the extent points to a valid place. */
+ if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN)
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
}
/*
@@ -626,7 +714,6 @@ xchk_bmap(
struct xfs_inode *ip = sc->ip;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
xfs_fileoff_t endoff;
- struct xfs_iext_cursor icur;
int error = 0;
/* Non-existent forks can be ignored. */
@@ -661,6 +748,8 @@ xchk_bmap(
case XFS_DINODE_FMT_DEV:
case XFS_DINODE_FMT_LOCAL:
/* No mappings to check. */
+ if (whichfork == XFS_COW_FORK)
+ xchk_fblock_set_corrupt(sc, whichfork, 0);
goto out;
case XFS_DINODE_FMT_EXTENTS:
break;
@@ -690,20 +779,22 @@ xchk_bmap(
/* Scrub extent records. */
info.lastoff = 0;
ifp = xfs_ifork_ptr(ip, whichfork);
- for_each_xfs_iext(ifp, &icur, &irec) {
+ for_each_xfs_iext(ifp, &info.icur, &irec) {
if (xchk_should_terminate(sc, &error) ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
goto out;
- if (isnullstartblock(irec.br_startblock))
- continue;
+
if (irec.br_startoff >= endoff) {
xchk_fblock_set_corrupt(sc, whichfork,
irec.br_startoff);
goto out;
}
- error = xchk_bmap_iextent(ip, &info, &irec);
- if (error)
- goto out;
+
+ if (isnullstartblock(irec.br_startblock))
+ xchk_bmap_iextent_delalloc(ip, &info, &irec);
+ else
+ xchk_bmap_iextent(ip, &info, &irec);
+ info.lastoff = irec.br_startoff + irec.br_blockcount;
}
error = xchk_bmap_check_rmaps(sc, whichfork);
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 2f4519590dc1..0fd36d5b4646 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -408,7 +408,6 @@ xchk_btree_check_owner(
struct xfs_buf *bp)
{
struct xfs_btree_cur *cur = bs->cur;
- struct check_owner *co;
/*
* In theory, xfs_btree_get_block should only give us a null buffer
@@ -431,10 +430,13 @@ xchk_btree_check_owner(
* later scanning.
*/
if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) {
- co = kmem_alloc(sizeof(struct check_owner),
- KM_MAYFAIL);
+ struct check_owner *co;
+
+ co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS);
if (!co)
return -ENOMEM;
+
+ INIT_LIST_HEAD(&co->list);
co->level = level;
co->daddr = xfs_buf_daddr(bp);
list_add_tail(&co->list, &bs->to_check);
@@ -649,7 +651,7 @@ xchk_btree(
xchk_btree_set_corrupt(sc, cur, 0);
return 0;
}
- bs = kmem_zalloc(cur_sz, KM_NOFS | KM_MAYFAIL);
+ bs = kzalloc(cur_sz, XCHK_GFP_FLAGS);
if (!bs)
return -ENOMEM;
bs->cur = cur;
@@ -740,9 +742,9 @@ out:
error = xchk_btree_check_block_owner(bs, co->level,
co->daddr);
list_del(&co->list);
- kmem_free(co);
+ kfree(co);
}
- kmem_free(bs);
+ kfree(bs);
return error;
}
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 9bbbf20f401b..613260b04a3d 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -424,10 +424,6 @@ xchk_ag_read_headers(
if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
return error;
- error = xfs_alloc_read_agfl(sa->pag, sc->tp, &sa->agfl_bp);
- if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL))
- return error;
-
return 0;
}
@@ -515,10 +511,6 @@ xchk_ag_free(
struct xchk_ag *sa)
{
xchk_ag_btcur_free(sa);
- if (sa->agfl_bp) {
- xfs_trans_brelse(sc->tp, sa->agfl_bp);
- sa->agfl_bp = NULL;
- }
if (sa->agf_bp) {
xfs_trans_brelse(sc->tp, sa->agf_bp);
sa->agf_bp = NULL;
@@ -789,6 +781,33 @@ xchk_buffer_recheck(
trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa);
}
+static inline int
+xchk_metadata_inode_subtype(
+ struct xfs_scrub *sc,
+ unsigned int scrub_type)
+{
+ __u32 smtype = sc->sm->sm_type;
+ int error;
+
+ sc->sm->sm_type = scrub_type;
+
+ switch (scrub_type) {
+ case XFS_SCRUB_TYPE_INODE:
+ error = xchk_inode(sc);
+ break;
+ case XFS_SCRUB_TYPE_BMBTD:
+ error = xchk_bmap_data(sc);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ break;
+ }
+
+ sc->sm->sm_type = smtype;
+ return error;
+}
+
/*
* Scrub the attr/data forks of a metadata inode. The metadata inode must be
* pointed to by sc->ip and the ILOCK must be held.
@@ -797,13 +816,17 @@ int
xchk_metadata_inode_forks(
struct xfs_scrub *sc)
{
- __u32 smtype;
bool shared;
int error;
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return 0;
+ /* Check the inode record. */
+ error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
+
/* Metadata inodes don't live on the rt device. */
if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) {
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
@@ -823,10 +846,7 @@ xchk_metadata_inode_forks(
}
/* Invoke the data fork scrubber. */
- smtype = sc->sm->sm_type;
- sc->sm->sm_type = XFS_SCRUB_TYPE_BMBTD;
- error = xchk_bmap_data(sc);
- sc->sm->sm_type = smtype;
+ error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
return error;
@@ -841,7 +861,7 @@ xchk_metadata_inode_forks(
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
}
- return error;
+ return 0;
}
/*
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 454145db10e7..b73648d81d23 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -25,7 +25,7 @@ xchk_should_terminate(
if (fatal_signal_pending(current)) {
if (*error == 0)
- *error = -EAGAIN;
+ *error = -EINTR;
return true;
}
return false;
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index 84fe3d33d699..d17cee177085 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -486,7 +486,7 @@ xchk_da_btree(
return 0;
/* Set up initial da state. */
- ds = kmem_zalloc(sizeof(struct xchk_da_btree), KM_NOFS | KM_MAYFAIL);
+ ds = kzalloc(sizeof(struct xchk_da_btree), XCHK_GFP_FLAGS);
if (!ds)
return -ENOMEM;
ds->dargs.dp = sc->ip;
@@ -591,6 +591,6 @@ out:
out_state:
xfs_da_state_free(ds->state);
- kmem_free(ds);
+ kfree(ds);
return error;
}
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 5abb5fdb71d9..d1b0f23c2c59 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -99,7 +99,7 @@ out:
* we check the inode number to make sure it's sane, then we check that
* we can look up this filename. Finally, we check the ftype.
*/
-STATIC int
+STATIC bool
xchk_dir_actor(
struct dir_context *dir_iter,
const char *name,
@@ -124,7 +124,7 @@ xchk_dir_actor(
xfs_dir2_dataptr_to_db(mp->m_dir_geo, pos));
if (xchk_should_terminate(sdc->sc, &error))
- return error;
+ return !error;
/* Does this inode number make sense? */
if (!xfs_verify_dir_ino(mp, ino)) {
@@ -191,8 +191,8 @@ out:
* and return zero to xchk_directory.
*/
if (error == 0 && sdc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return -EFSCORRUPTED;
- return error;
+ return false;
+ return !error;
}
/* Scrub a directory btree record. */
@@ -666,7 +666,12 @@ xchk_directory_blocks(
struct xfs_scrub *sc)
{
struct xfs_bmbt_irec got;
- struct xfs_da_args args;
+ struct xfs_da_args args = {
+ .dp = sc ->ip,
+ .whichfork = XFS_DATA_FORK,
+ .geo = sc->mp->m_dir_geo,
+ .trans = sc->tp,
+ };
struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
struct xfs_mount *mp = sc->mp;
xfs_fileoff_t leaf_lblk;
@@ -676,7 +681,7 @@ xchk_directory_blocks(
xfs_dablk_t dabno;
xfs_dir2_db_t last_data_db = 0;
bool found;
- int is_block = 0;
+ bool is_block = false;
int error;
/* Ignore local format directories. */
@@ -689,9 +694,6 @@ xchk_directory_blocks(
free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET);
/* Is this a block dir? */
- args.dp = sc->ip;
- args.geo = mp->m_dir_geo;
- args.trans = sc->tp;
error = xfs_dir2_isblock(&args, &is_block);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
goto out;
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index 6a6f8fe7f87c..4777e7b89fdc 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -14,6 +14,8 @@
#include "xfs_health.h"
#include "xfs_btree.h"
#include "xfs_ag.h"
+#include "xfs_rtalloc.h"
+#include "xfs_inode.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -43,6 +45,16 @@
* our tolerance for mismatch between expected and actual counter values.
*/
+struct xchk_fscounters {
+ struct xfs_scrub *sc;
+ uint64_t icount;
+ uint64_t ifree;
+ uint64_t fdblocks;
+ uint64_t frextents;
+ unsigned long long icount_min;
+ unsigned long long icount_max;
+};
+
/*
* Since the expected value computation is lockless but only browses incore
* values, the percpu counters should be fairly close to each other. However,
@@ -116,10 +128,11 @@ xchk_setup_fscounters(
struct xchk_fscounters *fsc;
int error;
- sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0);
+ sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS);
if (!sc->buf)
return -ENOMEM;
fsc = sc->buf;
+ fsc->sc = sc;
xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max);
@@ -138,6 +151,18 @@ xchk_setup_fscounters(
return xchk_trans_alloc(sc, 0);
}
+/*
+ * Part 1: Collecting filesystem summary counts. For each AG, we add its
+ * summary counts (total inodes, free inodes, free data blocks) to an incore
+ * copy of the overall filesystem summary counts.
+ *
+ * To avoid false corruption reports in part 2, any failure in this part must
+ * set the INCOMPLETE flag even when a negative errno is returned. This care
+ * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
+ * ECANCELED) that are absorbed into a scrub state flag update by
+ * xchk_*_process_error.
+ */
+
/* Count free space btree blocks manually for pre-lazysbcount filesystems. */
static int
xchk_fscount_btreeblks(
@@ -225,8 +250,10 @@ retry:
}
if (pag)
xfs_perag_put(pag);
- if (error)
+ if (error) {
+ xchk_set_incomplete(sc);
return error;
+ }
/*
* The global incore space reservation is taken from the incore
@@ -267,6 +294,64 @@ retry:
return 0;
}
+#ifdef CONFIG_XFS_RT
+STATIC int
+xchk_fscount_add_frextent(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ struct xchk_fscounters *fsc = priv;
+ int error = 0;
+
+ fsc->frextents += rec->ar_extcount;
+
+ xchk_should_terminate(fsc->sc, &error);
+ return error;
+}
+
+/* Calculate the number of free realtime extents from the realtime bitmap. */
+STATIC int
+xchk_fscount_count_frextents(
+ struct xfs_scrub *sc,
+ struct xchk_fscounters *fsc)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error;
+
+ fsc->frextents = 0;
+ if (!xfs_has_realtime(mp))
+ return 0;
+
+ xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ error = xfs_rtalloc_query_all(sc->mp, sc->tp,
+ xchk_fscount_add_frextent, fsc);
+ if (error) {
+ xchk_set_incomplete(sc);
+ goto out_unlock;
+ }
+
+out_unlock:
+ xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ return error;
+}
+#else
+STATIC int
+xchk_fscount_count_frextents(
+ struct xfs_scrub *sc,
+ struct xchk_fscounters *fsc)
+{
+ fsc->frextents = 0;
+ return 0;
+}
+#endif /* CONFIG_XFS_RT */
+
+/*
+ * Part 2: Comparing filesystem summary counters. All we have to do here is
+ * sum the percpu counters and compare them to what we've observed.
+ */
+
/*
* Is the @counter reasonably close to the @expected value?
*
@@ -333,16 +418,17 @@ xchk_fscounters(
{
struct xfs_mount *mp = sc->mp;
struct xchk_fscounters *fsc = sc->buf;
- int64_t icount, ifree, fdblocks;
+ int64_t icount, ifree, fdblocks, frextents;
int error;
/* Snapshot the percpu counters. */
icount = percpu_counter_sum(&mp->m_icount);
ifree = percpu_counter_sum(&mp->m_ifree);
fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+ frextents = percpu_counter_sum(&mp->m_frextents);
/* No negative values, please! */
- if (icount < 0 || ifree < 0 || fdblocks < 0)
+ if (icount < 0 || ifree < 0 || fdblocks < 0 || frextents < 0)
xchk_set_corrupt(sc);
/* See if icount is obviously wrong. */
@@ -353,6 +439,10 @@ xchk_fscounters(
if (fdblocks > mp->m_sb.sb_dblocks)
xchk_set_corrupt(sc);
+ /* See if frextents is obviously wrong. */
+ if (frextents > mp->m_sb.sb_rextents)
+ xchk_set_corrupt(sc);
+
/*
* If ifree exceeds icount by more than the minimum variance then
* something's probably wrong with the counters.
@@ -367,6 +457,13 @@ xchk_fscounters(
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
return 0;
+ /* Count the free extents counter for rt volumes. */
+ error = xchk_fscount_count_frextents(sc, fsc);
+ if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
+ return error;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
+ return 0;
+
/* Compare the in-core counters with whatever we counted. */
if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount))
xchk_set_corrupt(sc);
@@ -378,5 +475,9 @@ xchk_fscounters(
fsc->fdblocks))
xchk_set_corrupt(sc);
+ if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
+ fsc->frextents))
+ xchk_set_corrupt(sc);
+
return 0;
}
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index e1026e07bf94..e312be7cd375 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -108,9 +108,8 @@ xchk_iallocbt_chunk(
xfs_agblock_t bno;
bno = XFS_AGINO_TO_AGBNO(mp, agino);
- if (bno + len <= bno ||
- !xfs_verify_agbno(pag, bno) ||
- !xfs_verify_agbno(pag, bno + len - 1))
+
+ if (!xfs_verify_agbext(pag, bno, len))
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
xchk_iallocbt_chunk_xref(bs->sc, irec, agino, bno, len);
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 51820b40ab1c..7a2f38e5202c 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -365,7 +365,7 @@ xchk_dinode(
* pagecache can't cache all the blocks in this file due to
* overly large offsets, flag the inode for admin review.
*/
- if (isize >= mp->m_super->s_maxbytes)
+ if (isize > mp->m_super->s_maxbytes)
xchk_ino_set_warning(sc, ino);
/* di_nblocks */
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index ab182a5cd0c0..d8dff3fd8053 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -38,7 +38,7 @@ struct xchk_parent_ctx {
};
/* Look for a single entry in a directory pointing to an inode. */
-STATIC int
+STATIC bool
xchk_parent_actor(
struct dir_context *dc,
const char *name,
@@ -62,7 +62,7 @@ xchk_parent_actor(
if (xchk_should_terminate(spc->sc, &error))
spc->cancelled = true;
- return error;
+ return !error;
}
/* Count the number of dentries in the parent dir that point to this inode. */
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 21b4c9006859..9eeac8565394 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -14,6 +14,7 @@
#include "xfs_inode.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
+#include "xfs_bmap.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -84,7 +85,7 @@ xchk_quota_item(
int error = 0;
if (xchk_should_terminate(sc, &error))
- return -ECANCELED;
+ return error;
/*
* Except for the root dquot, the actual dquot we got must either have
@@ -189,11 +190,12 @@ xchk_quota_data_fork(
for_each_xfs_iext(ifp, &icur, &irec) {
if (xchk_should_terminate(sc, &error))
break;
+
/*
- * delalloc extents or blocks mapped above the highest
+ * delalloc/unwritten extents or blocks mapped above the highest
* quota id shouldn't happen.
*/
- if (isnullstartblock(irec.br_startblock) ||
+ if (!xfs_bmap_is_written_extent(&irec) ||
irec.br_startoff > max_dqid_off ||
irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index c68b767dc08f..d9c1b3cea4a5 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -127,8 +127,8 @@ xchk_refcountbt_rmap_check(
* is healthy each rmap_irec we see will be in agbno order
* so we don't need insertion sort here.
*/
- frag = kmem_alloc(sizeof(struct xchk_refcnt_frag),
- KM_MAYFAIL);
+ frag = kmalloc(sizeof(struct xchk_refcnt_frag),
+ XCHK_GFP_FLAGS);
if (!frag)
return -ENOMEM;
memcpy(&frag->rm, rec, sizeof(frag->rm));
@@ -215,7 +215,7 @@ xchk_refcountbt_process_rmap_fragments(
continue;
}
list_del(&frag->list);
- kmem_free(frag);
+ kfree(frag);
nr++;
}
@@ -257,11 +257,11 @@ done:
/* Delete fragments and work list. */
list_for_each_entry_safe(frag, n, &worklist, list) {
list_del(&frag->list);
- kmem_free(frag);
+ kfree(frag);
}
list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
list_del(&frag->list);
- kmem_free(frag);
+ kfree(frag);
}
}
@@ -269,15 +269,13 @@ done:
STATIC void
xchk_refcountbt_xref_rmap(
struct xfs_scrub *sc,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- xfs_nlink_t refcount)
+ const struct xfs_refcount_irec *irec)
{
struct xchk_refcnt_check refchk = {
- .sc = sc,
- .bno = bno,
- .len = len,
- .refcount = refcount,
+ .sc = sc,
+ .bno = irec->rc_startblock,
+ .len = irec->rc_blockcount,
+ .refcount = irec->rc_refcount,
.seen = 0,
};
struct xfs_rmap_irec low;
@@ -291,9 +289,9 @@ xchk_refcountbt_xref_rmap(
/* Cross-reference with the rmapbt to confirm the refcount. */
memset(&low, 0, sizeof(low));
- low.rm_startblock = bno;
+ low.rm_startblock = irec->rc_startblock;
memset(&high, 0xFF, sizeof(high));
- high.rm_startblock = bno + len - 1;
+ high.rm_startblock = irec->rc_startblock + irec->rc_blockcount - 1;
INIT_LIST_HEAD(&refchk.fragments);
error = xfs_rmap_query_range(sc->sa.rmap_cur, &low, &high,
@@ -302,30 +300,29 @@ xchk_refcountbt_xref_rmap(
goto out_free;
xchk_refcountbt_process_rmap_fragments(&refchk);
- if (refcount != refchk.seen)
+ if (irec->rc_refcount != refchk.seen)
xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
out_free:
list_for_each_entry_safe(frag, n, &refchk.fragments, list) {
list_del(&frag->list);
- kmem_free(frag);
+ kfree(frag);
}
}
/* Cross-reference with the other btrees. */
STATIC void
xchk_refcountbt_xref(
- struct xfs_scrub *sc,
- xfs_agblock_t agbno,
- xfs_extlen_t len,
- xfs_nlink_t refcount)
+ struct xfs_scrub *sc,
+ const struct xfs_refcount_irec *irec)
{
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return;
- xchk_xref_is_used_space(sc, agbno, len);
- xchk_xref_is_not_inode_chunk(sc, agbno, len);
- xchk_refcountbt_xref_rmap(sc, agbno, len, refcount);
+ xchk_xref_is_used_space(sc, irec->rc_startblock, irec->rc_blockcount);
+ xchk_xref_is_not_inode_chunk(sc, irec->rc_startblock,
+ irec->rc_blockcount);
+ xchk_refcountbt_xref_rmap(sc, irec);
}
/* Scrub a refcountbt record. */
@@ -334,35 +331,27 @@ xchk_refcountbt_rec(
struct xchk_btree *bs,
const union xfs_btree_rec *rec)
{
+ struct xfs_refcount_irec irec;
xfs_agblock_t *cow_blocks = bs->private;
struct xfs_perag *pag = bs->cur->bc_ag.pag;
- xfs_agblock_t bno;
- xfs_extlen_t len;
- xfs_nlink_t refcount;
- bool has_cowflag;
- bno = be32_to_cpu(rec->refc.rc_startblock);
- len = be32_to_cpu(rec->refc.rc_blockcount);
- refcount = be32_to_cpu(rec->refc.rc_refcount);
+ xfs_refcount_btrec_to_irec(rec, &irec);
- /* Only CoW records can have refcount == 1. */
- has_cowflag = (bno & XFS_REFC_COW_START);
- if ((refcount == 1 && !has_cowflag) || (refcount != 1 && has_cowflag))
+ /* Check the domain and refcount are not incompatible. */
+ if (!xfs_refcount_check_domain(&irec))
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
- if (has_cowflag)
- (*cow_blocks) += len;
+
+ if (irec.rc_domain == XFS_REFC_DOMAIN_COW)
+ (*cow_blocks) += irec.rc_blockcount;
/* Check the extent. */
- bno &= ~XFS_REFC_COW_START;
- if (bno + len <= bno ||
- !xfs_verify_agbno(pag, bno) ||
- !xfs_verify_agbno(pag, bno + len - 1))
+ if (!xfs_verify_agbext(pag, irec.rc_startblock, irec.rc_blockcount))
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
- if (refcount == 0)
+ if (irec.rc_refcount == 0)
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
- xchk_refcountbt_xref(bs->sc, bno, len, refcount);
+ xchk_refcountbt_xref(bs->sc, &irec);
return 0;
}
@@ -426,7 +415,6 @@ xchk_xref_is_cow_staging(
xfs_extlen_t len)
{
struct xfs_refcount_irec rc;
- bool has_cowflag;
int has_refcount;
int error;
@@ -434,8 +422,8 @@ xchk_xref_is_cow_staging(
return;
/* Find the CoW staging extent. */
- error = xfs_refcount_lookup_le(sc->sa.refc_cur,
- agbno + XFS_REFC_COW_START, &has_refcount);
+ error = xfs_refcount_lookup_le(sc->sa.refc_cur, XFS_REFC_DOMAIN_COW,
+ agbno, &has_refcount);
if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur))
return;
if (!has_refcount) {
@@ -451,9 +439,8 @@ xchk_xref_is_cow_staging(
return;
}
- /* CoW flag must be set, refcount must be 1. */
- has_cowflag = (rc.rc_startblock & XFS_REFC_COW_START);
- if (!has_cowflag || rc.rc_refcount != 1)
+ /* CoW lookup returned a shared extent record? */
+ if (rc.rc_domain != XFS_REFC_DOMAIN_COW)
xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
/* Must be at least as long as what was passed in */
@@ -477,7 +464,8 @@ xchk_xref_is_not_shared(
if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm))
return;
- error = xfs_refcount_has_record(sc->sa.refc_cur, agbno, len, &shared);
+ error = xfs_refcount_has_record(sc->sa.refc_cur, XFS_REFC_DOMAIN_SHARED,
+ agbno, len, &shared);
if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur))
return;
if (shared)
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index c18bd039fce9..4b92f9253ccd 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -61,7 +61,6 @@ xrep_attempt(
sc->flags |= XREP_ALREADY_FIXED;
return -EAGAIN;
case -EDEADLOCK:
- case -EAGAIN:
/* Tell the caller to try again having grabbed all the locks. */
if (!(sc->flags & XCHK_TRY_HARDER)) {
sc->flags |= XCHK_TRY_HARDER;
@@ -70,10 +69,15 @@ xrep_attempt(
/*
* We tried harder but still couldn't grab all the resources
* we needed to fix it. The corruption has not been fixed,
- * so report back to userspace.
+ * so exit to userspace with the scan's output flags unchanged.
*/
- return -EFSCORRUPTED;
+ return 0;
default:
+ /*
+ * EAGAIN tells the caller to re-scrub, so we cannot return
+ * that here.
+ */
+ ASSERT(error != -EAGAIN);
return error;
}
}
@@ -121,32 +125,40 @@ xrep_roll_ag_trans(
{
int error;
- /* Keep the AG header buffers locked so we can keep going. */
- if (sc->sa.agi_bp)
+ /*
+ * Keep the AG header buffers locked while we roll the transaction.
+ * Ensure that both AG buffers are dirty and held when we roll the
+ * transaction so that they move forward in the log without losing the
+ * bli (and hence the bli type) when the transaction commits.
+ *
+ * Normal code would never hold clean buffers across a roll, but repair
+ * needs both buffers to maintain a total lock on the AG.
+ */
+ if (sc->sa.agi_bp) {
+ xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM);
xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
- if (sc->sa.agf_bp)
+ }
+
+ if (sc->sa.agf_bp) {
+ xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM);
xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
- if (sc->sa.agfl_bp)
- xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
+ }
/*
- * Roll the transaction. We still own the buffer and the buffer lock
- * regardless of whether or not the roll succeeds. If the roll fails,
- * the buffers will be released during teardown on our way out of the
- * kernel. If it succeeds, we join them to the new transaction and
- * move on.
+ * Roll the transaction. We still hold the AG header buffers locked
+ * regardless of whether or not that succeeds. On failure, the buffers
+ * will be released during teardown on our way out of the kernel. If
+ * successful, join the buffers to the new transaction and move on.
*/
error = xfs_trans_roll(&sc->tp);
if (error)
return error;
- /* Join AG headers to the new transaction. */
+ /* Join the AG headers to the new transaction. */
if (sc->sa.agi_bp)
xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
if (sc->sa.agf_bp)
xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
- if (sc->sa.agfl_bp)
- xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
return 0;
}
@@ -498,6 +510,7 @@ xrep_put_freelist(
struct xfs_scrub *sc,
xfs_agblock_t agbno)
{
+ struct xfs_buf *agfl_bp;
int error;
/* Make sure there's space on the freelist. */
@@ -516,8 +529,12 @@ xrep_put_freelist(
return error;
/* Put the block on the AGFL. */
+ error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
+ if (error)
+ return error;
+
error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
- sc->sa.agfl_bp, agbno, 0);
+ agfl_bp, agbno, 0);
if (error)
return error;
xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 2e8e400f10a9..07a7a75f987f 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -174,7 +174,7 @@ xchk_teardown(
if (sc->flags & XCHK_REAPING_DISABLED)
xchk_start_reaping(sc);
if (sc->buf) {
- kmem_free(sc->buf);
+ kvfree(sc->buf);
sc->buf = NULL;
}
return error;
@@ -467,7 +467,7 @@ xfs_scrub_metadata(
xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB,
"EXPERIMENTAL online scrub feature in use. Use at your own risk!");
- sc = kmem_zalloc(sizeof(struct xfs_scrub), KM_NOFS | KM_MAYFAIL);
+ sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS);
if (!sc) {
error = -ENOMEM;
goto out;
@@ -557,7 +557,7 @@ out_nofix:
out_teardown:
error = xchk_teardown(sc, error);
out_sc:
- kmem_free(sc);
+ kfree(sc);
out:
trace_xchk_done(XFS_I(file_inode(file)), sm, error);
if (error == -EFSCORRUPTED || error == -EFSBADCRC) {
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 3de5287e98d8..b4d391b4c938 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -8,6 +8,15 @@
struct xfs_scrub;
+/*
+ * Standard flags for allocating memory within scrub. NOFS context is
+ * configured by the process allocation scope. Scrub and repair must be able
+ * to back out gracefully if there isn't enough memory. Force-cast to avoid
+ * complaints from static checkers.
+ */
+#define XCHK_GFP_FLAGS ((__force gfp_t)(GFP_KERNEL | __GFP_NOWARN | \
+ __GFP_RETRY_MAYFAIL))
+
/* Type info and names for the scrub types. */
enum xchk_type {
ST_NONE = 1, /* disabled */
@@ -39,7 +48,6 @@ struct xchk_ag {
/* AG btree roots */
struct xfs_buf *agf_bp;
- struct xfs_buf *agfl_bp;
struct xfs_buf *agi_bp;
/* AG btrees */
@@ -161,12 +169,4 @@ void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno,
# define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0)
#endif
-struct xchk_fscounters {
- uint64_t icount;
- uint64_t ifree;
- uint64_t fdblocks;
- unsigned long long icount_min;
- unsigned long long icount_max;
-};
-
#endif /* __XFS_SCRUB_SCRUB_H__ */
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index 75311f8daeeb..c1c99ffe7408 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -21,7 +21,7 @@ xchk_setup_symlink(
struct xfs_scrub *sc)
{
/* Allocate the buffer without the inode lock held. */
- sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, GFP_KERNEL);
+ sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, XCHK_GFP_FLAGS);
if (!sc->buf)
return -ENOMEM;
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index b744c62052b6..a05f44eb8178 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -242,12 +242,13 @@ xfs_acl_set_mode(
}
int
-xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+xfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
umode_t mode;
bool set_mode = false;
int error = 0;
+ struct inode *inode = d_inode(dentry);
if (!acl)
goto set_acl;
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 263404d0bfda..dcd176149c7a 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -11,7 +11,7 @@ struct posix_acl;
#ifdef CONFIG_XFS_POSIX_ACL
extern struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu);
-extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+extern int xfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
void xfs_forget_acl(struct inode *inode, const char *name);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 5d1a995b15f8..41734202796f 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -17,6 +17,8 @@
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_reflink.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
struct xfs_writepage_ctx {
struct iomap_writepage_ctx ctx;
@@ -114,9 +116,8 @@ xfs_end_ioend(
if (unlikely(error)) {
if (ioend->io_flags & IOMAP_F_SHARED) {
xfs_reflink_cancel_cow_range(ip, offset, size, true);
- xfs_bmap_punch_delalloc_range(ip,
- XFS_B_TO_FSBT(mp, offset),
- XFS_B_TO_FSB(mp, size));
+ xfs_bmap_punch_delalloc_range(ip, offset,
+ offset + size);
}
goto done;
}
@@ -218,11 +219,17 @@ xfs_imap_valid(
* checked (and found nothing at this offset) could have added
* overlapping blocks.
*/
- if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq))
+ if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) {
+ trace_xfs_wb_data_iomap_invalid(ip, &wpc->iomap,
+ XFS_WPC(wpc)->data_seq, XFS_DATA_FORK);
return false;
+ }
if (xfs_inode_has_cow_data(ip) &&
- XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
+ XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) {
+ trace_xfs_wb_cow_iomap_invalid(ip, &wpc->iomap,
+ XFS_WPC(wpc)->cow_seq, XFS_COW_FORK);
return false;
+ }
return true;
}
@@ -286,6 +293,8 @@ xfs_map_blocks(
if (xfs_is_shutdown(mp))
return -EIO;
+ XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
+
/*
* COW fork blocks can overlap data fork blocks even if the blocks
* aren't shared. COW I/O always takes precedent, so we must always
@@ -373,7 +382,7 @@ retry:
isnullstartblock(imap.br_startblock))
goto allocate_blocks;
- xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0);
+ xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq);
trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
return 0;
allocate_blocks:
@@ -455,12 +464,8 @@ xfs_discard_folio(
struct folio *folio,
loff_t pos)
{
- struct inode *inode = folio->mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_inode *ip = XFS_I(folio->mapping->host);
struct xfs_mount *mp = ip->i_mount;
- size_t offset = offset_in_folio(folio, pos);
- xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, pos);
- xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, offset);
int error;
if (xfs_is_shutdown(mp))
@@ -470,8 +475,9 @@ xfs_discard_folio(
"page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
folio, ip->i_ino, pos);
- error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
- i_blocks_per_folio(inode, folio) - pageoff_fsb);
+ error = xfs_bmap_punch_delalloc_range(ip, pos,
+ round_up(pos, folio_size(folio)));
+
if (error && !xfs_is_shutdown(mp))
xfs_alert(mp, "page discard unable to remove delalloc mapping.");
}
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index 5077a7ad5646..2788a6f2edcd 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -86,8 +86,6 @@ xfs_attri_log_nameval_alloc(
*/
nv = xlog_kvmalloc(sizeof(struct xfs_attri_log_nameval) +
name_len + value_len);
- if (!nv)
- return nv;
nv->name.i_addr = nv + 1;
nv->name.i_len = name_len;
@@ -247,28 +245,6 @@ xfs_attri_init(
return attrip;
}
-/*
- * Copy an attr format buffer from the given buf, and into the destination attr
- * format structure.
- */
-STATIC int
-xfs_attri_copy_format(
- struct xfs_log_iovec *buf,
- struct xfs_attri_log_format *dst_attr_fmt)
-{
- struct xfs_attri_log_format *src_attr_fmt = buf->i_addr;
- size_t len;
-
- len = sizeof(struct xfs_attri_log_format);
- if (buf->i_len != len) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
- return -EFSCORRUPTED;
- }
-
- memcpy((char *)dst_attr_fmt, (char *)src_attr_fmt, len);
- return 0;
-}
-
static inline struct xfs_attrd_log_item *ATTRD_ITEM(struct xfs_log_item *lip)
{
return container_of(lip, struct xfs_attrd_log_item, attrd_item);
@@ -441,8 +417,6 @@ xfs_attr_create_intent(
attr->xattri_nameval = xfs_attri_log_nameval_alloc(args->name,
args->namelen, args->value, args->valuelen);
}
- if (!attr->xattri_nameval)
- return ERR_PTR(-ENOMEM);
attrip = xfs_attri_init(mp, attr->xattri_nameval);
xfs_trans_add_item(tp, &attrip->attri_item);
@@ -735,24 +709,50 @@ xlog_recover_attri_commit_pass2(
struct xfs_attri_log_nameval *nv;
const void *attr_value = NULL;
const void *attr_name;
- int error;
+ size_t len;
attri_formatp = item->ri_buf[0].i_addr;
attr_name = item->ri_buf[1].i_addr;
/* Validate xfs_attri_log_format before the large memory allocation */
+ len = sizeof(struct xfs_attri_log_format);
+ if (item->ri_buf[0].i_len != len) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
if (!xfs_attri_validate(mp, attri_formatp)) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ /* Validate the attr name */
+ if (item->ri_buf[1].i_len !=
+ xlog_calc_iovec_len(attri_formatp->alfi_name_len)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
return -EFSCORRUPTED;
}
if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[1].i_addr, item->ri_buf[1].i_len);
return -EFSCORRUPTED;
}
- if (attri_formatp->alfi_value_len)
+ /* Validate the attr value, if present */
+ if (attri_formatp->alfi_value_len != 0) {
+ if (item->ri_buf[2].i_len != xlog_calc_iovec_len(attri_formatp->alfi_value_len)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr,
+ item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
attr_value = item->ri_buf[2].i_addr;
+ }
/*
* Memory alloc failure will cause replay to abort. We attach the
@@ -762,13 +762,9 @@ xlog_recover_attri_commit_pass2(
nv = xfs_attri_log_nameval_alloc(attr_name,
attri_formatp->alfi_name_len, attr_value,
attri_formatp->alfi_value_len);
- if (!nv)
- return -ENOMEM;
attrip = xfs_attri_init(mp, nv);
- error = xfs_attri_copy_format(&item->ri_buf[0], &attrip->attri_format);
- if (error)
- goto out;
+ memcpy(&attrip->attri_format, attri_formatp, len);
/*
* The ATTRI has two references. One for the ATTRD and one for ATTRI to
@@ -780,10 +776,6 @@ xlog_recover_attri_commit_pass2(
xfs_attri_release(attrip);
xfs_attri_log_nameval_put(nv);
return 0;
-out:
- xfs_attri_item_free(attrip);
- xfs_attri_log_nameval_put(nv);
- return error;
}
/*
@@ -848,7 +840,8 @@ xlog_recover_attrd_commit_pass2(
attrd_formatp = item->ri_buf[0].i_addr;
if (item->ri_buf[0].i_len != sizeof(struct xfs_attrd_log_format)) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 51f66e982484..41323da523d1 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -608,28 +608,18 @@ static const struct xfs_item_ops xfs_bui_item_ops = {
.iop_relog = xfs_bui_item_relog,
};
-/*
- * Copy an BUI format buffer from the given buf, and into the destination
- * BUI format structure. The BUI/BUD items were designed not to need any
- * special alignment handling.
- */
-static int
+static inline void
xfs_bui_copy_format(
- struct xfs_log_iovec *buf,
- struct xfs_bui_log_format *dst_bui_fmt)
+ struct xfs_bui_log_format *dst,
+ const struct xfs_bui_log_format *src)
{
- struct xfs_bui_log_format *src_bui_fmt;
- uint len;
+ unsigned int i;
- src_bui_fmt = buf->i_addr;
- len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents);
+ memcpy(dst, src, offsetof(struct xfs_bui_log_format, bui_extents));
- if (buf->i_len == len) {
- memcpy(dst_bui_fmt, src_bui_fmt, len);
- return 0;
- }
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
- return -EFSCORRUPTED;
+ for (i = 0; i < src->bui_nextents; i++)
+ memcpy(&dst->bui_extents[i], &src->bui_extents[i],
+ sizeof(struct xfs_map_extent));
}
/*
@@ -646,23 +636,34 @@ xlog_recover_bui_commit_pass2(
struct xlog_recover_item *item,
xfs_lsn_t lsn)
{
- int error;
struct xfs_mount *mp = log->l_mp;
struct xfs_bui_log_item *buip;
struct xfs_bui_log_format *bui_formatp;
+ size_t len;
bui_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len < xfs_bui_log_format_sizeof(0)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
return -EFSCORRUPTED;
}
- buip = xfs_bui_init(mp);
- error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format);
- if (error) {
- xfs_bui_item_free(buip);
- return error;
+
+ len = xfs_bui_log_format_sizeof(bui_formatp->bui_nextents);
+ if (item->ri_buf[0].i_len != len) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
}
+
+ buip = xfs_bui_init(mp);
+ xfs_bui_copy_format(&buip->bui_format, bui_formatp);
atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents);
/*
* Insert the intent into the AIL directly and drop one reference so
@@ -696,7 +697,8 @@ xlog_recover_bud_commit_pass2(
bud_formatp = item->ri_buf[0].i_addr;
if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 04d0c2bff67c..867645b74d88 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -590,11 +590,13 @@ out_unlock_iolock:
int
xfs_bmap_punch_delalloc_range(
struct xfs_inode *ip,
- xfs_fileoff_t start_fsb,
- xfs_fileoff_t length)
+ xfs_off_t start_byte,
+ xfs_off_t end_byte)
{
+ struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = &ip->i_df;
- xfs_fileoff_t end_fsb = start_fsb + length;
+ xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte);
struct xfs_bmbt_irec got, del;
struct xfs_iext_cursor icur;
int error = 0;
@@ -607,7 +609,7 @@ xfs_bmap_punch_delalloc_range(
while (got.br_startoff + got.br_blockcount > start_fsb) {
del = got;
- xfs_trim_extent(&del, start_fsb, length);
+ xfs_trim_extent(&del, start_fsb, end_fsb - start_fsb);
/*
* A delete can push the cursor forward. Step back to the
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 24b37d211f1d..6888078f5c31 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -31,7 +31,7 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
#endif /* CONFIG_XFS_RT */
int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
- xfs_fileoff_t start_fsb, xfs_fileoff_t length);
+ xfs_off_t start_byte, xfs_off_t end_byte);
struct kgetbmap {
__s64 bmv_offset; /* file offset of segment in blocks */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index dde346450952..54c774af6e1c 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1945,6 +1945,7 @@ xfs_free_buftarg(
list_lru_destroy(&btp->bt_lru);
blkdev_issue_flush(btp->bt_bdev);
+ invalidate_bdev(btp->bt_bdev);
fs_put_dax(btp->bt_daxdev, btp->bt_mount);
kmem_free(btp);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 522d450a94b1..df7322ed73fa 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -1018,6 +1018,8 @@ xfs_buf_item_relse(
trace_xfs_buf_item_relse(bp, _RET_IP_);
ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
+ if (atomic_read(&bip->bli_refcount))
+ return;
bp->b_log_item = NULL;
xfs_buf_rele(bp);
xfs_buf_item_free(bip);
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index e295fc8062d8..9f3ceb461515 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -512,7 +512,7 @@ xfs_readdir(
{
struct xfs_da_args args = { NULL };
unsigned int lock_mode;
- int isblock;
+ bool isblock;
int error;
trace_xfs_readdir(dp);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 296faa41d81d..ae082808cfed 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -46,7 +46,7 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_REFCOUNT_FINISH_ONE,
XFS_RANDOM_BMAP_FINISH_ONE,
XFS_RANDOM_AG_RESV_CRITICAL,
- XFS_RANDOM_DROP_WRITES,
+ 0, /* XFS_RANDOM_DROP_WRITES has been removed */
XFS_RANDOM_LOG_BAD_CRC,
XFS_RANDOM_LOG_ITEM_PIN,
XFS_RANDOM_BUF_LRU_REF,
@@ -60,6 +60,8 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_LARP,
XFS_RANDOM_DA_LEAF_SPLIT,
XFS_RANDOM_ATTR_LEAF_TO_NODE,
+ XFS_RANDOM_WB_DELAY_MS,
+ XFS_RANDOM_WRITE_DELAY_MS,
};
struct xfs_errortag_attr {
@@ -162,7 +164,6 @@ XFS_ERRORTAG_ATTR_RW(refcount_continue_update, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDA
XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE);
XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE);
XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL);
-XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES);
XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC);
XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN);
XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF);
@@ -176,6 +177,8 @@ XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL);
XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP);
XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT);
XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE);
+XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS);
+XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -206,7 +209,6 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(refcount_finish_one),
XFS_ERRORTAG_ATTR_LIST(bmap_finish_one),
XFS_ERRORTAG_ATTR_LIST(ag_resv_critical),
- XFS_ERRORTAG_ATTR_LIST(drop_writes),
XFS_ERRORTAG_ATTR_LIST(log_bad_crc),
XFS_ERRORTAG_ATTR_LIST(log_item_pin),
XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
@@ -220,6 +222,8 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(larp),
XFS_ERRORTAG_ATTR_LIST(da_leaf_split),
XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node),
+ XFS_ERRORTAG_ATTR_LIST(wb_delay_ms),
+ XFS_ERRORTAG_ATTR_LIST(write_delay_ms),
NULL,
};
ATTRIBUTE_GROUPS(xfs_errortag);
@@ -234,13 +238,18 @@ int
xfs_errortag_init(
struct xfs_mount *mp)
{
+ int ret;
+
mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX,
KM_MAYFAIL);
if (!mp->m_errortag)
return -ENOMEM;
- return xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype,
- &mp->m_kobj, "errortag");
+ ret = xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype,
+ &mp->m_kobj, "errortag");
+ if (ret)
+ kmem_free(mp->m_errortag);
+ return ret;
}
void
@@ -251,6 +260,32 @@ xfs_errortag_del(
kmem_free(mp->m_errortag);
}
+static bool
+xfs_errortag_valid(
+ unsigned int error_tag)
+{
+ if (error_tag >= XFS_ERRTAG_MAX)
+ return false;
+
+ /* Error out removed injection types */
+ if (error_tag == XFS_ERRTAG_DROP_WRITES)
+ return false;
+ return true;
+}
+
+bool
+xfs_errortag_enabled(
+ struct xfs_mount *mp,
+ unsigned int tag)
+{
+ if (!mp->m_errortag)
+ return false;
+ if (!xfs_errortag_valid(tag))
+ return false;
+
+ return mp->m_errortag[tag] != 0;
+}
+
bool
xfs_errortag_test(
struct xfs_mount *mp,
@@ -272,9 +307,11 @@ xfs_errortag_test(
if (!mp->m_errortag)
return false;
- ASSERT(error_tag < XFS_ERRTAG_MAX);
+ if (!xfs_errortag_valid(error_tag))
+ return false;
+
randfactor = mp->m_errortag[error_tag];
- if (!randfactor || prandom_u32() % randfactor)
+ if (!randfactor || get_random_u32_below(randfactor))
return false;
xfs_warn_ratelimited(mp,
@@ -288,7 +325,7 @@ xfs_errortag_get(
struct xfs_mount *mp,
unsigned int error_tag)
{
- if (error_tag >= XFS_ERRTAG_MAX)
+ if (!xfs_errortag_valid(error_tag))
return -EINVAL;
return mp->m_errortag[error_tag];
@@ -300,7 +337,7 @@ xfs_errortag_set(
unsigned int error_tag,
unsigned int tag_value)
{
- if (error_tag >= XFS_ERRTAG_MAX)
+ if (!xfs_errortag_valid(error_tag))
return -EINVAL;
mp->m_errortag[error_tag] = tag_value;
@@ -314,7 +351,7 @@ xfs_errortag_add(
{
BUILD_BUG_ON(ARRAY_SIZE(xfs_errortag_random_default) != XFS_ERRTAG_MAX);
- if (error_tag >= XFS_ERRTAG_MAX)
+ if (!xfs_errortag_valid(error_tag))
return -EINVAL;
return xfs_errortag_set(mp, error_tag,
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 5191e9145e55..dbe6c37dc697 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -45,6 +45,18 @@ extern bool xfs_errortag_test(struct xfs_mount *mp, const char *expression,
const char *file, int line, unsigned int error_tag);
#define XFS_TEST_ERROR(expr, mp, tag) \
((expr) || xfs_errortag_test((mp), #expr, __FILE__, __LINE__, (tag)))
+bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag);
+#define XFS_ERRORTAG_DELAY(mp, tag) \
+ do { \
+ might_sleep(); \
+ if (!xfs_errortag_enabled((mp), (tag))) \
+ break; \
+ xfs_warn_ratelimited((mp), \
+"Injecting %ums delay at file %s, line %d, on filesystem \"%s\"", \
+ (mp)->m_errortag[(tag)], __FILE__, __LINE__, \
+ (mp)->m_super->s_id); \
+ mdelay((mp)->m_errortag[(tag)]); \
+ } while (0)
extern int xfs_errortag_get(struct xfs_mount *mp, unsigned int error_tag);
extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag,
@@ -55,6 +67,7 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp);
#define xfs_errortag_init(mp) (0)
#define xfs_errortag_del(mp)
#define XFS_TEST_ERROR(expr, mp, tag) (expr)
+#define XFS_ERRORTAG_DELAY(mp, tag) ((void)0)
#define xfs_errortag_set(mp, tag, val) (ENOSYS)
#define xfs_errortag_add(mp, tag) (ENOSYS)
#define xfs_errortag_clearall(mp) (ENOSYS)
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 27ccfcd82f04..d5130d1fcfae 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -66,27 +66,16 @@ xfs_efi_release(
xfs_efi_item_free(efip);
}
-/*
- * This returns the number of iovecs needed to log the given efi item.
- * We only need 1 iovec for an efi item. It just logs the efi_log_format
- * structure.
- */
-static inline int
-xfs_efi_item_sizeof(
- struct xfs_efi_log_item *efip)
-{
- return sizeof(struct xfs_efi_log_format) +
- (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
-}
-
STATIC void
xfs_efi_item_size(
struct xfs_log_item *lip,
int *nvecs,
int *nbytes)
{
+ struct xfs_efi_log_item *efip = EFI_ITEM(lip);
+
*nvecs += 1;
- *nbytes += xfs_efi_item_sizeof(EFI_ITEM(lip));
+ *nbytes += xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents);
}
/*
@@ -112,7 +101,7 @@ xfs_efi_item_format(
xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT,
&efip->efi_format,
- xfs_efi_item_sizeof(efip));
+ xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents));
}
@@ -155,13 +144,11 @@ xfs_efi_init(
{
struct xfs_efi_log_item *efip;
- uint size;
ASSERT(nextents > 0);
if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
- size = (uint)(sizeof(struct xfs_efi_log_item) +
- ((nextents - 1) * sizeof(xfs_extent_t)));
- efip = kmem_zalloc(size, 0);
+ efip = kzalloc(xfs_efi_log_item_sizeof(nextents),
+ GFP_KERNEL | __GFP_NOFAIL);
} else {
efip = kmem_cache_zalloc(xfs_efi_cache,
GFP_KERNEL | __GFP_NOFAIL);
@@ -188,15 +175,17 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
{
xfs_efi_log_format_t *src_efi_fmt = buf->i_addr;
uint i;
- uint len = sizeof(xfs_efi_log_format_t) +
- (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t);
- uint len32 = sizeof(xfs_efi_log_format_32_t) +
- (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t);
- uint len64 = sizeof(xfs_efi_log_format_64_t) +
- (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t);
+ uint len = xfs_efi_log_format_sizeof(src_efi_fmt->efi_nextents);
+ uint len32 = xfs_efi_log_format32_sizeof(src_efi_fmt->efi_nextents);
+ uint len64 = xfs_efi_log_format64_sizeof(src_efi_fmt->efi_nextents);
if (buf->i_len == len) {
- memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len);
+ memcpy(dst_efi_fmt, src_efi_fmt,
+ offsetof(struct xfs_efi_log_format, efi_extents));
+ for (i = 0; i < src_efi_fmt->efi_nextents; i++)
+ memcpy(&dst_efi_fmt->efi_extents[i],
+ &src_efi_fmt->efi_extents[i],
+ sizeof(struct xfs_extent));
return 0;
} else if (buf->i_len == len32) {
xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr;
@@ -227,7 +216,8 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
}
return 0;
}
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, NULL, buf->i_addr,
+ buf->i_len);
return -EFSCORRUPTED;
}
@@ -246,27 +236,16 @@ xfs_efd_item_free(struct xfs_efd_log_item *efdp)
kmem_cache_free(xfs_efd_cache, efdp);
}
-/*
- * This returns the number of iovecs needed to log the given efd item.
- * We only need 1 iovec for an efd item. It just logs the efd_log_format
- * structure.
- */
-static inline int
-xfs_efd_item_sizeof(
- struct xfs_efd_log_item *efdp)
-{
- return sizeof(xfs_efd_log_format_t) +
- (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
-}
-
STATIC void
xfs_efd_item_size(
struct xfs_log_item *lip,
int *nvecs,
int *nbytes)
{
+ struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
+
*nvecs += 1;
- *nbytes += xfs_efd_item_sizeof(EFD_ITEM(lip));
+ *nbytes += xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents);
}
/*
@@ -291,7 +270,7 @@ xfs_efd_item_format(
xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT,
&efdp->efd_format,
- xfs_efd_item_sizeof(efdp));
+ xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents));
}
/*
@@ -340,9 +319,8 @@ xfs_trans_get_efd(
ASSERT(nextents > 0);
if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
- efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) +
- (nextents - 1) * sizeof(struct xfs_extent),
- 0);
+ efdp = kzalloc(xfs_efd_log_item_sizeof(nextents),
+ GFP_KERNEL | __GFP_NOFAIL);
} else {
efdp = kmem_cache_zalloc(xfs_efd_cache,
GFP_KERNEL | __GFP_NOFAIL);
@@ -733,6 +711,12 @@ xlog_recover_efi_commit_pass2(
efi_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len < xfs_efi_log_format_sizeof(0)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format);
if (error) {
@@ -769,12 +753,24 @@ xlog_recover_efd_commit_pass2(
xfs_lsn_t lsn)
{
struct xfs_efd_log_format *efd_formatp;
+ int buflen = item->ri_buf[0].i_len;
efd_formatp = item->ri_buf[0].i_addr;
- ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
- ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
- (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
- ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
+
+ if (buflen < sizeof(struct xfs_efd_log_format)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ efd_formatp, buflen);
+ return -EFSCORRUPTED;
+ }
+
+ if (item->ri_buf[0].i_len != xfs_efd_log_format32_sizeof(
+ efd_formatp->efd_nextents) &&
+ item->ri_buf[0].i_len != xfs_efd_log_format64_sizeof(
+ efd_formatp->efd_nextents)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ efd_formatp, buflen);
+ return -EFSCORRUPTED;
+ }
xlog_recover_release_intent(log, XFS_LI_EFI, efd_formatp->efd_efi_id);
return 0;
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 186d0f2137f1..da6a5afa607c 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -52,6 +52,14 @@ struct xfs_efi_log_item {
xfs_efi_log_format_t efi_format;
};
+static inline size_t
+xfs_efi_log_item_sizeof(
+ unsigned int nr)
+{
+ return offsetof(struct xfs_efi_log_item, efi_format) +
+ xfs_efi_log_format_sizeof(nr);
+}
+
/*
* This is the "extent free done" log item. It is used to log
* the fact that some extents earlier mentioned in an efi item
@@ -64,6 +72,14 @@ struct xfs_efd_log_item {
xfs_efd_log_format_t efd_format;
};
+static inline size_t
+xfs_efd_log_item_sizeof(
+ unsigned int nr)
+{
+ return offsetof(struct xfs_efd_log_item, efd_format) +
+ xfs_efd_log_format_sizeof(nr);
+}
+
/*
* Max number of extents in fast allocation path.
*/
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c6c80265c0b2..595a5bcf46b9 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1261,7 +1261,7 @@ xfs_file_llseek(
}
#ifdef CONFIG_FS_DAX
-static int
+static inline vm_fault_t
xfs_dax_fault(
struct vm_fault *vmf,
enum page_entry_size pe_size,
@@ -1274,14 +1274,15 @@ xfs_dax_fault(
&xfs_read_iomap_ops);
}
#else
-static int
+static inline vm_fault_t
xfs_dax_fault(
struct vm_fault *vmf,
enum page_entry_size pe_size,
bool write_fault,
pfn_t *pfn)
{
- return 0;
+ ASSERT(0);
+ return VM_FAULT_SIGBUS;
}
#endif
@@ -1324,7 +1325,7 @@ __xfs_filemap_fault(
if (write_fault) {
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
ret = iomap_page_mkwrite(vmf,
- &xfs_buffered_write_iomap_ops);
+ &xfs_page_mkwrite_iomap_ops);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
} else {
ret = filemap_fault(vmf);
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index d8337274c74d..88a88506ffff 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -524,7 +524,7 @@ xfs_getfsmap_rtdev_rtbitmap_query(
struct xfs_mount *mp = tp->t_mountp;
int error;
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED);
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
/*
* Set up query parameters to return free rtextents covering the range
@@ -551,7 +551,7 @@ xfs_getfsmap_rtdev_rtbitmap_query(
if (error)
goto err;
err:
- xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED);
+ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
return error;
}
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 2bbe7916a998..f35e2cee5265 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -342,6 +342,9 @@ xfs_iget_recycle(
trace_xfs_iget_recycle(ip);
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+ return -EAGAIN;
+
/*
* We need to make it look like the inode is being reclaimed to prevent
* the actual reclaim workers from stomping over us while we recycle
@@ -355,6 +358,7 @@ xfs_iget_recycle(
ASSERT(!rwsem_is_locked(&inode->i_rwsem));
error = xfs_reinit_inode(mp, inode);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error) {
/*
* Re-initializing the inode failed, and we are in deep
@@ -518,6 +522,8 @@ xfs_iget_cache_hit(
if (ip->i_flags & XFS_IRECLAIMABLE) {
/* Drops i_flags_lock and RCU read lock. */
error = xfs_iget_recycle(pag, ip);
+ if (error == -EAGAIN)
+ goto out_skip;
if (error)
return error;
} else {
@@ -596,7 +602,7 @@ xfs_iget_cache_miss(
*/
if (xfs_has_v3inodes(mp) &&
(flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) {
- VFS_I(ip)->i_generation = prandom_u32();
+ VFS_I(ip)->i_generation = get_random_u32();
} else {
struct xfs_buf *bp;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 28493c8e9bb2..d354ea2b74f9 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -835,9 +835,8 @@ xfs_init_new_inode(
* ID or one of the supplementary group IDs, the S_ISGID bit is cleared
* (and only if the irix_sgid_inherit compatibility variable is set).
*/
- if (irix_sgid_inherit &&
- (inode->i_mode & S_ISGID) &&
- !in_group_p(i_gid_into_mnt(mnt_userns, inode)))
+ if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
+ !vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)))
inode->i_mode &= ~S_ISGID;
ip->i_disk_size = 0;
@@ -2480,7 +2479,7 @@ xfs_remove(
error = xfs_dir_replace(tp, ip, &xfs_name_dotdot,
tp->t_mountp->m_sb.sb_rootino, 0);
if (error)
- return error;
+ goto out_trans_cancel;
}
} else {
/*
@@ -2819,7 +2818,7 @@ retry:
* Lock all the participating inodes. Depending upon whether
* the target_name exists in the target directory, and
* whether the target directory is the same as the source
- * directory, we can lock from 2 to 4 inodes.
+ * directory, we can lock from 2 to 5 inodes.
*/
xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
@@ -3119,7 +3118,7 @@ xfs_iflush(
if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
mp, XFS_ERRTAG_IFLUSH_1)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
- "%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT,
+ "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT,
__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
goto flush_out;
}
@@ -3129,7 +3128,7 @@ xfs_iflush(
ip->i_df.if_format != XFS_DINODE_FMT_BTREE,
mp, XFS_ERRTAG_IFLUSH_3)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
- "%s: Bad regular inode %Lu, ptr "PTR_FMT,
+ "%s: Bad regular inode %llu, ptr "PTR_FMT,
__func__, ip->i_ino, ip);
goto flush_out;
}
@@ -3140,7 +3139,7 @@ xfs_iflush(
ip->i_df.if_format != XFS_DINODE_FMT_LOCAL,
mp, XFS_ERRTAG_IFLUSH_4)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
- "%s: Bad directory inode %Lu, ptr "PTR_FMT,
+ "%s: Bad directory inode %llu, ptr "PTR_FMT,
__func__, ip->i_ino, ip);
goto flush_out;
}
@@ -3158,7 +3157,7 @@ xfs_iflush(
if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize,
mp, XFS_ERRTAG_IFLUSH_6)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
- "%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT,
+ "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT,
__func__, ip->i_ino, ip->i_forkoff, ip);
goto flush_out;
}
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 6e19ece916bf..ca2941ab6cbc 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -550,7 +550,7 @@ xfs_inode_item_push(
if (!bp || (ip->i_flags & XFS_ISTALE)) {
/*
- * Inode item/buffer is being being aborted due to cluster
+ * Inode item/buffer is being aborted due to cluster
* buffer deletion. Trigger a log force to have that operation
* completed and items removed from the AIL before the next push
* attempt.
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index d28ffaebd067..0e5dba2343ea 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -321,7 +321,7 @@ xlog_recover_inode_commit_pass2(
*/
if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) {
xfs_alert(mp,
- "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld",
+ "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %lld",
__func__, dip, bp, in_f->ilf_ino);
error = -EFSCORRUPTED;
goto out_release;
@@ -329,7 +329,7 @@ xlog_recover_inode_commit_pass2(
ldip = item->ri_buf[1].i_addr;
if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) {
xfs_alert(mp,
- "%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld",
+ "%s: Bad inode log record, rec ptr "PTR_FMT", ino %lld",
__func__, item, in_f->ilf_ino);
error = -EFSCORRUPTED;
goto out_release;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 1f783e979629..13f1b2add390 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1138,10 +1138,6 @@ xfs_ioctl_setattr_xflags(
if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip))
ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
- /* Don't allow us to set DAX mode for a reflinked file for now. */
- if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip))
- return -EINVAL;
-
/* diflags2 only valid for v3 inodes. */
i_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
if (i_flags2 && !xfs_has_v3inodes(mp))
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 07da03976ec1..669c1bc5c3a7 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -48,13 +48,53 @@ xfs_alert_fsblock_zero(
return -EFSCORRUPTED;
}
+u64
+xfs_iomap_inode_sequence(
+ struct xfs_inode *ip,
+ u16 iomap_flags)
+{
+ u64 cookie = 0;
+
+ if (iomap_flags & IOMAP_F_XATTR)
+ return READ_ONCE(ip->i_af.if_seq);
+ if ((iomap_flags & IOMAP_F_SHARED) && ip->i_cowfp)
+ cookie = (u64)READ_ONCE(ip->i_cowfp->if_seq) << 32;
+ return cookie | READ_ONCE(ip->i_df.if_seq);
+}
+
+/*
+ * Check that the iomap passed to us is still valid for the given offset and
+ * length.
+ */
+static bool
+xfs_iomap_valid(
+ struct inode *inode,
+ const struct iomap *iomap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+
+ if (iomap->validity_cookie !=
+ xfs_iomap_inode_sequence(ip, iomap->flags)) {
+ trace_xfs_iomap_invalid(ip, iomap);
+ return false;
+ }
+
+ XFS_ERRORTAG_DELAY(ip->i_mount, XFS_ERRTAG_WRITE_DELAY_MS);
+ return true;
+}
+
+const struct iomap_page_ops xfs_iomap_page_ops = {
+ .iomap_valid = xfs_iomap_valid,
+};
+
int
xfs_bmbt_to_iomap(
struct xfs_inode *ip,
struct iomap *iomap,
struct xfs_bmbt_irec *imap,
unsigned int mapping_flags,
- u16 iomap_flags)
+ u16 iomap_flags,
+ u64 sequence_cookie)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_buftarg *target = xfs_inode_buftarg(ip);
@@ -91,6 +131,9 @@ xfs_bmbt_to_iomap(
if (xfs_ipincount(ip) &&
(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
iomap->flags |= IOMAP_F_DIRTY;
+
+ iomap->validity_cookie = sequence_cookie;
+ iomap->page_ops = &xfs_iomap_page_ops;
return 0;
}
@@ -195,7 +238,8 @@ xfs_iomap_write_direct(
xfs_fileoff_t offset_fsb,
xfs_fileoff_t count_fsb,
unsigned int flags,
- struct xfs_bmbt_irec *imap)
+ struct xfs_bmbt_irec *imap,
+ u64 *seq)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
@@ -285,6 +329,7 @@ xfs_iomap_write_direct(
error = xfs_alert_fsblock_zero(ip, imap);
out_unlock:
+ *seq = xfs_iomap_inode_sequence(ip, 0);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
@@ -743,6 +788,7 @@ xfs_direct_write_iomap_begin(
bool shared = false;
u16 iomap_flags = 0;
unsigned int lockmode = XFS_ILOCK_SHARED;
+ u64 seq;
ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
@@ -811,9 +857,10 @@ xfs_direct_write_iomap_begin(
goto out_unlock;
}
+ seq = xfs_iomap_inode_sequence(ip, iomap_flags);
xfs_iunlock(ip, lockmode);
trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq);
allocate_blocks:
error = -EAGAIN;
@@ -839,24 +886,26 @@ allocate_blocks:
xfs_iunlock(ip, lockmode);
error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
- flags, &imap);
+ flags, &imap, &seq);
if (error)
return error;
trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
- iomap_flags | IOMAP_F_NEW);
+ iomap_flags | IOMAP_F_NEW, seq);
out_found_cow:
- xfs_iunlock(ip, lockmode);
length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
if (imap.br_startblock != HOLESTARTBLOCK) {
- error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
+ seq = xfs_iomap_inode_sequence(ip, 0);
+ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
if (error)
- return error;
+ goto out_unlock;
}
- return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED);
+ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
+ xfs_iunlock(ip, lockmode);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);
out_unlock:
if (lockmode)
@@ -915,6 +964,7 @@ xfs_buffered_write_iomap_begin(
int allocfork = XFS_DATA_FORK;
int error = 0;
unsigned int lockmode = XFS_ILOCK_EXCL;
+ u64 seq;
if (xfs_is_shutdown(mp))
return -EIO;
@@ -926,6 +976,10 @@ xfs_buffered_write_iomap_begin(
ASSERT(!XFS_IS_REALTIME_INODE(ip));
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ return error;
+
error = xfs_ilock_for_iomap(ip, flags, &lockmode);
if (error)
return error;
@@ -1029,10 +1083,6 @@ xfs_buffered_write_iomap_begin(
allocfork = XFS_COW_FORK;
}
- error = xfs_qm_dqattach_locked(ip, false);
- if (error)
- goto out_unlock;
-
if (eof && offset + count > XFS_ISIZE(ip)) {
/*
* Determine the initial size of the preallocation.
@@ -1094,26 +1144,31 @@ retry:
* Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
* them out if the write happens to fail.
*/
+ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq);
found_imap:
+ seq = xfs_iomap_inode_sequence(ip, 0);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
found_cow:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ seq = xfs_iomap_inode_sequence(ip, 0);
if (imap.br_startoff <= offset_fsb) {
- error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
+ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
if (error)
- return error;
+ goto out_unlock;
+ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
- IOMAP_F_SHARED);
+ IOMAP_F_SHARED, seq);
}
xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
- return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq);
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -1121,6 +1176,16 @@ out_unlock:
}
static int
+xfs_buffered_write_delalloc_punch(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length)
+{
+ return xfs_bmap_punch_delalloc_range(XFS_I(inode), offset,
+ offset + length);
+}
+
+static int
xfs_buffered_write_iomap_end(
struct inode *inode,
loff_t offset,
@@ -1129,56 +1194,17 @@ xfs_buffered_write_iomap_end(
unsigned flags,
struct iomap *iomap)
{
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t start_fsb;
- xfs_fileoff_t end_fsb;
- int error = 0;
-
- if (iomap->type != IOMAP_DELALLOC)
- return 0;
-
- /*
- * Behave as if the write failed if drop writes is enabled. Set the NEW
- * flag to force delalloc cleanup.
- */
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DROP_WRITES)) {
- iomap->flags |= IOMAP_F_NEW;
- written = 0;
- }
- /*
- * start_fsb refers to the first unused block after a short write. If
- * nothing was written, round offset down to point at the first block in
- * the range.
- */
- if (unlikely(!written))
- start_fsb = XFS_B_TO_FSBT(mp, offset);
- else
- start_fsb = XFS_B_TO_FSB(mp, offset + written);
- end_fsb = XFS_B_TO_FSB(mp, offset + length);
+ struct xfs_mount *mp = XFS_M(inode->i_sb);
+ int error;
- /*
- * Trim delalloc blocks if they were allocated by this write and we
- * didn't manage to write the whole range.
- *
- * We don't need to care about racing delalloc as we hold i_mutex
- * across the reserve/allocate/unreserve calls. If there are delalloc
- * blocks in the range, they are ours.
- */
- if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) {
- truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
- XFS_FSB_TO_B(mp, end_fsb) - 1);
-
- error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
- end_fsb - start_fsb);
- if (error && !xfs_is_shutdown(mp)) {
- xfs_alert(mp, "%s: unable to clean up ino %lld",
- __func__, ip->i_ino);
- return error;
- }
+ error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset,
+ length, written, &xfs_buffered_write_delalloc_punch);
+ if (error && !xfs_is_shutdown(mp)) {
+ xfs_alert(mp, "%s: unable to clean up ino 0x%llx",
+ __func__, XFS_I(inode)->i_ino);
+ return error;
}
-
return 0;
}
@@ -1187,6 +1213,15 @@ const struct iomap_ops xfs_buffered_write_iomap_ops = {
.iomap_end = xfs_buffered_write_iomap_end,
};
+/*
+ * iomap_page_mkwrite() will never fail in a way that requires delalloc extents
+ * that it allocated to be revoked. Hence we do not need an .iomap_end method
+ * for this operation.
+ */
+const struct iomap_ops xfs_page_mkwrite_iomap_ops = {
+ .iomap_begin = xfs_buffered_write_iomap_begin,
+};
+
static int
xfs_read_iomap_begin(
struct inode *inode,
@@ -1204,6 +1239,7 @@ xfs_read_iomap_begin(
int nimaps = 1, error = 0;
bool shared = false;
unsigned int lockmode = XFS_ILOCK_SHARED;
+ u64 seq;
ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));
@@ -1215,15 +1251,16 @@ xfs_read_iomap_begin(
return error;
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
&nimaps, 0);
- if (!error && (flags & IOMAP_REPORT))
+ if (!error && ((flags & IOMAP_REPORT) || IS_DAX(inode)))
error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
+ seq = xfs_iomap_inode_sequence(ip, shared ? IOMAP_F_SHARED : 0);
xfs_iunlock(ip, lockmode);
if (error)
return error;
trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
- shared ? IOMAP_F_SHARED : 0);
+ shared ? IOMAP_F_SHARED : 0, seq);
}
const struct iomap_ops xfs_read_iomap_ops = {
@@ -1248,6 +1285,7 @@ xfs_seek_iomap_begin(
struct xfs_bmbt_irec imap, cmap;
int error = 0;
unsigned lockmode;
+ u64 seq;
if (xfs_is_shutdown(mp))
return -EIO;
@@ -1282,8 +1320,9 @@ xfs_seek_iomap_begin(
if (data_fsb < cow_fsb + cmap.br_blockcount)
end_fsb = min(end_fsb, data_fsb);
xfs_trim_extent(&cmap, offset_fsb, end_fsb);
+ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
- IOMAP_F_SHARED);
+ IOMAP_F_SHARED, seq);
/*
* This is a COW extent, so we must probe the page cache
* because there could be dirty page cache being backed
@@ -1304,8 +1343,9 @@ xfs_seek_iomap_begin(
imap.br_startblock = HOLESTARTBLOCK;
imap.br_state = XFS_EXT_NORM;
done:
+ seq = xfs_iomap_inode_sequence(ip, 0);
xfs_trim_extent(&imap, offset_fsb, end_fsb);
- error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
out_unlock:
xfs_iunlock(ip, lockmode);
return error;
@@ -1331,6 +1371,7 @@ xfs_xattr_iomap_begin(
struct xfs_bmbt_irec imap;
int nimaps = 1, error = 0;
unsigned lockmode;
+ int seq;
if (xfs_is_shutdown(mp))
return -EIO;
@@ -1347,12 +1388,14 @@ xfs_xattr_iomap_begin(
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
&nimaps, XFS_BMAPI_ATTRFORK);
out_unlock:
+
+ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_XATTR);
xfs_iunlock(ip, lockmode);
if (error)
return error;
ASSERT(nimaps);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_XATTR, seq);
}
const struct iomap_ops xfs_xattr_iomap_ops = {
@@ -1370,7 +1413,7 @@ xfs_zero_range(
if (IS_DAX(inode))
return dax_zero_range(inode, pos, len, did_zero,
- &xfs_direct_write_iomap_ops);
+ &xfs_dax_write_iomap_ops);
return iomap_zero_range(inode, pos, len, did_zero,
&xfs_buffered_write_iomap_ops);
}
@@ -1385,7 +1428,7 @@ xfs_truncate_page(
if (IS_DAX(inode))
return dax_truncate_page(inode, pos, did_zero,
- &xfs_direct_write_iomap_ops);
+ &xfs_dax_write_iomap_ops);
return iomap_truncate_page(inode, pos, did_zero,
&xfs_buffered_write_iomap_ops);
}
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index c782e8c0479c..4da13440bae9 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -13,14 +13,15 @@ struct xfs_bmbt_irec;
int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
xfs_fileoff_t count_fsb, unsigned int flags,
- struct xfs_bmbt_irec *imap);
+ struct xfs_bmbt_irec *imap, u64 *sequence);
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
xfs_fileoff_t end_fsb);
+u64 xfs_iomap_inode_sequence(struct xfs_inode *ip, u16 iomap_flags);
int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
struct xfs_bmbt_irec *imap, unsigned int mapping_flags,
- u16 iomap_flags);
+ u16 iomap_flags, u64 sequence_cookie);
int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
bool *did_zero);
@@ -47,6 +48,7 @@ xfs_aligned_fsb_count(
}
extern const struct iomap_ops xfs_buffered_write_iomap_ops;
+extern const struct iomap_ops xfs_page_mkwrite_iomap_ops;
extern const struct iomap_ops xfs_direct_write_iomap_ops;
extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 45518b8c613c..515318dfbc38 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -167,7 +167,7 @@ xfs_generic_create(
struct dentry *dentry,
umode_t mode,
dev_t rdev,
- bool tmpfile) /* unnamed file */
+ struct file *tmpfile) /* unnamed file */
{
struct inode *inode;
struct xfs_inode *ip = NULL;
@@ -234,7 +234,7 @@ xfs_generic_create(
* d_tmpfile can immediately set it back to zero.
*/
set_nlink(inode, 1);
- d_tmpfile(dentry, inode);
+ d_tmpfile(tmpfile, inode);
} else
d_instantiate(dentry, inode);
@@ -261,7 +261,7 @@ xfs_vn_mknod(
umode_t mode,
dev_t rdev)
{
- return xfs_generic_create(mnt_userns, dir, dentry, mode, rdev, false);
+ return xfs_generic_create(mnt_userns, dir, dentry, mode, rdev, NULL);
}
STATIC int
@@ -272,7 +272,7 @@ xfs_vn_create(
umode_t mode,
bool flags)
{
- return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, false);
+ return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, NULL);
}
STATIC int
@@ -283,7 +283,7 @@ xfs_vn_mkdir(
umode_t mode)
{
return xfs_generic_create(mnt_userns, dir, dentry, mode | S_IFDIR, 0,
- false);
+ NULL);
}
STATIC struct dentry *
@@ -558,6 +558,8 @@ xfs_vn_getattr(
struct inode *inode = d_inode(path->dentry);
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
+ vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
trace_xfs_getattr(ip);
@@ -568,8 +570,8 @@ xfs_vn_getattr(
stat->dev = inode->i_sb->s_dev;
stat->mode = inode->i_mode;
stat->nlink = inode->i_nlink;
- stat->uid = i_uid_into_mnt(mnt_userns, inode);
- stat->gid = i_gid_into_mnt(mnt_userns, inode);
+ stat->uid = vfsuid_into_kuid(vfsuid);
+ stat->gid = vfsgid_into_kgid(vfsgid);
stat->ino = ip->i_ino;
stat->atime = inode->i_atime;
stat->mtime = inode->i_mtime;
@@ -604,6 +606,16 @@ xfs_vn_getattr(
stat->blksize = BLKDEV_IOSIZE;
stat->rdev = inode->i_rdev;
break;
+ case S_IFREG:
+ if (request_mask & STATX_DIOALIGN) {
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ struct block_device *bdev = target->bt_bdev;
+
+ stat->result_mask |= STATX_DIOALIGN;
+ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+ stat->dio_offset_align = bdev_logical_block_size(bdev);
+ }
+ fallthrough;
default:
stat->blksize = xfs_stat_blksize(ip);
stat->rdev = 0;
@@ -639,6 +651,7 @@ xfs_vn_change_ok(
static int
xfs_setattr_nonsize(
struct user_namespace *mnt_userns,
+ struct dentry *dentry,
struct xfs_inode *ip,
struct iattr *iattr)
{
@@ -745,7 +758,7 @@ xfs_setattr_nonsize(
* Posix ACL code seems to care about this issue either.
*/
if (mask & ATTR_MODE) {
- error = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
+ error = posix_acl_chmod(mnt_userns, dentry, inode->i_mode);
if (error)
return error;
}
@@ -767,6 +780,7 @@ out_dqrele:
STATIC int
xfs_setattr_size(
struct user_namespace *mnt_userns,
+ struct dentry *dentry,
struct xfs_inode *ip,
struct iattr *iattr)
{
@@ -798,7 +812,7 @@ xfs_setattr_size(
* Use the regular setattr path to update the timestamps.
*/
iattr->ia_valid &= ~ATTR_SIZE;
- return xfs_setattr_nonsize(mnt_userns, ip, iattr);
+ return xfs_setattr_nonsize(mnt_userns, dentry, ip, iattr);
}
/*
@@ -975,7 +989,7 @@ xfs_vn_setattr_size(
error = xfs_vn_change_ok(mnt_userns, dentry, iattr);
if (error)
return error;
- return xfs_setattr_size(mnt_userns, ip, iattr);
+ return xfs_setattr_size(mnt_userns, dentry, ip, iattr);
}
STATIC int
@@ -1007,7 +1021,7 @@ xfs_vn_setattr(
error = xfs_vn_change_ok(mnt_userns, dentry, iattr);
if (!error)
- error = xfs_setattr_nonsize(mnt_userns, ip, iattr);
+ error = xfs_setattr_nonsize(mnt_userns, dentry, ip, iattr);
}
return error;
@@ -1080,14 +1094,16 @@ STATIC int
xfs_vn_tmpfile(
struct user_namespace *mnt_userns,
struct inode *dir,
- struct dentry *dentry,
+ struct file *file,
umode_t mode)
{
- return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, true);
+ int err = xfs_generic_create(mnt_userns, dir, file->f_path.dentry, mode, 0, file);
+
+ return finish_open_simple(file, err);
}
static const struct inode_operations xfs_inode_operations = {
- .get_acl = xfs_get_acl,
+ .get_inode_acl = xfs_get_acl,
.set_acl = xfs_set_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
@@ -1114,7 +1130,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
.rmdir = xfs_vn_unlink,
.mknod = xfs_vn_mknod,
.rename = xfs_vn_rename,
- .get_acl = xfs_get_acl,
+ .get_inode_acl = xfs_get_acl,
.set_acl = xfs_set_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
@@ -1141,7 +1157,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
.rmdir = xfs_vn_unlink,
.mknod = xfs_vn_mknod,
.rename = xfs_vn_rename,
- .get_acl = xfs_get_acl,
+ .get_inode_acl = xfs_get_acl,
.set_acl = xfs_set_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
@@ -1171,10 +1187,6 @@ xfs_inode_supports_dax(
if (!S_ISREG(VFS_I(ip)->i_mode))
return false;
- /* Only supported on non-reflinked files. */
- if (xfs_is_reflink_inode(ip))
- return false;
-
/* Block size must match page size */
if (mp->m_sb.sb_blocksize != PAGE_SIZE)
return false;
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index cb5fc68c9ea0..e570dcb5df8d 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -13,7 +13,6 @@ extern const struct file_operations xfs_dir_file_operations;
extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
-extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr);
int xfs_vn_setattr_size(struct user_namespace *mnt_userns,
struct dentry *dentry, struct iattr *vap);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 36312b00b164..a1c2bcf65d37 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -66,6 +66,8 @@ xfs_bulkstat_one_int(
struct xfs_bulkstat *buf = bc->buf;
xfs_extnum_t nextents;
int error = -EINVAL;
+ vfsuid_t vfsuid;
+ vfsgid_t vfsgid;
if (xfs_internal_inum(mp, ino))
goto out_advance;
@@ -81,14 +83,16 @@ xfs_bulkstat_one_int(
ASSERT(ip != NULL);
ASSERT(ip->i_imap.im_blkno != 0);
inode = VFS_I(ip);
+ vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
/* xfs_iget returns the following without needing
* further change.
*/
buf->bs_projectid = ip->i_projid;
buf->bs_ino = ino;
- buf->bs_uid = from_kuid(sb_userns, i_uid_into_mnt(mnt_userns, inode));
- buf->bs_gid = from_kgid(sb_userns, i_gid_into_mnt(mnt_userns, inode));
+ buf->bs_uid = from_kuid(sb_userns, vfsuid_into_kuid(vfsuid));
+ buf->bs_gid = from_kgid(sb_userns, vfsgid_into_kgid(vfsgid));
buf->bs_size = ip->i_disk_size;
buf->bs_nlink = inode->i_nlink;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 386b0307aed8..fc61cc024023 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -226,12 +226,12 @@ xlog_ticket_reservation(
if (head == &log->l_write_head) {
ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
return tic->t_unit_res;
- } else {
- if (tic->t_flags & XLOG_TIC_PERM_RESERV)
- return tic->t_unit_res * tic->t_cnt;
- else
- return tic->t_unit_res;
}
+
+ if (tic->t_flags & XLOG_TIC_PERM_RESERV)
+ return tic->t_unit_res * tic->t_cnt;
+
+ return tic->t_unit_res;
}
STATIC bool
@@ -644,12 +644,14 @@ xfs_log_mount(
int min_logfsbs;
if (!xfs_has_norecovery(mp)) {
- xfs_notice(mp, "Mounting V%d Filesystem",
- XFS_SB_VERSION_NUM(&mp->m_sb));
+ xfs_notice(mp, "Mounting V%d Filesystem %pU",
+ XFS_SB_VERSION_NUM(&mp->m_sb),
+ &mp->m_sb.sb_uuid);
} else {
xfs_notice(mp,
-"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.",
- XFS_SB_VERSION_NUM(&mp->m_sb));
+"Mounting V%d filesystem %pU in no-recovery mode. Filesystem will be inconsistent.",
+ XFS_SB_VERSION_NUM(&mp->m_sb),
+ &mp->m_sb.sb_uuid);
ASSERT(xfs_is_readonly(mp));
}
@@ -887,6 +889,23 @@ xlog_force_iclog(
}
/*
+ * Cycle all the iclogbuf locks to make sure all log IO completion
+ * is done before we tear down these buffers.
+ */
+static void
+xlog_wait_iclog_completion(struct xlog *log)
+{
+ int i;
+ struct xlog_in_core *iclog = log->l_iclog;
+
+ for (i = 0; i < log->l_iclog_bufs; i++) {
+ down(&iclog->ic_sema);
+ up(&iclog->ic_sema);
+ iclog = iclog->ic_next;
+ }
+}
+
+/*
* Wait for the iclog and all prior iclogs to be written disk as required by the
* log force state machine. Waiting on ic_force_wait ensures iclog completions
* have been ordered and callbacks run before we are woken here, hence
@@ -1111,6 +1130,14 @@ xfs_log_unmount(
{
xfs_log_clean(mp);
+ /*
+ * If shutdown has come from iclog IO context, the log
+ * cleaning will have been skipped and so we need to wait
+ * for the iclog to complete shutdown processing before we
+ * tear anything down.
+ */
+ xlog_wait_iclog_completion(mp->m_log);
+
xfs_buftarg_drain(mp->m_ddev_targp);
xfs_trans_ail_destroy(mp);
@@ -2114,17 +2141,6 @@ xlog_dealloc_log(
int i;
/*
- * Cycle all the iclogbuf locks to make sure all log IO completion
- * is done before we tear down these buffers.
- */
- iclog = log->l_iclog;
- for (i = 0; i < log->l_iclog_bufs; i++) {
- down(&iclog->ic_sema);
- up(&iclog->ic_sema);
- iclog = iclog->ic_next;
- }
-
- /*
* Destroy the CIL after waiting for iclog IO completion because an
* iclog EIO error will try to shut down the log, which accesses the
* CIL to wake up the waiters.
@@ -3544,7 +3560,7 @@ xlog_ticket_alloc(
tic->t_curr_res = unit_res;
tic->t_cnt = cnt;
tic->t_ocnt = cnt;
- tic->t_tid = prandom_u32();
+ tic->t_tid = get_random_u32();
if (permanent)
tic->t_flags |= XLOG_TIC_PERM_RESERV;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 17e923b9c5fa..322eb2ee6c55 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2552,6 +2552,8 @@ xlog_recover_process_intents(
for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
lip != NULL;
lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
+ const struct xfs_item_ops *ops;
+
if (!xlog_item_is_intent(lip))
break;
@@ -2567,13 +2569,17 @@ xlog_recover_process_intents(
* deferred ops, you /must/ attach them to the capture list in
* the recover routine or else those subsequent intents will be
* replayed in the wrong order!
+ *
+ * The recovery function can free the log item, so we must not
+ * access lip after it returns.
*/
spin_unlock(&ailp->ail_lock);
- error = lip->li_ops->iop_recover(lip, &capture_list);
+ ops = lip->li_ops;
+ error = ops->iop_recover(lip, &capture_list);
spin_lock(&ailp->ail_lock);
if (error) {
trace_xlog_intent_recovery_failed(log->l_mp, error,
- lip->li_ops->iop_recover);
+ ops->iop_recover);
break;
}
}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index f10c88cee116..fb87ffb48f7f 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -300,26 +300,28 @@ xfs_validate_new_dalign(
"alignment check failed: sunit/swidth vs. blocksize(%d)",
mp->m_sb.sb_blocksize);
return -EINVAL;
- } else {
- /*
- * Convert the stripe unit and width to FSBs.
- */
- mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
- if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) {
- xfs_warn(mp,
- "alignment check failed: sunit/swidth vs. agsize(%d)",
- mp->m_sb.sb_agblocks);
- return -EINVAL;
- } else if (mp->m_dalign) {
- mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
- } else {
- xfs_warn(mp,
- "alignment check failed: sunit(%d) less than bsize(%d)",
- mp->m_dalign, mp->m_sb.sb_blocksize);
- return -EINVAL;
- }
}
+ /*
+ * Convert the stripe unit and width to FSBs.
+ */
+ mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
+ if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) {
+ xfs_warn(mp,
+ "alignment check failed: sunit/swidth vs. agsize(%d)",
+ mp->m_sb.sb_agblocks);
+ return -EINVAL;
+ }
+
+ if (!mp->m_dalign) {
+ xfs_warn(mp,
+ "alignment check failed: sunit(%d) less than bsize(%d)",
+ mp->m_dalign, mp->m_sb.sb_blocksize);
+ return -EINVAL;
+ }
+
+ mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
+
if (!xfs_has_dalign(mp)) {
xfs_warn(mp,
"cannot change alignment: superblock does not support data alignment");
@@ -536,6 +538,20 @@ xfs_check_summary_counts(
return 0;
}
+static void
+xfs_unmount_check(
+ struct xfs_mount *mp)
+{
+ if (xfs_is_shutdown(mp))
+ return;
+
+ if (percpu_counter_sum(&mp->m_ifree) >
+ percpu_counter_sum(&mp->m_icount)) {
+ xfs_alert(mp, "ifree/icount mismatch at unmount");
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
+ }
+}
+
/*
* Flush and reclaim dirty inodes in preparation for unmount. Inodes and
* internal inode structures can be sitting in the CIL and AIL at this point,
@@ -1075,6 +1091,7 @@ xfs_unmountfs(
if (error)
xfs_warn(mp, "Unable to free reserved block pool. "
"Freespace may not be correct on next mount.");
+ xfs_unmount_check(mp);
xfs_log_unmount(mp);
xfs_da_unmount(mp);
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index 69d9c83ea4b2..c4078d0ec108 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -23,17 +23,18 @@
#include <linux/mm.h>
#include <linux/dax.h>
-struct failure_info {
+struct xfs_failure_info {
xfs_agblock_t startblock;
xfs_extlen_t blockcount;
int mf_flags;
+ bool want_shutdown;
};
static pgoff_t
xfs_failure_pgoff(
struct xfs_mount *mp,
const struct xfs_rmap_irec *rec,
- const struct failure_info *notify)
+ const struct xfs_failure_info *notify)
{
loff_t pos = XFS_FSB_TO_B(mp, rec->rm_offset);
@@ -47,7 +48,7 @@ static unsigned long
xfs_failure_pgcnt(
struct xfs_mount *mp,
const struct xfs_rmap_irec *rec,
- const struct failure_info *notify)
+ const struct xfs_failure_info *notify)
{
xfs_agblock_t end_rec;
xfs_agblock_t end_notify;
@@ -71,13 +72,13 @@ xfs_dax_failure_fn(
{
struct xfs_mount *mp = cur->bc_mp;
struct xfs_inode *ip;
- struct failure_info *notify = data;
+ struct xfs_failure_info *notify = data;
int error = 0;
if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
(rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
- return -EFSCORRUPTED;
+ notify->want_shutdown = true;
+ return 0;
}
/* Get files that incore, filter out others that are not in use. */
@@ -86,8 +87,10 @@ xfs_dax_failure_fn(
/* Continue the rmap query if the inode isn't incore */
if (error == -ENODATA)
return 0;
- if (error)
- return error;
+ if (error) {
+ notify->want_shutdown = true;
+ return 0;
+ }
error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
xfs_failure_pgoff(mp, rec, notify),
@@ -104,6 +107,7 @@ xfs_dax_notify_ddev_failure(
xfs_daddr_t bblen,
int mf_flags)
{
+ struct xfs_failure_info notify = { .mf_flags = mf_flags };
struct xfs_trans *tp = NULL;
struct xfs_btree_cur *cur = NULL;
struct xfs_buf *agf_bp = NULL;
@@ -120,7 +124,6 @@ xfs_dax_notify_ddev_failure(
for (; agno <= end_agno; agno++) {
struct xfs_rmap_irec ri_low = { };
struct xfs_rmap_irec ri_high;
- struct failure_info notify;
struct xfs_agf *agf;
xfs_agblock_t agend;
struct xfs_perag *pag;
@@ -161,6 +164,11 @@ xfs_dax_notify_ddev_failure(
}
xfs_trans_cancel(tp);
+ if (error || notify.want_shutdown) {
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
+ if (!error)
+ error = -EFSCORRUPTED;
+ }
return error;
}
@@ -175,13 +183,13 @@ xfs_dax_notify_failure(
u64 ddev_start;
u64 ddev_end;
- if (!(mp->m_sb.sb_flags & SB_BORN)) {
+ if (!(mp->m_super->s_flags & SB_BORN)) {
xfs_warn(mp, "filesystem is not ready for notify_failure()!");
return -EIO;
}
if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) {
- xfs_warn(mp,
+ xfs_debug(mp,
"notify_failure() not supported on realtime device!");
return -EOPNOTSUPP;
}
@@ -194,7 +202,7 @@ xfs_dax_notify_failure(
}
if (!xfs_has_rmapbt(mp)) {
- xfs_warn(mp, "notify_failure() needs rmapbt enabled!");
+ xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
return -EOPNOTSUPP;
}
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index 758702b9495f..9737b5a9f405 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -118,10 +118,10 @@ xfs_check_ondisk_structs(void)
/* log structures */
XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88);
XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24);
- XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 28);
- XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 32);
- XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_32, 28);
- XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_64, 32);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_32, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_64, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_extent_32, 12);
XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176);
@@ -134,6 +134,21 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_attri_log_format, 40);
XFS_CHECK_STRUCT_SIZE(struct xfs_attrd_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_bui_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_bud_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_cui_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_cud_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_rui_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_rud_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_map_extent, 32);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_phys_extent, 16);
+
+ XFS_CHECK_OFFSET(struct xfs_bui_log_format, bui_extents, 16);
+ XFS_CHECK_OFFSET(struct xfs_cui_log_format, cui_extents, 16);
+ XFS_CHECK_OFFSET(struct xfs_rui_log_format, rui_extents, 16);
+ XFS_CHECK_OFFSET(struct xfs_efi_log_format, efi_extents, 16);
+ XFS_CHECK_OFFSET(struct xfs_efi_log_format_32, efi_extents, 16);
+ XFS_CHECK_OFFSET(struct xfs_efi_log_format_64, efi_extents, 16);
/*
* The v5 superblock format extended several v4 header structures with
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 37a24f0f7cd4..38d23f0e703a 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -125,6 +125,7 @@ xfs_fs_map_blocks(
int nimaps = 1;
uint lock_flags;
int error = 0;
+ u64 seq;
if (xfs_is_shutdown(mp))
return -EIO;
@@ -176,6 +177,7 @@ xfs_fs_map_blocks(
lock_flags = xfs_ilock_data_map_shared(ip);
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
&imap, &nimaps, bmapi_flags);
+ seq = xfs_iomap_inode_sequence(ip, 0);
ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK);
@@ -189,7 +191,7 @@ xfs_fs_map_blocks(
xfs_iunlock(ip, lock_flags);
error = xfs_iomap_write_direct(ip, offset_fsb,
- end_fsb - offset_fsb, 0, &imap);
+ end_fsb - offset_fsb, 0, &imap, &seq);
if (error)
goto out_unlock;
@@ -209,7 +211,7 @@ xfs_fs_map_blocks(
}
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq);
*device_generation = mp->m_generation;
return error;
out_unlock:
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 18bb4ec4d7c9..ff53d40a2dae 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -423,6 +423,14 @@ xfs_qm_dquot_isolate(
goto out_miss_busy;
/*
+ * If something else is freeing this dquot and hasn't yet removed it
+ * from the LRU, leave it for the freeing task to complete the freeing
+ * process rather than risk it being free from under us here.
+ */
+ if (dqp->q_flags & XFS_DQFLAG_FREEING)
+ goto out_miss_unlock;
+
+ /*
* This dquot has acquired a reference in the meantime remove it from
* the freelist and try again.
*/
@@ -441,10 +449,8 @@ xfs_qm_dquot_isolate(
* skip it so there is time for the IO to complete before we try to
* reclaim it again on the next LRU pass.
*/
- if (!xfs_dqflock_nowait(dqp)) {
- xfs_dqunlock(dqp);
- goto out_miss_busy;
- }
+ if (!xfs_dqflock_nowait(dqp))
+ goto out_miss_unlock;
if (XFS_DQ_IS_DIRTY(dqp)) {
struct xfs_buf *bp = NULL;
@@ -478,6 +484,8 @@ xfs_qm_dquot_isolate(
XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims);
return LRU_REMOVED;
+out_miss_unlock:
+ xfs_dqunlock(dqp);
out_miss_busy:
trace_xfs_dqreclaim_busy(dqp);
XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 7e97bf19793d..858e3e9eb4a8 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -523,7 +523,9 @@ xfs_cui_item_recover(
type = refc_type;
break;
default:
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &cuip->cui_format,
+ sizeof(cuip->cui_format));
error = -EFSCORRUPTED;
goto abort_error;
}
@@ -536,7 +538,8 @@ xfs_cui_item_recover(
&new_fsb, &new_len, &rcur);
if (error == -EFSCORRUPTED)
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- refc, sizeof(*refc));
+ &cuip->cui_format,
+ sizeof(cuip->cui_format));
if (error)
goto abort_error;
@@ -622,28 +625,18 @@ static const struct xfs_item_ops xfs_cui_item_ops = {
.iop_relog = xfs_cui_item_relog,
};
-/*
- * Copy an CUI format buffer from the given buf, and into the destination
- * CUI format structure. The CUI/CUD items were designed not to need any
- * special alignment handling.
- */
-static int
+static inline void
xfs_cui_copy_format(
- struct xfs_log_iovec *buf,
- struct xfs_cui_log_format *dst_cui_fmt)
+ struct xfs_cui_log_format *dst,
+ const struct xfs_cui_log_format *src)
{
- struct xfs_cui_log_format *src_cui_fmt;
- uint len;
+ unsigned int i;
- src_cui_fmt = buf->i_addr;
- len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents);
+ memcpy(dst, src, offsetof(struct xfs_cui_log_format, cui_extents));
- if (buf->i_len == len) {
- memcpy(dst_cui_fmt, src_cui_fmt, len);
- return 0;
- }
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
- return -EFSCORRUPTED;
+ for (i = 0; i < src->cui_nextents; i++)
+ memcpy(&dst->cui_extents[i], &src->cui_extents[i],
+ sizeof(struct xfs_phys_extent));
}
/*
@@ -660,19 +653,28 @@ xlog_recover_cui_commit_pass2(
struct xlog_recover_item *item,
xfs_lsn_t lsn)
{
- int error;
struct xfs_mount *mp = log->l_mp;
struct xfs_cui_log_item *cuip;
struct xfs_cui_log_format *cui_formatp;
+ size_t len;
cui_formatp = item->ri_buf[0].i_addr;
- cuip = xfs_cui_init(mp, cui_formatp->cui_nextents);
- error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format);
- if (error) {
- xfs_cui_item_free(cuip);
- return error;
+ if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
}
+
+ len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents);
+ if (item->ri_buf[0].i_len != len) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ cuip = xfs_cui_init(mp, cui_formatp->cui_nextents);
+ xfs_cui_copy_format(&cuip->cui_format, cui_formatp);
atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
/*
* Insert the intent into the AIL directly and drop one reference so
@@ -706,7 +708,8 @@ xlog_recover_cud_commit_pass2(
cud_formatp = item->ri_buf[0].i_addr;
if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 251f20ddd368..fe46bce8cae6 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -200,7 +200,9 @@ xfs_reflink_trim_around_shared(
if (fbno == NULLAGBLOCK) {
/* No shared blocks at all. */
return 0;
- } else if (fbno == agbno) {
+ }
+
+ if (fbno == agbno) {
/*
* The start of this extent is shared. Truncate the
* mapping at the end of the shared region so that a
@@ -210,16 +212,16 @@ xfs_reflink_trim_around_shared(
irec->br_blockcount = flen;
*shared = true;
return 0;
- } else {
- /*
- * There's a shared extent midway through this extent.
- * Truncate the mapping at the start of the shared
- * extent so that a subsequent iteration starts at the
- * start of the shared region.
- */
- irec->br_blockcount = fbno - agbno;
- return 0;
}
+
+ /*
+ * There's a shared extent midway through this extent.
+ * Truncate the mapping at the start of the shared
+ * extent so that a subsequent iteration starts at the
+ * start of the shared region.
+ */
+ irec->br_blockcount = fbno - agbno;
+ return 0;
}
int
@@ -1691,8 +1693,12 @@ xfs_reflink_unshare(
inode_dio_wait(inode);
- error = iomap_file_unshare(inode, offset, len,
- &xfs_buffered_write_iomap_ops);
+ if (IS_DAX(inode))
+ error = dax_file_unshare(inode, offset, len,
+ &xfs_dax_write_iomap_ops);
+ else
+ error = iomap_file_unshare(inode, offset, len,
+ &xfs_buffered_write_iomap_ops);
if (error)
goto out;
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index fef92e02f3bb..534504ede1a3 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -155,31 +155,6 @@ xfs_rui_init(
return ruip;
}
-/*
- * Copy an RUI format buffer from the given buf, and into the destination
- * RUI format structure. The RUI/RUD items were designed not to need any
- * special alignment handling.
- */
-STATIC int
-xfs_rui_copy_format(
- struct xfs_log_iovec *buf,
- struct xfs_rui_log_format *dst_rui_fmt)
-{
- struct xfs_rui_log_format *src_rui_fmt;
- uint len;
-
- src_rui_fmt = buf->i_addr;
- len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents);
-
- if (buf->i_len != len) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
- return -EFSCORRUPTED;
- }
-
- memcpy(dst_rui_fmt, src_rui_fmt, len);
- return 0;
-}
-
static inline struct xfs_rud_log_item *RUD_ITEM(struct xfs_log_item *lip)
{
return container_of(lip, struct xfs_rud_log_item, rud_item);
@@ -582,7 +557,9 @@ xfs_rui_item_recover(
type = XFS_RMAP_FREE;
break;
default:
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &ruip->rui_format,
+ sizeof(ruip->rui_format));
error = -EFSCORRUPTED;
goto abort_error;
}
@@ -652,6 +629,20 @@ static const struct xfs_item_ops xfs_rui_item_ops = {
.iop_relog = xfs_rui_item_relog,
};
+static inline void
+xfs_rui_copy_format(
+ struct xfs_rui_log_format *dst,
+ const struct xfs_rui_log_format *src)
+{
+ unsigned int i;
+
+ memcpy(dst, src, offsetof(struct xfs_rui_log_format, rui_extents));
+
+ for (i = 0; i < src->rui_nextents; i++)
+ memcpy(&dst->rui_extents[i], &src->rui_extents[i],
+ sizeof(struct xfs_map_extent));
+}
+
/*
* This routine is called to create an in-core extent rmap update
* item from the rui format structure which was logged on disk.
@@ -666,19 +657,28 @@ xlog_recover_rui_commit_pass2(
struct xlog_recover_item *item,
xfs_lsn_t lsn)
{
- int error;
struct xfs_mount *mp = log->l_mp;
struct xfs_rui_log_item *ruip;
struct xfs_rui_log_format *rui_formatp;
+ size_t len;
rui_formatp = item->ri_buf[0].i_addr;
- ruip = xfs_rui_init(mp, rui_formatp->rui_nextents);
- error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format);
- if (error) {
- xfs_rui_item_free(ruip);
- return error;
+ if (item->ri_buf[0].i_len < xfs_rui_log_format_sizeof(0)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents);
+ if (item->ri_buf[0].i_len != len) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
}
+
+ ruip = xfs_rui_init(mp, rui_formatp->rui_nextents);
+ xfs_rui_copy_format(&ruip->rui_format, rui_formatp);
atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents);
/*
* Insert the intent into the AIL directly and drop one reference so
@@ -711,7 +711,11 @@ xlog_recover_rud_commit_pass2(
struct xfs_rud_log_format *rud_formatp;
rud_formatp = item->ri_buf[0].i_addr;
- ASSERT(item->ri_buf[0].i_len == sizeof(struct xfs_rud_log_format));
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_rud_log_format)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ rud_formatp, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
xlog_recover_release_intent(log, XFS_LI_RUI, rud_formatp->rud_rui_id);
return 0;
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 292d5e54a92c..16534e9873f6 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1311,10 +1311,10 @@ xfs_rtalloc_reinit_frextents(
uint64_t val = 0;
int error;
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent,
&val);
- xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
if (error)
return error;
@@ -1326,6 +1326,41 @@ xfs_rtalloc_reinit_frextents(
}
/*
+ * Read in the bmbt of an rt metadata inode so that we never have to load them
+ * at runtime. This enables the use of shared ILOCKs for rtbitmap scans. Use
+ * an empty transaction to avoid deadlocking on loops in the bmbt.
+ */
+static inline int
+xfs_rtmount_iread_extents(
+ struct xfs_inode *ip,
+ unsigned int lock_class)
+{
+ struct xfs_trans *tp;
+ int error;
+
+ error = xfs_trans_alloc_empty(ip->i_mount, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL | lock_class);
+
+ error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+
+ if (xfs_inode_has_attr_fork(ip)) {
+ error = xfs_iread_extents(tp, ip, XFS_ATTR_FORK);
+ if (error)
+ goto out_unlock;
+ }
+
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL | lock_class);
+ xfs_trans_cancel(tp);
+ return error;
+}
+
+/*
* Get the bitmap and summary inodes and the summary cache into the mount
* structure at mount time.
*/
@@ -1342,14 +1377,27 @@ xfs_rtmount_inodes(
return error;
ASSERT(mp->m_rbmip != NULL);
+ error = xfs_rtmount_iread_extents(mp->m_rbmip, XFS_ILOCK_RTBITMAP);
+ if (error)
+ goto out_rele_bitmap;
+
error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip);
- if (error) {
- xfs_irele(mp->m_rbmip);
- return error;
- }
+ if (error)
+ goto out_rele_bitmap;
ASSERT(mp->m_rsumip != NULL);
+
+ error = xfs_rtmount_iread_extents(mp->m_rsumip, XFS_ILOCK_RTSUM);
+ if (error)
+ goto out_rele_summary;
+
xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks);
return 0;
+
+out_rele_summary:
+ xfs_irele(mp->m_rsumip);
+out_rele_bitmap:
+ xfs_irele(mp->m_rbmip);
+ return error;
}
void
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 20e0534a772c..90a77cd3ebad 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -74,7 +74,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
defer_relog += per_cpu_ptr(stats, i)->s.defer_relog;
}
- len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
+ len += scnprintf(buf + len, PATH_MAX-len, "xpc %llu %llu %llu\n",
xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
len += scnprintf(buf + len, PATH_MAX-len, "defer_relog %llu\n",
defer_relog);
@@ -125,7 +125,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v)
{
int j;
- seq_printf(m, "qm");
+ seq_puts(m, "qm");
for (j = XFSSTAT_START_XQMSTAT; j < XFSSTAT_END_XQMSTAT; j++)
seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j));
seq_putc(m, '\n');
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 9ac59814bbb6..0c4b73e9b29d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -653,7 +653,7 @@ xfs_fs_destroy_inode(
static void
xfs_fs_dirty_inode(
struct inode *inode,
- int flag)
+ int flags)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -661,7 +661,13 @@ xfs_fs_dirty_inode(
if (!(inode->i_sb->s_flags & SB_LAZYTIME))
return;
- if (flag != I_DIRTY_SYNC || !(inode->i_state & I_DIRTY_TIME))
+
+ /*
+ * Only do the timestamp update if the inode is dirty (I_DIRTY_SYNC)
+ * and has dirty timestamp (I_DIRTY_TIME). I_DIRTY_TIME can be passed
+ * in flags possibly together with I_DIRTY_SYNC.
+ */
+ if ((flags & ~I_DIRTY_TIME) != I_DIRTY_SYNC || !(flags & I_DIRTY_TIME))
return;
if (xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp))
@@ -1104,7 +1110,7 @@ xfs_fs_put_super(
if (!sb->s_fs_info)
return;
- xfs_notice(mp, "Unmounting Filesystem");
+ xfs_notice(mp, "Unmounting Filesystem %pU", &mp->m_sb.sb_uuid);
xfs_filestream_unmount(mp);
xfs_unmountfs(mp);
@@ -2022,18 +2028,14 @@ xfs_init_caches(void)
goto out_destroy_trans_cache;
xfs_efd_cache = kmem_cache_create("xfs_efd_item",
- (sizeof(struct xfs_efd_log_item) +
- (XFS_EFD_MAX_FAST_EXTENTS - 1) *
- sizeof(struct xfs_extent)),
- 0, 0, NULL);
+ xfs_efd_log_item_sizeof(XFS_EFD_MAX_FAST_EXTENTS),
+ 0, 0, NULL);
if (!xfs_efd_cache)
goto out_destroy_buf_item_cache;
xfs_efi_cache = kmem_cache_create("xfs_efi_item",
- (sizeof(struct xfs_efi_log_item) +
- (XFS_EFI_MAX_FAST_EXTENTS - 1) *
- sizeof(struct xfs_extent)),
- 0, 0, NULL);
+ xfs_efi_log_item_sizeof(XFS_EFI_MAX_FAST_EXTENTS),
+ 0, 0, NULL);
if (!xfs_efi_cache)
goto out_destroy_efd_cache;
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
index 43585850f154..513095e353a5 100644
--- a/fs/xfs/xfs_sysfs.h
+++ b/fs/xfs/xfs_sysfs.h
@@ -33,10 +33,15 @@ xfs_sysfs_init(
const char *name)
{
struct kobject *parent;
+ int err;
parent = parent_kobj ? &parent_kobj->kobject : NULL;
init_completion(&kobj->complete);
- return kobject_init_and_add(&kobj->kobject, ktype, parent, "%s", name);
+ err = kobject_init_and_add(&kobj->kobject, ktype, parent, "%s", name);
+ if (err)
+ kobject_put(&kobj->kobject);
+
+ return err;
}
static inline void
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index d269ef57ff01..8a5dc1538aa8 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -34,6 +34,8 @@
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_error.h"
+#include <linux/iomap.h>
+#include "xfs_iomap.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index f9057af6e0c8..421d1e504ac4 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -799,6 +799,9 @@ TRACE_DEFINE_ENUM(PE_SIZE_PTE);
TRACE_DEFINE_ENUM(PE_SIZE_PMD);
TRACE_DEFINE_ENUM(PE_SIZE_PUD);
+TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED);
+TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW);
+
TRACE_EVENT(xfs_filemap_fault,
TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size,
bool write_fault),
@@ -1170,7 +1173,7 @@ DECLARE_EVENT_CLASS(xfs_dqtrx_class,
__entry->ino_res_used = qtrx->qt_ino_res_used;
__entry->icount_delta = qtrx->qt_icount_delta;
),
- TP_printk("dev %d:%d dquot id 0x%x type %s flags %s"
+ TP_printk("dev %d:%d dquot id 0x%x type %s flags %s "
"blk_res %llu bcount_delta %lld delbcnt_delta %lld "
"rtblk_res %llu rtblk_res_used %llu rtbcount_delta %lld delrtb_delta %lld "
"ino_res %llu ino_res_used %llu icount_delta %lld",
@@ -1602,7 +1605,7 @@ TRACE_EVENT(xfs_bunmap,
__entry->caller_ip = caller_ip;
__entry->flags = flags;
),
- TP_printk("dev %d:%d ino 0x%llx disize 0x%llx fileoff 0x%llx fsbcount 0x%llx"
+ TP_printk("dev %d:%d ino 0x%llx disize 0x%llx fileoff 0x%llx fsbcount 0x%llx "
"flags %s caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
@@ -2925,6 +2928,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
+ __field(enum xfs_refc_domain, domain)
__field(xfs_agblock_t, startblock)
__field(xfs_extlen_t, blockcount)
__field(xfs_nlink_t, refcount)
@@ -2932,13 +2936,15 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
+ __entry->domain = irec->rc_domain;
__entry->startblock = irec->rc_startblock;
__entry->blockcount = irec->rc_blockcount;
__entry->refcount = irec->rc_refcount;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u",
+ TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
+ __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS),
__entry->startblock,
__entry->blockcount,
__entry->refcount)
@@ -2958,6 +2964,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
+ __field(enum xfs_refc_domain, domain)
__field(xfs_agblock_t, startblock)
__field(xfs_extlen_t, blockcount)
__field(xfs_nlink_t, refcount)
@@ -2966,14 +2973,16 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
+ __entry->domain = irec->rc_domain;
__entry->startblock = irec->rc_startblock;
__entry->blockcount = irec->rc_blockcount;
__entry->refcount = irec->rc_refcount;
__entry->agbno = agbno;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x",
+ TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
+ __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS),
__entry->startblock,
__entry->blockcount,
__entry->refcount,
@@ -2994,9 +3003,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
+ __field(enum xfs_refc_domain, i1_domain)
__field(xfs_agblock_t, i1_startblock)
__field(xfs_extlen_t, i1_blockcount)
__field(xfs_nlink_t, i1_refcount)
+ __field(enum xfs_refc_domain, i2_domain)
__field(xfs_agblock_t, i2_startblock)
__field(xfs_extlen_t, i2_blockcount)
__field(xfs_nlink_t, i2_refcount)
@@ -3004,20 +3015,24 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
+ __entry->i1_domain = i1->rc_domain;
__entry->i1_startblock = i1->rc_startblock;
__entry->i1_blockcount = i1->rc_blockcount;
__entry->i1_refcount = i1->rc_refcount;
+ __entry->i2_domain = i2->rc_domain;
__entry->i2_startblock = i2->rc_startblock;
__entry->i2_blockcount = i2->rc_blockcount;
__entry->i2_refcount = i2->rc_refcount;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- "
- "agbno 0x%x fsbcount 0x%x refcount %u",
+ TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
+ "dom %s agbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
+ __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i1_startblock,
__entry->i1_blockcount,
__entry->i1_refcount,
+ __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i2_startblock,
__entry->i2_blockcount,
__entry->i2_refcount)
@@ -3038,9 +3053,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
+ __field(enum xfs_refc_domain, i1_domain)
__field(xfs_agblock_t, i1_startblock)
__field(xfs_extlen_t, i1_blockcount)
__field(xfs_nlink_t, i1_refcount)
+ __field(enum xfs_refc_domain, i2_domain)
__field(xfs_agblock_t, i2_startblock)
__field(xfs_extlen_t, i2_blockcount)
__field(xfs_nlink_t, i2_refcount)
@@ -3049,21 +3066,25 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
+ __entry->i1_domain = i1->rc_domain;
__entry->i1_startblock = i1->rc_startblock;
__entry->i1_blockcount = i1->rc_blockcount;
__entry->i1_refcount = i1->rc_refcount;
+ __entry->i2_domain = i2->rc_domain;
__entry->i2_startblock = i2->rc_startblock;
__entry->i2_blockcount = i2->rc_blockcount;
__entry->i2_refcount = i2->rc_refcount;
__entry->agbno = agbno;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- "
- "agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x",
+ TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
+ "dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
+ __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i1_startblock,
__entry->i1_blockcount,
__entry->i1_refcount,
+ __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i2_startblock,
__entry->i2_blockcount,
__entry->i2_refcount,
@@ -3086,12 +3107,15 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
+ __field(enum xfs_refc_domain, i1_domain)
__field(xfs_agblock_t, i1_startblock)
__field(xfs_extlen_t, i1_blockcount)
__field(xfs_nlink_t, i1_refcount)
+ __field(enum xfs_refc_domain, i2_domain)
__field(xfs_agblock_t, i2_startblock)
__field(xfs_extlen_t, i2_blockcount)
__field(xfs_nlink_t, i2_refcount)
+ __field(enum xfs_refc_domain, i3_domain)
__field(xfs_agblock_t, i3_startblock)
__field(xfs_extlen_t, i3_blockcount)
__field(xfs_nlink_t, i3_refcount)
@@ -3099,27 +3123,33 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
+ __entry->i1_domain = i1->rc_domain;
__entry->i1_startblock = i1->rc_startblock;
__entry->i1_blockcount = i1->rc_blockcount;
__entry->i1_refcount = i1->rc_refcount;
+ __entry->i2_domain = i2->rc_domain;
__entry->i2_startblock = i2->rc_startblock;
__entry->i2_blockcount = i2->rc_blockcount;
__entry->i2_refcount = i2->rc_refcount;
+ __entry->i3_domain = i3->rc_domain;
__entry->i3_startblock = i3->rc_startblock;
__entry->i3_blockcount = i3->rc_blockcount;
__entry->i3_refcount = i3->rc_refcount;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- "
- "agbno 0x%x fsbcount 0x%x refcount %u -- "
- "agbno 0x%x fsbcount 0x%x refcount %u",
+ TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
+ "dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
+ "dom %s agbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
+ __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i1_startblock,
__entry->i1_blockcount,
__entry->i1_refcount,
+ __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i2_startblock,
__entry->i2_blockcount,
__entry->i2_refcount,
+ __print_symbolic(__entry->i3_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i3_startblock,
__entry->i3_blockcount,
__entry->i3_refcount)
@@ -3322,6 +3352,92 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \
TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), \
TP_ARGS(ip, irec))
+/* inode iomap invalidation events */
+DECLARE_EVENT_CLASS(xfs_wb_invalid_class,
+ TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap, unsigned int wpcseq, int whichfork),
+ TP_ARGS(ip, iomap, wpcseq, whichfork),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(u64, addr)
+ __field(loff_t, pos)
+ __field(u64, len)
+ __field(u16, type)
+ __field(u16, flags)
+ __field(u32, wpcseq)
+ __field(u32, forkseq)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->addr = iomap->addr;
+ __entry->pos = iomap->offset;
+ __entry->len = iomap->length;
+ __entry->type = iomap->type;
+ __entry->flags = iomap->flags;
+ __entry->wpcseq = wpcseq;
+ __entry->forkseq = READ_ONCE(xfs_ifork_ptr(ip, whichfork)->if_seq);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx pos 0x%llx addr 0x%llx bytecount 0x%llx type 0x%x flags 0x%x wpcseq 0x%x forkseq 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->pos,
+ __entry->addr,
+ __entry->len,
+ __entry->type,
+ __entry->flags,
+ __entry->wpcseq,
+ __entry->forkseq)
+);
+#define DEFINE_WB_INVALID_EVENT(name) \
+DEFINE_EVENT(xfs_wb_invalid_class, name, \
+ TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap, unsigned int wpcseq, int whichfork), \
+ TP_ARGS(ip, iomap, wpcseq, whichfork))
+DEFINE_WB_INVALID_EVENT(xfs_wb_cow_iomap_invalid);
+DEFINE_WB_INVALID_EVENT(xfs_wb_data_iomap_invalid);
+
+DECLARE_EVENT_CLASS(xfs_iomap_invalid_class,
+ TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap),
+ TP_ARGS(ip, iomap),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(u64, addr)
+ __field(loff_t, pos)
+ __field(u64, len)
+ __field(u64, validity_cookie)
+ __field(u64, inodeseq)
+ __field(u16, type)
+ __field(u16, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->addr = iomap->addr;
+ __entry->pos = iomap->offset;
+ __entry->len = iomap->length;
+ __entry->validity_cookie = iomap->validity_cookie;
+ __entry->type = iomap->type;
+ __entry->flags = iomap->flags;
+ __entry->inodeseq = xfs_iomap_inode_sequence(ip, iomap->flags);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx pos 0x%llx addr 0x%llx bytecount 0x%llx type 0x%x flags 0x%x validity_cookie 0x%llx inodeseq 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->pos,
+ __entry->addr,
+ __entry->len,
+ __entry->type,
+ __entry->flags,
+ __entry->validity_cookie,
+ __entry->inodeseq)
+);
+#define DEFINE_IOMAP_INVALID_EVENT(name) \
+DEFINE_EVENT(xfs_iomap_invalid_class, name, \
+ TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap), \
+ TP_ARGS(ip, iomap))
+DEFINE_IOMAP_INVALID_EVENT(xfs_iomap_invalid);
+
/* refcount/reflink tracepoint definitions */
/* reflink tracepoints */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index d3a97a028560..7d4109af193e 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -422,7 +422,7 @@ xfsaild_push(
struct xfs_ail_cursor cur;
struct xfs_log_item *lip;
xfs_lsn_t lsn;
- xfs_lsn_t target;
+ xfs_lsn_t target = NULLCOMMITLSN;
long tout;
int stuck = 0;
int flushing = 0;
@@ -472,6 +472,8 @@ xfsaild_push(
XFS_STATS_INC(mp, xs_push_ail);
+ ASSERT(target != NULLCOMMITLSN);
+
lsn = lip->li_lsn;
while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) {
int lock_result;
@@ -602,9 +604,9 @@ xfsaild(
while (1) {
if (tout && tout <= 20)
- set_current_state(TASK_KILLABLE);
+ set_current_state(TASK_KILLABLE|TASK_FREEZABLE);
else
- set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
/*
* Check kthread_should_stop() after we set the task state to
@@ -653,14 +655,14 @@ xfsaild(
ailp->ail_target == ailp->ail_target_prev &&
list_empty(&ailp->ail_buf_list)) {
spin_unlock(&ailp->ail_lock);
- freezable_schedule();
+ schedule();
tout = 0;
continue;
}
spin_unlock(&ailp->ail_lock);
if (tout)
- freezable_schedule_timeout(msecs_to_jiffies(tout));
+ schedule_timeout(msecs_to_jiffies(tout));
__set_current_state(TASK_RUNNING);
@@ -730,11 +732,10 @@ void
xfs_ail_push_all_sync(
struct xfs_ail *ailp)
{
- struct xfs_log_item *lip;
DEFINE_WAIT(wait);
spin_lock(&ailp->ail_lock);
- while ((lip = xfs_ail_max(ailp)) != NULL) {
+ while (xfs_ail_max(ailp) != NULL) {
prepare_to_wait(&ailp->ail_empty, &wait, TASK_UNINTERRUPTIBLE);
wake_up_process(ailp->ail_task);
spin_unlock(&ailp->ail_lock);
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index c325a28b89a8..10aa1fd39d2b 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -210,7 +210,7 @@ __xfs_xattr_put_listent(
return;
}
offset = context->buffer + context->count;
- strncpy(offset, prefix, prefix_len);
+ memcpy(offset, prefix, prefix_len);
offset += prefix_len;
strncpy(offset, (char *)name, namelen); /* real name */
offset += namelen;
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 860f0b1032c6..2c53fbb8d918 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -41,6 +41,13 @@ static void zonefs_account_active(struct inode *inode)
return;
/*
+ * For zones that transitioned to the offline or readonly condition,
+ * we only need to clear the active state.
+ */
+ if (zi->i_flags & (ZONEFS_ZONE_OFFLINE | ZONEFS_ZONE_READONLY))
+ goto out;
+
+ /*
* If the zone is active, that is, if it is explicitly open or
* partially written, check if it was already accounted as active.
*/
@@ -53,6 +60,7 @@ static void zonefs_account_active(struct inode *inode)
return;
}
+out:
/* The zone is not active. If it was, update the active count */
if (zi->i_flags & ZONEFS_ZONE_ACTIVE) {
zi->i_flags &= ~ZONEFS_ZONE_ACTIVE;
@@ -324,6 +332,7 @@ static loff_t zonefs_check_zone_condition(struct inode *inode,
inode->i_flags |= S_IMMUTABLE;
inode->i_mode &= ~0777;
zone->wp = zone->start;
+ zi->i_flags |= ZONEFS_ZONE_OFFLINE;
return 0;
case BLK_ZONE_COND_READONLY:
/*
@@ -342,8 +351,10 @@ static loff_t zonefs_check_zone_condition(struct inode *inode,
zone->cond = BLK_ZONE_COND_OFFLINE;
inode->i_mode &= ~0777;
zone->wp = zone->start;
+ zi->i_flags |= ZONEFS_ZONE_OFFLINE;
return 0;
}
+ zi->i_flags |= ZONEFS_ZONE_READONLY;
inode->i_mode &= ~0222;
return i_size_read(inode);
case BLK_ZONE_COND_FULL:
@@ -478,8 +489,7 @@ static void __zonefs_io_error(struct inode *inode, bool write)
struct super_block *sb = inode->i_sb;
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
unsigned int noio_flag;
- unsigned int nr_zones =
- zi->i_zone_size >> (sbi->s_zone_sectors_shift + SECTOR_SHIFT);
+ unsigned int nr_zones = 1;
struct zonefs_ioerr_data err = {
.inode = inode,
.write = write,
@@ -487,6 +497,15 @@ static void __zonefs_io_error(struct inode *inode, bool write)
int ret;
/*
+ * The only files that have more than one zone are conventional zone
+ * files with aggregated conventional zones, for which the inode zone
+ * size is always larger than the device zone size.
+ */
+ if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev))
+ nr_zones = zi->i_zone_size >>
+ (sbi->s_zone_sectors_shift + SECTOR_SHIFT);
+
+ /*
* Memory allocations in blkdev_report_zones() can trigger a memory
* reclaim which may in turn cause a recursion into zonefs as well as
* struct request allocations for the same device. The former case may
@@ -1407,6 +1426,14 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
zi->i_ztype = type;
zi->i_zsector = zone->start;
zi->i_zone_size = zone->len << SECTOR_SHIFT;
+ if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT &&
+ !(sbi->s_features & ZONEFS_F_AGGRCNV)) {
+ zonefs_err(sb,
+ "zone size %llu doesn't match device's zone sectors %llu\n",
+ zi->i_zone_size,
+ bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT);
+ return -EINVAL;
+ }
zi->i_max_size = min_t(loff_t, MAX_LFS_FILESIZE,
zone->capacity << SECTOR_SHIFT);
@@ -1456,11 +1483,11 @@ static struct dentry *zonefs_create_inode(struct dentry *parent,
struct inode *dir = d_inode(parent);
struct dentry *dentry;
struct inode *inode;
- int ret;
+ int ret = -ENOMEM;
dentry = d_alloc_name(parent, name);
if (!dentry)
- return NULL;
+ return ERR_PTR(ret);
inode = new_inode(parent->d_sb);
if (!inode)
@@ -1485,7 +1512,7 @@ static struct dentry *zonefs_create_inode(struct dentry *parent,
dput:
dput(dentry);
- return NULL;
+ return ERR_PTR(ret);
}
struct zonefs_zone_data {
@@ -1505,7 +1532,7 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
struct blk_zone *zone, *next, *end;
const char *zgroup_name;
char *file_name;
- struct dentry *dir;
+ struct dentry *dir, *dent;
unsigned int n = 0;
int ret;
@@ -1523,8 +1550,8 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
zgroup_name = "seq";
dir = zonefs_create_inode(sb->s_root, zgroup_name, NULL, type);
- if (!dir) {
- ret = -ENOMEM;
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
goto free;
}
@@ -1570,8 +1597,9 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
* Use the file number within its group as file name.
*/
snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", n);
- if (!zonefs_create_inode(dir, file_name, zone, type)) {
- ret = -ENOMEM;
+ dent = zonefs_create_inode(dir, file_name, zone, type);
+ if (IS_ERR(dent)) {
+ ret = PTR_ERR(dent);
goto free;
}
@@ -1905,18 +1933,18 @@ static int __init zonefs_init(void)
if (ret)
return ret;
- ret = register_filesystem(&zonefs_type);
+ ret = zonefs_sysfs_init();
if (ret)
goto destroy_inodecache;
- ret = zonefs_sysfs_init();
+ ret = register_filesystem(&zonefs_type);
if (ret)
- goto unregister_fs;
+ goto sysfs_exit;
return 0;
-unregister_fs:
- unregister_filesystem(&zonefs_type);
+sysfs_exit:
+ zonefs_sysfs_exit();
destroy_inodecache:
zonefs_destroy_inodecache();
@@ -1925,9 +1953,9 @@ destroy_inodecache:
static void __exit zonefs_exit(void)
{
+ unregister_filesystem(&zonefs_type);
zonefs_sysfs_exit();
zonefs_destroy_inodecache();
- unregister_filesystem(&zonefs_type);
}
MODULE_AUTHOR("Damien Le Moal");
diff --git a/fs/zonefs/sysfs.c b/fs/zonefs/sysfs.c
index 9cb6755ce39a..9920689dc098 100644
--- a/fs/zonefs/sysfs.c
+++ b/fs/zonefs/sysfs.c
@@ -15,11 +15,6 @@ struct zonefs_sysfs_attr {
ssize_t (*show)(struct zonefs_sb_info *sbi, char *buf);
};
-static inline struct zonefs_sysfs_attr *to_attr(struct attribute *attr)
-{
- return container_of(attr, struct zonefs_sysfs_attr, attr);
-}
-
#define ZONEFS_SYSFS_ATTR_RO(name) \
static struct zonefs_sysfs_attr zonefs_sysfs_attr_##name = __ATTR_RO(name)
diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h
index 4b3de66c3233..1dbe78119ff1 100644
--- a/fs/zonefs/zonefs.h
+++ b/fs/zonefs/zonefs.h
@@ -39,8 +39,10 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone)
return ZONEFS_ZTYPE_SEQ;
}
-#define ZONEFS_ZONE_OPEN (1 << 0)
-#define ZONEFS_ZONE_ACTIVE (1 << 1)
+#define ZONEFS_ZONE_OPEN (1U << 0)
+#define ZONEFS_ZONE_ACTIVE (1U << 1)
+#define ZONEFS_ZONE_OFFLINE (1U << 2)
+#define ZONEFS_ZONE_READONLY (1U << 3)
/*
* In-memory inode data.