Diffstat (limited to 'fs')
-rw-r--r-- fs/9p/vfs_addr.c | 5
-rw-r--r-- fs/9p/vfs_dentry.c | 1
-rw-r--r-- fs/9p/vfs_file.c | 13
-rw-r--r-- fs/9p/vfs_super.c | 10
-rw-r--r-- fs/Kconfig | 11
-rw-r--r-- fs/Makefile | 3
-rw-r--r-- fs/adfs/file.c | 2
-rw-r--r-- fs/adfs/inode.c | 9
-rw-r--r-- fs/adfs/super.c | 2
-rw-r--r-- fs/affs/file.c | 28
-rw-r--r-- fs/affs/super.c | 4
-rw-r--r-- fs/afs/addr_prefs.c | 2
-rw-r--r-- fs/afs/cell.c | 1
-rw-r--r-- fs/afs/file.c | 12
-rw-r--r-- fs/afs/super.c | 4
-rw-r--r-- fs/afs/write.c | 9
-rw-r--r-- fs/aio.c | 8
-rw-r--r-- fs/anon_inodes.c | 23
-rw-r--r-- fs/attr.c | 10
-rw-r--r-- fs/autofs/inode.c | 2
-rw-r--r-- fs/backing-file.c | 8
-rw-r--r-- fs/bcachefs/alloc_background.c | 92
-rw-r--r-- fs/bcachefs/alloc_background.h | 9
-rw-r--r-- fs/bcachefs/alloc_foreground.c | 111
-rw-r--r-- fs/bcachefs/alloc_foreground.h | 8
-rw-r--r-- fs/bcachefs/backpointers.c | 70
-rw-r--r-- fs/bcachefs/backpointers.h | 5
-rw-r--r-- fs/bcachefs/bcachefs.h | 87
-rw-r--r-- fs/bcachefs/btree_cache.c | 50
-rw-r--r-- fs/bcachefs/btree_cache.h | 1
-rw-r--r-- fs/bcachefs/btree_gc.c | 163
-rw-r--r-- fs/bcachefs/btree_io.c | 178
-rw-r--r-- fs/bcachefs/btree_iter.c | 253
-rw-r--r-- fs/bcachefs/btree_iter.h | 31
-rw-r--r-- fs/bcachefs/btree_journal_iter.c | 99
-rw-r--r-- fs/bcachefs/btree_journal_iter_types.h | 5
-rw-r--r-- fs/bcachefs/btree_key_cache.c | 28
-rw-r--r-- fs/bcachefs/btree_locking.c | 68
-rw-r--r-- fs/bcachefs/btree_locking.h | 6
-rw-r--r-- fs/bcachefs/btree_node_scan.c | 92
-rw-r--r-- fs/bcachefs/btree_node_scan.h | 2
-rw-r--r-- fs/bcachefs/btree_trans_commit.c | 54
-rw-r--r-- fs/bcachefs/btree_types.h | 32
-rw-r--r-- fs/bcachefs/btree_update.c | 75
-rw-r--r-- fs/bcachefs/btree_update.h | 19
-rw-r--r-- fs/bcachefs/btree_update_interior.c | 153
-rw-r--r-- fs/bcachefs/btree_update_interior.h | 10
-rw-r--r-- fs/bcachefs/btree_write_buffer.c | 14
-rw-r--r-- fs/bcachefs/btree_write_buffer.h | 6
-rw-r--r-- fs/bcachefs/buckets.c | 163
-rw-r--r-- fs/bcachefs/buckets.h | 12
-rw-r--r-- fs/bcachefs/buckets_waiting_for_journal.c | 3
-rw-r--r-- fs/bcachefs/chardev.c | 42
-rw-r--r-- fs/bcachefs/checksum.c | 8
-rw-r--r-- fs/bcachefs/clock.c | 47
-rw-r--r-- fs/bcachefs/clock.h | 1
-rw-r--r-- fs/bcachefs/compress.c | 20
-rw-r--r-- fs/bcachefs/darray.h | 46
-rw-r--r-- fs/bcachefs/data_update.c | 175
-rw-r--r-- fs/bcachefs/debug.c | 41
-rw-r--r-- fs/bcachefs/dirent.c | 180
-rw-r--r-- fs/bcachefs/dirent.h | 25
-rw-r--r-- fs/bcachefs/disk_accounting.c | 42
-rw-r--r-- fs/bcachefs/disk_accounting.h | 6
-rw-r--r-- fs/bcachefs/disk_groups.c | 37
-rw-r--r-- fs/bcachefs/ec.c | 108
-rw-r--r-- fs/bcachefs/errcode.c | 4
-rw-r--r-- fs/bcachefs/errcode.h | 21
-rw-r--r-- fs/bcachefs/error.c | 100
-rw-r--r-- fs/bcachefs/error.h | 12
-rw-r--r-- fs/bcachefs/extent_update.c | 13
-rw-r--r-- fs/bcachefs/extents.c | 79
-rw-r--r-- fs/bcachefs/fs-io-buffered.c | 34
-rw-r--r-- fs/bcachefs/fs-io-buffered.h | 4
-rw-r--r-- fs/bcachefs/fs-io-pagecache.c | 2
-rw-r--r-- fs/bcachefs/fs-io.c | 12
-rw-r--r-- fs/bcachefs/fs-ioctl.c | 4
-rw-r--r-- fs/bcachefs/fs.c | 70
-rw-r--r-- fs/bcachefs/fsck.c | 452
-rw-r--r-- fs/bcachefs/fsck.h | 6
-rw-r--r-- fs/bcachefs/inode.c | 99
-rw-r--r-- fs/bcachefs/inode.h | 14
-rw-r--r-- fs/bcachefs/io_misc.c | 29
-rw-r--r-- fs/bcachefs/io_misc.h | 2
-rw-r--r-- fs/bcachefs/io_read.c | 58
-rw-r--r-- fs/bcachefs/io_read.h | 7
-rw-r--r-- fs/bcachefs/io_write.c | 26
-rw-r--r-- fs/bcachefs/journal.c | 138
-rw-r--r-- fs/bcachefs/journal.h | 7
-rw-r--r-- fs/bcachefs/journal_io.c | 304
-rw-r--r-- fs/bcachefs/journal_io.h | 1
-rw-r--r-- fs/bcachefs/journal_reclaim.c | 50
-rw-r--r-- fs/bcachefs/journal_sb.c | 2
-rw-r--r-- fs/bcachefs/journal_seq_blacklist.c | 4
-rw-r--r-- fs/bcachefs/lru.c | 6
-rw-r--r-- fs/bcachefs/migrate.c | 4
-rw-r--r-- fs/bcachefs/move.c | 132
-rw-r--r-- fs/bcachefs/movinggc.c | 50
-rw-r--r-- fs/bcachefs/movinggc.h | 3
-rw-r--r-- fs/bcachefs/namei.c | 61
-rw-r--r-- fs/bcachefs/opts.h | 10
-rw-r--r-- fs/bcachefs/printbuf.h | 8
-rw-r--r-- fs/bcachefs/quota.c | 6
-rw-r--r-- fs/bcachefs/rcu_pending.c | 22
-rw-r--r-- fs/bcachefs/rebalance.c | 27
-rw-r--r-- fs/bcachefs/rebalance.h | 8
-rw-r--r-- fs/bcachefs/rebalance_types.h | 1
-rw-r--r-- fs/bcachefs/recovery.c | 64
-rw-r--r-- fs/bcachefs/recovery_passes.c | 125
-rw-r--r-- fs/bcachefs/recovery_passes.h | 14
-rw-r--r-- fs/bcachefs/recovery_passes_format.h | 2
-rw-r--r-- fs/bcachefs/reflink.c | 21
-rw-r--r-- fs/bcachefs/replicas.c | 35
-rw-r--r-- fs/bcachefs/sb-counters_format.h | 1
-rw-r--r-- fs/bcachefs/sb-downgrade.c | 7
-rw-r--r-- fs/bcachefs/sb-errors.c | 22
-rw-r--r-- fs/bcachefs/sb-errors.h | 1
-rw-r--r-- fs/bcachefs/sb-errors_format.h | 35
-rw-r--r-- fs/bcachefs/sb-members.c | 55
-rw-r--r-- fs/bcachefs/sb-members.h | 32
-rw-r--r-- fs/bcachefs/six.c | 7
-rw-r--r-- fs/bcachefs/snapshot.c | 162
-rw-r--r-- fs/bcachefs/snapshot.h | 85
-rw-r--r-- fs/bcachefs/str_hash.c | 244
-rw-r--r-- fs/bcachefs/str_hash.h | 26
-rw-r--r-- fs/bcachefs/subvolume.c | 45
-rw-r--r-- fs/bcachefs/super-io.c | 8
-rw-r--r-- fs/bcachefs/super.c | 185
-rw-r--r-- fs/bcachefs/super.h | 1
-rw-r--r-- fs/bcachefs/sysfs.c | 24
-rw-r--r-- fs/bcachefs/trace.h | 188
-rw-r--r-- fs/bcachefs/util.c | 10
-rw-r--r-- fs/bcachefs/util.h | 5
-rw-r--r-- fs/bfs/file.c | 9
-rw-r--r-- fs/binfmt_elf.c | 42
-rw-r--r-- fs/binfmt_elf_fdpic.c | 19
-rw-r--r-- fs/binfmt_misc.c | 44
-rw-r--r-- fs/bpf_fs_kfuncs.c | 34
-rw-r--r-- fs/btrfs/Kconfig | 2
-rw-r--r-- fs/btrfs/accessors.c | 162
-rw-r--r-- fs/btrfs/accessors.h | 37
-rw-r--r-- fs/btrfs/backref.c | 47
-rw-r--r-- fs/btrfs/backref.h | 23
-rw-r--r-- fs/btrfs/bio.c | 24
-rw-r--r-- fs/btrfs/block-group.c | 86
-rw-r--r-- fs/btrfs/block-group.h | 7
-rw-r--r-- fs/btrfs/btrfs_inode.h | 13
-rw-r--r-- fs/btrfs/compression.c | 24
-rw-r--r-- fs/btrfs/compression.h | 9
-rw-r--r-- fs/btrfs/ctree.c | 201
-rw-r--r-- fs/btrfs/ctree.h | 35
-rw-r--r-- fs/btrfs/defrag.c | 80
-rw-r--r-- fs/btrfs/delayed-inode.c | 111
-rw-r--r-- fs/btrfs/delayed-inode.h | 7
-rw-r--r-- fs/btrfs/delayed-ref.c | 10
-rw-r--r-- fs/btrfs/delayed-ref.h | 6
-rw-r--r-- fs/btrfs/dev-replace.c | 18
-rw-r--r-- fs/btrfs/dir-item.c | 4
-rw-r--r-- fs/btrfs/dir-item.h | 2
-rw-r--r-- fs/btrfs/disk-io.c | 56
-rw-r--r-- fs/btrfs/extent-io-tree.c | 20
-rw-r--r-- fs/btrfs/extent-io-tree.h | 9
-rw-r--r-- fs/btrfs/extent-tree.c | 134
-rw-r--r-- fs/btrfs/extent-tree.h | 2
-rw-r--r-- fs/btrfs/extent_io.c | 190
-rw-r--r-- fs/btrfs/extent_io.h | 6
-rw-r--r-- fs/btrfs/extent_map.c | 6
-rw-r--r-- fs/btrfs/fiemap.c | 2
-rw-r--r-- fs/btrfs/file-item.c | 2
-rw-r--r-- fs/btrfs/file.c | 185
-rw-r--r-- fs/btrfs/free-space-cache.c | 8
-rw-r--r-- fs/btrfs/free-space-tree.c | 419
-rw-r--r-- fs/btrfs/free-space-tree.h | 52
-rw-r--r-- fs/btrfs/fs.h | 13
-rw-r--r-- fs/btrfs/inode-item.c | 24
-rw-r--r-- fs/btrfs/inode-item.h | 11
-rw-r--r-- fs/btrfs/inode.c | 508
-rw-r--r-- fs/btrfs/ioctl.c | 181
-rw-r--r-- fs/btrfs/ioctl.h | 6
-rw-r--r-- fs/btrfs/messages.h | 105
-rw-r--r-- fs/btrfs/misc.h | 38
-rw-r--r-- fs/btrfs/ordered-data.c | 2
-rw-r--r-- fs/btrfs/print-tree.c | 4
-rw-r--r-- fs/btrfs/qgroup.c | 362
-rw-r--r-- fs/btrfs/raid-stripe-tree.c | 7
-rw-r--r-- fs/btrfs/rcu-string.h | 58
-rw-r--r-- fs/btrfs/ref-verify.c | 146
-rw-r--r-- fs/btrfs/ref-verify.h | 4
-rw-r--r-- fs/btrfs/reflink.c | 24
-rw-r--r-- fs/btrfs/relocation.c | 140
-rw-r--r-- fs/btrfs/relocation.h | 3
-rw-r--r-- fs/btrfs/scrub.c | 77
-rw-r--r-- fs/btrfs/send.c | 47
-rw-r--r-- fs/btrfs/space-info.c | 14
-rw-r--r-- fs/btrfs/space-info.h | 3
-rw-r--r-- fs/btrfs/subpage.c | 247
-rw-r--r-- fs/btrfs/subpage.h | 59
-rw-r--r-- fs/btrfs/super.c | 291
-rw-r--r-- fs/btrfs/sysfs.c | 78
-rw-r--r-- fs/btrfs/tests/extent-io-tests.c | 28
-rw-r--r-- fs/btrfs/tests/free-space-tree-tests.c | 93
-rw-r--r-- fs/btrfs/tests/inode-tests.c | 24
-rw-r--r-- fs/btrfs/transaction.c | 48
-rw-r--r-- fs/btrfs/tree-checker.c | 12
-rw-r--r-- fs/btrfs/tree-log.c | 617
-rw-r--r-- fs/btrfs/tree-mod-log.c | 81
-rw-r--r-- fs/btrfs/ulist.c | 59
-rw-r--r-- fs/btrfs/volumes.c | 138
-rw-r--r-- fs/btrfs/volumes.h | 38
-rw-r--r-- fs/btrfs/xattr.c | 9
-rw-r--r-- fs/btrfs/zoned.c | 201
-rw-r--r-- fs/btrfs/zoned.h | 3
-rw-r--r-- fs/btrfs/zstd.c | 3
-rw-r--r-- fs/buffer.c | 47
-rw-r--r-- fs/cachefiles/io.c | 18
-rw-r--r-- fs/cachefiles/namei.c | 4
-rw-r--r-- fs/cachefiles/ondemand.c | 4
-rw-r--r-- fs/ceph/addr.c | 31
-rw-r--r-- fs/ceph/caps.c | 18
-rw-r--r-- fs/ceph/crypto.c | 95
-rw-r--r-- fs/ceph/crypto.h | 28
-rw-r--r-- fs/ceph/dir.c | 7
-rw-r--r-- fs/ceph/export.c | 21
-rw-r--r-- fs/ceph/file.c | 29
-rw-r--r-- fs/ceph/inode.c | 3
-rw-r--r-- fs/ceph/mds_client.c | 4
-rw-r--r-- fs/ceph/super.c | 6
-rw-r--r-- fs/ceph/super.h | 2
-rw-r--r-- fs/coda/dir.c | 12
-rw-r--r-- fs/coda/file.c | 6
-rw-r--r-- fs/coda/inode.c | 2
-rw-r--r-- fs/configfs/Kconfig | 1
-rw-r--r-- fs/configfs/dir.c | 1
-rw-r--r-- fs/configfs/mount.c | 3
-rw-r--r-- fs/coredump.c | 863
-rw-r--r-- fs/cramfs/inode.c | 5
-rw-r--r-- fs/crypto/bio.c | 9
-rw-r--r-- fs/crypto/crypto.c | 52
-rw-r--r-- fs/crypto/fname.c | 69
-rw-r--r-- fs/crypto/fscrypt_private.h | 23
-rw-r--r-- fs/crypto/hkdf.c | 4
-rw-r--r-- fs/crypto/hooks.c | 2
-rw-r--r-- fs/crypto/inline_crypt.c | 1
-rw-r--r-- fs/crypto/keyring.c | 5
-rw-r--r-- fs/crypto/keysetup.c | 23
-rw-r--r-- fs/crypto/keysetup_v1.c | 55
-rw-r--r-- fs/crypto/policy.c | 4
-rw-r--r-- fs/d_path.c | 8
-rw-r--r-- fs/dax.c | 72
-rw-r--r-- fs/dcache.c | 163
-rw-r--r-- fs/debugfs/file.c | 89
-rw-r--r-- fs/debugfs/inode.c | 27
-rw-r--r-- fs/debugfs/internal.h | 2
-rw-r--r-- fs/devpts/inode.c | 2
-rw-r--r-- fs/direct-io.c | 10
-rw-r--r-- fs/dlm/lock.c | 2
-rw-r--r-- fs/ecryptfs/file.c | 2
-rw-r--r-- fs/ecryptfs/inode.c | 8
-rw-r--r-- fs/ecryptfs/main.c | 5
-rw-r--r-- fs/ecryptfs/mmap.c | 10
-rw-r--r-- fs/efivarfs/inode.c | 4
-rw-r--r-- fs/efivarfs/super.c | 10
-rw-r--r-- fs/erofs/Kconfig | 2
-rw-r--r-- fs/erofs/data.c | 101
-rw-r--r-- fs/erofs/decompressor.c | 14
-rw-r--r-- fs/erofs/dir.c | 23
-rw-r--r-- fs/erofs/erofs_fs.h | 15
-rw-r--r-- fs/erofs/fileio.c | 19
-rw-r--r-- fs/erofs/fscache.c | 9
-rw-r--r-- fs/erofs/inode.c | 21
-rw-r--r-- fs/erofs/internal.h | 46
-rw-r--r-- fs/erofs/super.c | 23
-rw-r--r-- fs/erofs/sysfs.c | 4
-rw-r--r-- fs/erofs/xattr.c | 56
-rw-r--r-- fs/erofs/xattr.h | 3
-rw-r--r-- fs/erofs/zdata.c | 28
-rw-r--r-- fs/erofs/zmap.c | 159
-rw-r--r-- fs/eventpoll.c | 70
-rw-r--r-- fs/exec.c | 84
-rw-r--r-- fs/exfat/file.c | 21
-rw-r--r-- fs/exfat/inode.c | 16
-rw-r--r-- fs/exfat/super.c | 4
-rw-r--r-- fs/exportfs/expfs.c | 4
-rw-r--r-- fs/ext2/dir.c | 2
-rw-r--r-- fs/ext2/ext2.h | 4
-rw-r--r-- fs/ext2/file.c | 12
-rw-r--r-- fs/ext2/inode.c | 23
-rw-r--r-- fs/ext2/ioctl.c | 4
-rw-r--r-- fs/ext4/balloc.c | 2
-rw-r--r-- fs/ext4/ext4.h | 78
-rw-r--r-- fs/ext4/ext4_extents.h | 7
-rw-r--r-- fs/ext4/extents.c | 72
-rw-r--r-- fs/ext4/file.c | 18
-rw-r--r-- fs/ext4/ialloc.c | 3
-rw-r--r-- fs/ext4/inline.c | 91
-rw-r--r-- fs/ext4/inode.c | 393
-rw-r--r-- fs/ext4/ioctl.c | 4
-rw-r--r-- fs/ext4/mballoc-test.c | 5
-rw-r--r-- fs/ext4/mballoc.c | 895
-rw-r--r-- fs/ext4/mballoc.h | 9
-rw-r--r-- fs/ext4/move_extent.c | 3
-rw-r--r-- fs/ext4/namei.c | 69
-rw-r--r-- fs/ext4/page-io.c | 16
-rw-r--r-- fs/ext4/super.c | 2
-rw-r--r-- fs/ext4/xattr.c | 4
-rw-r--r-- fs/f2fs/data.c | 8
-rw-r--r-- fs/f2fs/f2fs.h | 4
-rw-r--r-- fs/f2fs/file.c | 49
-rw-r--r-- fs/f2fs/node.c | 1
-rw-r--r-- fs/fat/file.c | 2
-rw-r--r-- fs/fat/inode.c | 18
-rw-r--r-- fs/fat/namei_msdos.c | 2
-rw-r--r-- fs/fat/namei_vfat.c | 4
-rw-r--r-- fs/fhandle.c | 62
-rw-r--r-- fs/file.c | 23
-rw-r--r-- fs/file_attr.c | 498
-rw-r--r-- fs/file_table.c | 15
-rw-r--r-- fs/fs_struct.c | 36
-rw-r--r-- fs/fuse/Kconfig | 1
-rw-r--r-- fs/fuse/control.c | 30
-rw-r--r-- fs/fuse/dax.c | 3
-rw-r--r-- fs/fuse/dev.c | 182
-rw-r--r-- fs/fuse/dev_uring.c | 34
-rw-r--r-- fs/fuse/dir.c | 53
-rw-r--r-- fs/fuse/file.c | 797
-rw-r--r-- fs/fuse/fuse_dev_i.h | 9
-rw-r--r-- fs/fuse/fuse_i.h | 21
-rw-r--r-- fs/fuse/inode.c | 21
-rw-r--r-- fs/fuse/ioctl.c | 8
-rw-r--r-- fs/fuse/readdir.c | 36
-rw-r--r-- fs/fuse/virtio_fs.c | 11
-rw-r--r-- fs/gfs2/aops.c | 8
-rw-r--r-- fs/gfs2/bmap.c | 48
-rw-r--r-- fs/gfs2/bmap.h | 1
-rw-r--r-- fs/gfs2/dir.c | 6
-rw-r--r-- fs/gfs2/file.c | 7
-rw-r--r-- fs/gfs2/glock.c | 43
-rw-r--r-- fs/gfs2/glock.h | 10
-rw-r--r-- fs/gfs2/glops.c | 6
-rw-r--r-- fs/gfs2/incore.h | 1
-rw-r--r-- fs/gfs2/inode.c | 7
-rw-r--r-- fs/gfs2/inode.h | 10
-rw-r--r-- fs/gfs2/lock_dlm.c | 9
-rw-r--r-- fs/gfs2/meta_io.c | 10
-rw-r--r-- fs/gfs2/ops_fstype.c | 18
-rw-r--r-- fs/gfs2/super.c | 6
-rw-r--r-- fs/gfs2/sys.c | 1
-rw-r--r-- fs/gfs2/util.c | 31
-rw-r--r-- fs/hfs/bfind.c | 3
-rw-r--r-- fs/hfs/bnode.c | 93
-rw-r--r-- fs/hfs/btree.c | 57
-rw-r--r-- fs/hfs/extent.c | 2
-rw-r--r-- fs/hfs/hfs_fs.h | 3
-rw-r--r-- fs/hfs/inode.c | 7
-rw-r--r-- fs/hfs/super.c | 2
-rw-r--r-- fs/hfsplus/bnode.c | 92
-rw-r--r-- fs/hfsplus/extents.c | 3
-rw-r--r-- fs/hfsplus/hfsplus_fs.h | 10
-rw-r--r-- fs/hfsplus/inode.c | 15
-rw-r--r-- fs/hfsplus/super.c | 8
-rw-r--r-- fs/hfsplus/unicode.c | 7
-rw-r--r-- fs/hfsplus/xattr.c | 6
-rw-r--r-- fs/hostfs/hostfs_kern.c | 12
-rw-r--r-- fs/hpfs/file.c | 20
-rw-r--r-- fs/hpfs/super.c | 2
-rw-r--r-- fs/hugetlbfs/inode.c | 28
-rw-r--r-- fs/inode.c | 13
-rw-r--r-- fs/internal.h | 5
-rw-r--r-- fs/ioctl.c | 309
-rw-r--r-- fs/iomap/Makefile | 6
-rw-r--r-- fs/iomap/buffered-io.c | 557
-rw-r--r-- fs/iomap/direct-io.c | 5
-rw-r--r-- fs/iomap/fiemap.c | 3
-rw-r--r-- fs/iomap/internal.h | 1
-rw-r--r-- fs/iomap/ioend.c | 220
-rw-r--r-- fs/iomap/iter.c | 1
-rw-r--r-- fs/iomap/seek.c | 4
-rw-r--r-- fs/iomap/swapfile.c | 3
-rw-r--r-- fs/iomap/trace.c | 1
-rw-r--r-- fs/iomap/trace.h | 4
-rw-r--r-- fs/isofs/inode.c | 11
-rw-r--r-- fs/jbd2/journal.c | 2
-rw-r--r-- fs/jffs2/erase.c | 4
-rw-r--r-- fs/jffs2/file.c | 30
-rw-r--r-- fs/jffs2/scan.c | 4
-rw-r--r-- fs/jffs2/summary.c | 7
-rw-r--r-- fs/jfs/file.c | 5
-rw-r--r-- fs/jfs/inode.c | 18
-rw-r--r-- fs/jfs/ioctl.c | 4
-rw-r--r-- fs/jfs/jfs_dmap.c | 10
-rw-r--r-- fs/jfs/jfs_inode.h | 4
-rw-r--r-- fs/jfs/jfs_metapage.c | 114
-rw-r--r-- fs/jfs/jfs_xtree.c | 142
-rw-r--r-- fs/jfs/super.c | 2
-rw-r--r-- fs/kernfs/inode.c | 74
-rw-r--r-- fs/kernfs/mount.c | 2
-rw-r--r-- fs/libfs.c | 152
-rw-r--r-- fs/locks.c | 4
-rw-r--r-- fs/minix/dir.c | 2
-rw-r--r-- fs/minix/file.c | 2
-rw-r--r-- fs/minix/inode.c | 7
-rw-r--r-- fs/mount.h | 40
-rw-r--r-- fs/namei.c | 93
-rw-r--r-- fs/namespace.c | 921
-rw-r--r-- fs/netfs/buffered_read.c | 56
-rw-r--r-- fs/netfs/buffered_write.c | 43
-rw-r--r-- fs/netfs/direct_read.c | 16
-rw-r--r-- fs/netfs/direct_write.c | 28
-rw-r--r-- fs/netfs/fscache_io.c | 10
-rw-r--r-- fs/netfs/internal.h | 66
-rw-r--r-- fs/netfs/main.c | 7
-rw-r--r-- fs/netfs/misc.c | 231
-rw-r--r-- fs/netfs/objects.c | 48
-rw-r--r-- fs/netfs/read_collect.c | 213
-rw-r--r-- fs/netfs/read_pgpriv2.c | 9
-rw-r--r-- fs/netfs/read_retry.c | 26
-rw-r--r-- fs/netfs/read_single.c | 6
-rw-r--r-- fs/netfs/write_collect.c | 95
-rw-r--r-- fs/netfs/write_issue.c | 38
-rw-r--r-- fs/netfs/write_retry.c | 22
-rw-r--r-- fs/nfs/blocklayout/rpc_pipefs.c | 53
-rw-r--r-- fs/nfs/client.c | 6
-rw-r--r-- fs/nfs/delegation.c | 25
-rw-r--r-- fs/nfs/file.c | 21
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayout.c | 120
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayoutdev.c | 2
-rw-r--r-- fs/nfs/fscache.c | 1
-rw-r--r-- fs/nfs/inode.c | 68
-rw-r--r-- fs/nfs/internal.h | 3
-rw-r--r-- fs/nfs/localio.c | 51
-rw-r--r-- fs/nfs/nfs42.h | 1
-rw-r--r-- fs/nfs/nfs42proc.c | 29
-rw-r--r-- fs/nfs/nfs42xdr.c | 64
-rw-r--r-- fs/nfs/nfs4_fs.h | 3
-rw-r--r-- fs/nfs/nfs4file.c | 12
-rw-r--r-- fs/nfs/nfs4idmap.c | 14
-rw-r--r-- fs/nfs/nfs4proc.c | 75
-rw-r--r-- fs/nfs/nfs4xdr.c | 1
-rw-r--r-- fs/nfs/pnfs.c | 4
-rw-r--r-- fs/nfs/pnfs_nfs.c | 11
-rw-r--r-- fs/nfs/read.c | 3
-rw-r--r-- fs/nfs/super.c | 21
-rw-r--r-- fs/nfs/sysfs.c | 28
-rw-r--r-- fs/nfs/write.c | 56
-rw-r--r-- fs/nfs_common/nfslocalio.c | 99
-rw-r--r-- fs/nfsd/blocklayout.c | 20
-rw-r--r-- fs/nfsd/blocklayoutxdr.c | 111
-rw-r--r-- fs/nfsd/blocklayoutxdr.h | 8
-rw-r--r-- fs/nfsd/export.c | 8
-rw-r--r-- fs/nfsd/export.h | 2
-rw-r--r-- fs/nfsd/filecache.c | 34
-rw-r--r-- fs/nfsd/filecache.h | 3
-rw-r--r-- fs/nfsd/localio.c | 70
-rw-r--r-- fs/nfsd/nfs3proc.c | 2
-rw-r--r-- fs/nfsd/nfs4callback.c | 1
-rw-r--r-- fs/nfsd/nfs4layouts.c | 4
-rw-r--r-- fs/nfsd/nfs4proc.c | 21
-rw-r--r-- fs/nfsd/nfs4recover.c | 49
-rw-r--r-- fs/nfsd/nfs4state.c | 119
-rw-r--r-- fs/nfsd/nfs4xdr.c | 4
-rw-r--r-- fs/nfsd/nfsctl.c | 70
-rw-r--r-- fs/nfsd/nfsd.h | 6
-rw-r--r-- fs/nfsd/nfsfh.c | 16
-rw-r--r-- fs/nfsd/nfsfh.h | 26
-rw-r--r-- fs/nfsd/nfsproc.c | 2
-rw-r--r-- fs/nfsd/state.h | 1
-rw-r--r-- fs/nfsd/trace.h | 27
-rw-r--r-- fs/nfsd/vfs.c | 24
-rw-r--r-- fs/nfsd/xdr4.h | 1
-rw-r--r-- fs/nilfs2/btree.c | 4
-rw-r--r-- fs/nilfs2/dir.c | 2
-rw-r--r-- fs/nilfs2/direct.c | 3
-rw-r--r-- fs/nilfs2/file.c | 8
-rw-r--r-- fs/nilfs2/inode.c | 17
-rw-r--r-- fs/nilfs2/ioctl.c | 4
-rw-r--r-- fs/nilfs2/mdt.c | 2
-rw-r--r-- fs/nilfs2/nilfs.h | 4
-rw-r--r-- fs/nilfs2/recovery.c | 3
-rw-r--r-- fs/nilfs2/segment.c | 18
-rw-r--r-- fs/nilfs2/segment.h | 1
-rw-r--r-- fs/notify/dnotify/dnotify.c | 8
-rw-r--r-- fs/notify/fanotify/fanotify.c | 8
-rw-r--r-- fs/notify/fsnotify.c | 87
-rw-r--r-- fs/ntfs3/dir.c | 6
-rw-r--r-- fs/ntfs3/file.c | 87
-rw-r--r-- fs/ntfs3/frecord.c | 31
-rw-r--r-- fs/ntfs3/fsntfs.c | 6
-rw-r--r-- fs/ntfs3/inode.c | 98
-rw-r--r-- fs/ntfs3/namei.c | 26
-rw-r--r-- fs/ntfs3/ntfs.h | 3
-rw-r--r-- fs/ntfs3/ntfs_fs.h | 27
-rw-r--r-- fs/ntfs3/super.c | 3
-rw-r--r-- fs/ntfs3/xattr.c | 22
-rw-r--r-- fs/ocfs2/aops.c | 6
-rw-r--r-- fs/ocfs2/cluster/tcp.c | 5
-rw-r--r-- fs/ocfs2/file.c | 4
-rw-r--r-- fs/ocfs2/filecheck.c | 2
-rw-r--r-- fs/ocfs2/ioctl.c | 4
-rw-r--r-- fs/ocfs2/ioctl.h | 4
-rw-r--r-- fs/ocfs2/mmap.c | 5
-rw-r--r-- fs/ocfs2/mmap.h | 2
-rw-r--r-- fs/ocfs2/quota_local.c | 2
-rw-r--r-- fs/ocfs2/stackglue.c | 3
-rw-r--r-- fs/ocfs2/super.c | 2
-rw-r--r-- fs/omfs/file.c | 9
-rw-r--r-- fs/open.c | 12
-rw-r--r-- fs/orangefs/file.c | 10
-rw-r--r-- fs/orangefs/inode.c | 20
-rw-r--r-- fs/orangefs/orangefs-debugfs.c | 8
-rw-r--r-- fs/orangefs/orangefs-sysfs.c | 28
-rw-r--r-- fs/orangefs/super.c | 2
-rw-r--r-- fs/overlayfs/copy_up.c | 58
-rw-r--r-- fs/overlayfs/dir.c | 260
-rw-r--r-- fs/overlayfs/file.c | 4
-rw-r--r-- fs/overlayfs/inode.c | 17
-rw-r--r-- fs/overlayfs/namei.c | 139
-rw-r--r-- fs/overlayfs/overlayfs.h | 63
-rw-r--r-- fs/overlayfs/ovl_entry.h | 3
-rw-r--r-- fs/overlayfs/params.c | 52
-rw-r--r-- fs/overlayfs/readdir.c | 48
-rw-r--r-- fs/overlayfs/super.c | 52
-rw-r--r-- fs/overlayfs/util.c | 57
-rw-r--r-- fs/pidfs.c | 440
-rw-r--r-- fs/pipe.c | 11
-rw-r--r-- fs/pnode.c | 697
-rw-r--r-- fs/pnode.h | 29
-rw-r--r-- fs/proc/base.c | 27
-rw-r--r-- fs/proc/fd.c | 11
-rw-r--r-- fs/proc/generic.c | 12
-rw-r--r-- fs/proc/inode.c | 4
-rw-r--r-- fs/proc/internal.h | 16
-rw-r--r-- fs/proc/meminfo.c | 3
-rw-r--r-- fs/proc/namespaces.c | 3
-rw-r--r-- fs/proc/page.c | 207
-rw-r--r-- fs/proc/proc_sysctl.c | 25
-rw-r--r-- fs/proc/root.c | 10
-rw-r--r-- fs/proc/task_mmu.c | 203
-rw-r--r-- fs/proc/task_nommu.c | 4
-rw-r--r-- fs/pstore/inode.c | 5
-rw-r--r-- fs/ramfs/file-mmu.c | 2
-rw-r--r-- fs/ramfs/file-nommu.c | 12
-rw-r--r-- fs/ramfs/inode.c | 1
-rw-r--r-- fs/read_write.c | 4
-rw-r--r-- fs/resctrl/ctrlmondata.c | 13
-rw-r--r-- fs/resctrl/internal.h | 4
-rw-r--r-- fs/resctrl/monitor.c | 6
-rw-r--r-- fs/resctrl/pseudo_lock.c | 4
-rw-r--r-- fs/resctrl/rdtgroup.c | 10
-rw-r--r-- fs/romfs/mmap-nommu.c | 6
-rw-r--r-- fs/select.c | 4
-rw-r--r-- fs/smb/client/cached_dir.c | 46
-rw-r--r-- fs/smb/client/cached_dir.h | 14
-rw-r--r-- fs/smb/client/cifs_debug.c | 78
-rw-r--r-- fs/smb/client/cifs_ioctl.h | 2
-rw-r--r-- fs/smb/client/cifsencrypt.c | 83
-rw-r--r-- fs/smb/client/cifsfs.c | 17
-rw-r--r-- fs/smb/client/cifsfs.h | 8
-rw-r--r-- fs/smb/client/cifsglob.h | 36
-rw-r--r-- fs/smb/client/cifspdu.h | 6
-rw-r--r-- fs/smb/client/cifsproto.h | 12
-rw-r--r-- fs/smb/client/cifssmb.c | 187
-rw-r--r-- fs/smb/client/connect.c | 109
-rw-r--r-- fs/smb/client/dir.c | 25
-rw-r--r-- fs/smb/client/file.c | 53
-rw-r--r-- fs/smb/client/fs_context.c | 49
-rw-r--r-- fs/smb/client/ioctl.c | 2
-rw-r--r-- fs/smb/client/link.c | 13
-rw-r--r-- fs/smb/client/misc.c | 14
-rw-r--r-- fs/smb/client/namespace.c | 3
-rw-r--r-- fs/smb/client/readdir.c | 30
-rw-r--r-- fs/smb/client/reparse.c | 59
-rw-r--r-- fs/smb/client/reparse.h | 4
-rw-r--r-- fs/smb/client/sess.c | 37
-rw-r--r-- fs/smb/client/smb1ops.c | 31
-rw-r--r-- fs/smb/client/smb2inode.c | 12
-rw-r--r-- fs/smb/client/smb2ops.c | 41
-rw-r--r-- fs/smb/client/smb2pdu.c | 163
-rw-r--r-- fs/smb/client/smb2proto.h | 8
-rw-r--r-- fs/smb/client/smbdirect.c | 549
-rw-r--r-- fs/smb/client/smbdirect.h | 71
-rw-r--r-- fs/smb/client/trace.h | 24
-rw-r--r-- fs/smb/client/transport.c | 14
-rw-r--r-- fs/smb/common/smbdirect/smbdirect.h | 37
-rw-r--r-- fs/smb/common/smbdirect/smbdirect_pdu.h | 55
-rw-r--r-- fs/smb/common/smbdirect/smbdirect_socket.h | 43
-rw-r--r-- fs/smb/server/Kconfig | 1
-rw-r--r-- fs/smb/server/auth.c | 34
-rw-r--r-- fs/smb/server/auth.h | 2
-rw-r--r-- fs/smb/server/connection.c | 2
-rw-r--r-- fs/smb/server/connection.h | 2
-rw-r--r-- fs/smb/server/crypto_ctx.c | 8
-rw-r--r-- fs/smb/server/crypto_ctx.h | 4
-rw-r--r-- fs/smb/server/server.c | 1
-rw-r--r-- fs/smb/server/smb2pdu.c | 268
-rw-r--r-- fs/smb/server/smb2pdu.h | 3
-rw-r--r-- fs/smb/server/transport_rdma.c | 15
-rw-r--r-- fs/smb/server/transport_tcp.c | 9
-rw-r--r-- fs/smb/server/vfs.c | 268
-rw-r--r-- fs/smb/server/vfs.h | 7
-rw-r--r-- fs/smb/server/vfs_cache.h | 1
-rw-r--r-- fs/squashfs/Kconfig | 21
-rw-r--r-- fs/squashfs/block.c | 28
-rw-r--r-- fs/squashfs/super.c | 5
-rw-r--r-- fs/stack.c | 4
-rw-r--r-- fs/super.c | 15
-rw-r--r-- fs/sysfs/file.c | 10
-rw-r--r-- fs/tracefs/inode.c | 28
-rw-r--r-- fs/ubifs/crypto.c | 2
-rw-r--r-- fs/ubifs/file.c | 28
-rw-r--r-- fs/ubifs/ioctl.c | 4
-rw-r--r-- fs/ubifs/journal.c | 2
-rw-r--r-- fs/ubifs/ubifs.h | 4
-rw-r--r-- fs/udf/inode.c | 39
-rw-r--r-- fs/udf/super.c | 13
-rw-r--r-- fs/ufs/dir.c | 2
-rw-r--r-- fs/ufs/file.c | 2
-rw-r--r-- fs/ufs/inode.c | 16
-rw-r--r-- fs/ufs/super.c | 2
-rw-r--r-- fs/userfaultfd.c | 97
-rw-r--r-- fs/vboxsf/file.c | 13
-rw-r--r-- fs/vboxsf/super.c | 2
-rw-r--r-- fs/verity/Kconfig | 6
-rw-r--r-- fs/verity/enable.c | 9
-rw-r--r-- fs/verity/fsverity_private.h | 24
-rw-r--r-- fs/verity/hash_algs.c | 194
-rw-r--r-- fs/verity/measure.c | 1
-rw-r--r-- fs/verity/open.c | 37
-rw-r--r-- fs/verity/read_metadata.c | 1
-rw-r--r-- fs/verity/verify.c | 8
-rw-r--r-- fs/xattr.c | 3
-rw-r--r-- fs/xfs/libxfs/xfs_alloc.c | 41
-rw-r--r-- fs/xfs/libxfs/xfs_alloc_btree.c | 52
-rw-r--r-- fs/xfs/libxfs/xfs_bmap_btree.c | 32
-rw-r--r-- fs/xfs/libxfs/xfs_btree.c | 33
-rw-r--r-- fs/xfs/libxfs/xfs_btree.h | 41
-rw-r--r-- fs/xfs/libxfs/xfs_format.h | 2
-rw-r--r-- fs/xfs/libxfs/xfs_group.c | 17
-rw-r--r-- fs/xfs/libxfs/xfs_ialloc.c | 31
-rw-r--r-- fs/xfs/libxfs/xfs_ialloc_btree.c | 24
-rw-r--r-- fs/xfs/libxfs/xfs_log_recover.h | 4
-rw-r--r-- fs/xfs/libxfs/xfs_refcount.c | 4
-rw-r--r-- fs/xfs/libxfs/xfs_refcount_btree.c | 18
-rw-r--r-- fs/xfs/libxfs/xfs_rmap_btree.c | 67
-rw-r--r-- fs/xfs/libxfs/xfs_rtrefcount_btree.c | 18
-rw-r--r-- fs/xfs/libxfs/xfs_rtrmap_btree.c | 67
-rw-r--r-- fs/xfs/scrub/btree.c | 2
-rw-r--r-- fs/xfs/scrub/common.c | 7
-rw-r--r-- fs/xfs/scrub/common.h | 2
-rw-r--r-- fs/xfs/scrub/dir_repair.c | 8
-rw-r--r-- fs/xfs/scrub/fscounters.c | 3
-rw-r--r-- fs/xfs/scrub/metapath.c | 4
-rw-r--r-- fs/xfs/scrub/nlinks.c | 8
-rw-r--r-- fs/xfs/scrub/nlinks_repair.c | 4
-rw-r--r-- fs/xfs/scrub/parent_repair.c | 12
-rw-r--r-- fs/xfs/scrub/quotacheck.c | 4
-rw-r--r-- fs/xfs/scrub/rcbag_btree.c | 38
-rw-r--r-- fs/xfs/scrub/repair.c | 36
-rw-r--r-- fs/xfs/scrub/repair.h | 4
-rw-r--r-- fs/xfs/scrub/rmap_repair.c | 14
-rw-r--r-- fs/xfs/scrub/rtrmap_repair.c | 14
-rw-r--r-- fs/xfs/scrub/scrub.c | 5
-rw-r--r-- fs/xfs/scrub/trace.h | 2
-rw-r--r-- fs/xfs/xfs_aops.c | 228
-rw-r--r-- fs/xfs/xfs_attr_item.c | 148
-rw-r--r-- fs/xfs/xfs_attr_item.h | 8
-rw-r--r-- fs/xfs/xfs_bmap_item.c | 18
-rw-r--r-- fs/xfs/xfs_buf.c | 53
-rw-r--r-- fs/xfs/xfs_buf.h | 9
-rw-r--r-- fs/xfs/xfs_buf_item.c | 303
-rw-r--r-- fs/xfs/xfs_buf_item.h | 5
-rw-r--r-- fs/xfs/xfs_buf_item_recover.c | 38
-rw-r--r-- fs/xfs/xfs_discard.c | 41
-rw-r--r-- fs/xfs/xfs_dquot.c | 4
-rw-r--r-- fs/xfs/xfs_dquot_item_recover.c | 20
-rw-r--r-- fs/xfs/xfs_exchmaps_item.c | 8
-rw-r--r-- fs/xfs/xfs_extent_busy.h | 8
-rw-r--r-- fs/xfs/xfs_extfree_item.c | 59
-rw-r--r-- fs/xfs/xfs_file.c | 56
-rw-r--r-- fs/xfs/xfs_fsmap.c | 4
-rw-r--r-- fs/xfs/xfs_icache.c | 13
-rw-r--r-- fs/xfs/xfs_icreate_item.c | 2
-rw-r--r-- fs/xfs/xfs_inode.c | 9
-rw-r--r-- fs/xfs/xfs_inode.h | 2
-rw-r--r-- fs/xfs/xfs_inode_item.c | 11
-rw-r--r-- fs/xfs/xfs_inode_item.h | 4
-rw-r--r-- fs/xfs/xfs_inode_item_recover.c | 26
-rw-r--r-- fs/xfs/xfs_ioctl.c | 21
-rw-r--r-- fs/xfs/xfs_ioctl.h | 4
-rw-r--r-- fs/xfs/xfs_iomap.c | 14
-rw-r--r-- fs/xfs/xfs_iomap.h | 1
-rw-r--r-- fs/xfs/xfs_iops.c | 6
-rw-r--r-- fs/xfs/xfs_itable.c | 18
-rw-r--r-- fs/xfs/xfs_iwalk.c | 11
-rw-r--r-- fs/xfs/xfs_log.c | 16
-rw-r--r-- fs/xfs/xfs_log.h | 16
-rw-r--r-- fs/xfs/xfs_log_cil.c | 75
-rw-r--r-- fs/xfs/xfs_log_priv.h | 4
-rw-r--r-- fs/xfs/xfs_log_recover.c | 16
-rw-r--r-- fs/xfs/xfs_mount.c | 102
-rw-r--r-- fs/xfs/xfs_mount.h | 17
-rw-r--r-- fs/xfs/xfs_mru_cache.c | 19
-rw-r--r-- fs/xfs/xfs_notify_failure.c | 9
-rw-r--r-- fs/xfs/xfs_qm.c | 96
-rw-r--r-- fs/xfs/xfs_refcount_item.c | 34
-rw-r--r-- fs/xfs/xfs_reflink.c | 3
-rw-r--r-- fs/xfs/xfs_rmap_item.c | 34
-rw-r--r-- fs/xfs/xfs_rtalloc.c | 15
-rw-r--r-- fs/xfs/xfs_super.c | 5
-rw-r--r-- fs/xfs/xfs_trace.h | 121
-rw-r--r-- fs/xfs/xfs_trans.c | 211
-rw-r--r-- fs/xfs/xfs_trans.h | 4
-rw-r--r-- fs/xfs/xfs_xattr.c | 2
-rw-r--r-- fs/xfs/xfs_zone_alloc.c | 87
-rw-r--r-- fs/xfs/xfs_zone_alloc.h | 4
-rw-r--r-- fs/xfs/xfs_zone_gc.c | 20
-rw-r--r-- fs/xfs/xfs_zone_info.c | 2
-rw-r--r-- fs/xfs/xfs_zone_priv.h | 16
-rw-r--r-- fs/xfs/xfs_zone_space_resv.c | 17
-rw-r--r-- fs/zonefs/file.c | 50
-rw-r--r-- fs/zonefs/super.c | 5
720 files changed, 18205 insertions, 14899 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 1286d96a29bc..862164181bac 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -59,7 +59,7 @@ static void v9fs_issue_write(struct netfs_io_subrequest *subreq)
len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err);
if (len > 0)
__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
- netfs_write_subrequest_terminated(subreq, len ?: err, false);
+ netfs_write_subrequest_terminated(subreq, len ?: err);
}
/**
@@ -77,7 +77,8 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
/* if we just extended the file size, any portion not in
* cache won't be on server and is zeroes */
- if (subreq->rreq->origin != NETFS_DIO_READ)
+ if (subreq->rreq->origin != NETFS_UNBUFFERED_READ &&
+ subreq->rreq->origin != NETFS_DIO_READ)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
if (pos + total >= i_size_read(rreq->inode))
__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 5061f192eafd..04795508a795 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -127,7 +127,6 @@ const struct dentry_operations v9fs_cached_dentry_operations = {
};
const struct dentry_operations v9fs_dentry_operations = {
- .d_delete = always_delete_dentry,
.d_release = v9fs_dentry_release,
.d_unalias_trylock = v9fs_dentry_unalias_trylock,
.d_unalias_unlock = v9fs_dentry_unalias_unlock,
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 348cc90bf9c5..eb0b083da269 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -454,9 +454,10 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
}
static int
-v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma)
+v9fs_file_mmap_prepare(struct vm_area_desc *desc)
{
int retval;
+ struct file *filp = desc->file;
struct inode *inode = file_inode(filp);
struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
@@ -464,12 +465,12 @@ v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma)
if (!(v9ses->cache & CACHE_WRITEBACK)) {
p9_debug(P9_DEBUG_CACHE, "(read-only mmap mode)");
- return generic_file_readonly_mmap(filp, vma);
+ return generic_file_readonly_mmap_prepare(desc);
}
- retval = generic_file_mmap(filp, vma);
+ retval = generic_file_mmap_prepare(desc);
if (!retval)
- vma->vm_ops = &v9fs_mmap_file_vm_ops;
+ desc->vm_ops = &v9fs_mmap_file_vm_ops;
return retval;
}
@@ -516,7 +517,7 @@ const struct file_operations v9fs_file_operations = {
.open = v9fs_file_open,
.release = v9fs_dir_release,
.lock = v9fs_file_lock,
- .mmap = generic_file_readonly_mmap,
+ .mmap_prepare = generic_file_readonly_mmap_prepare,
.splice_read = v9fs_file_splice_read,
.splice_write = iter_file_splice_write,
.fsync = v9fs_file_fsync,
@@ -531,7 +532,7 @@ const struct file_operations v9fs_file_operations_dotl = {
.release = v9fs_dir_release,
.lock = v9fs_file_lock_dotl,
.flock = v9fs_file_flock_dotl,
- .mmap = v9fs_file_mmap,
+ .mmap_prepare = v9fs_file_mmap_prepare,
.splice_read = v9fs_file_splice_read,
.splice_write = iter_file_splice_write,
.fsync = v9fs_file_fsync_dotl,
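
The hunks above are one instance of a conversion that recurs throughout this diff (adfs, affs, afs, aio and others below): the file_operations ".mmap" hook becomes ".mmap_prepare", which receives a "struct vm_area_desc" describing the mapping-to-be instead of a live vm_area_struct. A minimal sketch of the pattern, with "myfs" names as illustrative placeholders rather than anything from the patch:

    static int myfs_file_mmap_prepare(struct vm_area_desc *desc)
    {
            struct inode *inode = file_inode(desc->file);   /* was: file_inode(filp) */
            int ret;

            if (myfs_is_readonly(inode))                    /* hypothetical predicate */
                    return generic_file_readonly_mmap_prepare(desc);

            ret = generic_file_mmap_prepare(desc);          /* replaces generic_file_mmap() */
            if (!ret)
                    desc->vm_ops = &myfs_vm_ops;            /* was: vma->vm_ops = ... */
            return ret;
    }

    const struct file_operations myfs_file_operations = {
            .mmap_prepare   = myfs_file_mmap_prepare,       /* was: .mmap */
    };
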
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 489db161abc9..795c6388744c 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -134,10 +134,12 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
if (retval)
goto release_sb;
- if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
- sb->s_d_op = &v9fs_cached_dentry_operations;
- else
- sb->s_d_op = &v9fs_dentry_operations;
+ if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
+ set_default_d_op(sb, &v9fs_cached_dentry_operations);
+ } else {
+ set_default_d_op(sb, &v9fs_dentry_operations);
+ sb->s_d_flags |= DCACHE_DONTCACHE;
+ }
inode = v9fs_get_new_inode_from_fid(v9ses, fid, sb);
if (IS_ERR(inode)) {
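
Here and in many super.c hunks below, the open-coded "sb->s_d_op = ops" assignment becomes set_default_d_op(sb, ops). The helper's definition is not part of this diff; going by the call sites, it presumably centralizes the assignment so default dentry behaviour (including flags like the new sb->s_d_flags use above) lives in one place. A rough sketch of that presumed shape, not the real definition:

    static inline void set_default_d_op(struct super_block *sb,
                                        const struct dentry_operations *ops)
    {
            sb->s_d_op = ops;       /* what callers previously wrote directly */
            /* plus, presumably, derivation of default dentry flags from ops */
    }
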
diff --git a/fs/Kconfig b/fs/Kconfig
index 44b6cdd36dc1..c654a3642897 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -59,7 +59,7 @@ endif # BLOCK
config FS_DAX
bool "File system based Direct Access (DAX) support"
depends on MMU
- depends on ZONE_DEVICE || FS_DAX_LIMITED
+ depends on ZONE_DEVICE
select FS_IOMAP
select DAX
help
@@ -95,13 +95,6 @@ config FS_DAX_PMD
depends on ZONE_DEVICE
depends on TRANSPARENT_HUGEPAGE
-# Selected by DAX drivers that do not expect filesystem DAX to support
-# get_user_pages() of DAX mappings. I.e. "limited" indicates no support
-# for fork() of processes with MAP_SHARED mappings or support for
-# direct-I/O to a DAX mapping.
-config FS_DAX_LIMITED
- bool
-
# Posix ACL utility routines
#
# Note: Posix ACLs can be implemented without these helpers. Never use
@@ -256,7 +249,7 @@ config ARCH_SUPPORTS_HUGETLBFS
menuconfig HUGETLBFS
bool "HugeTLB file system support"
- depends on X86 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
+ depends on ARCH_SUPPORTS_HUGETLBFS
depends on (SYSFS || SYSCTL)
select MEMFD_CREATE
select PADATA if SMP
diff --git a/fs/Makefile b/fs/Makefile
index 79c08b914c47..334654f9584b 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -15,7 +15,8 @@ obj-y := open.o read_write.o file_table.o super.o \
pnode.o splice.o sync.o utimes.o d_path.o \
stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
- kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o
+ kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \
+ file_attr.o
obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o
obj-$(CONFIG_PROC_FS) += proc_namespace.o
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index ee80718aaeec..cd13165fd904 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -25,7 +25,7 @@
const struct file_operations adfs_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.fsync = generic_file_fsync,
.write_iter = generic_file_write_iter,
.splice_read = filemap_splice_read,
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 21527189e430..6830f8bc8d4e 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -53,13 +53,14 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to)
truncate_pagecache(inode, inode->i_size);
}
-static int adfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int adfs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
int ret;
- ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata,
+ ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata,
adfs_get_block,
&ADFS_I(mapping->host)->mmu_private);
if (unlikely(ret))
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 017c48a80203..fdccdbbfc213 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -397,7 +397,7 @@ static int adfs_fill_super(struct super_block *sb, struct fs_context *fc)
if (asb->s_ftsuffix)
asb->s_namelen += 4;
- sb->s_d_op = &adfs_dentry_operations;
+ set_default_d_op(sb, &adfs_dentry_operations);
root = adfs_iget(sb, &root_obj);
sb->s_root = d_make_root(root);
if (!sb->s_root) {
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 7a71018e3f67..765c3443663e 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -415,13 +415,14 @@ affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
return ret;
}
-static int affs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int affs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
int ret;
- ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata,
+ ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata,
affs_get_block,
&AFFS_I(mapping->host)->mmu_private);
if (unlikely(ret))
@@ -430,14 +431,15 @@ static int affs_write_begin(struct file *file, struct address_space *mapping,
return ret;
}
-static int affs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned int len, unsigned int copied,
+static int affs_write_end(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos,
+ unsigned int len, unsigned int copied,
struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
int ret;
- ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
/* Clear Archived bit on file writes, as AmigaOS would do */
if (AFFS_I(inode)->i_protect & FIBF_ARCHIVED) {
@@ -645,7 +647,8 @@ static int affs_read_folio_ofs(struct file *file, struct folio *folio)
return err;
}
-static int affs_write_begin_ofs(struct file *file, struct address_space *mapping,
+static int affs_write_begin_ofs(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
@@ -684,9 +687,10 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
return err;
}
-static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+static int affs_write_end_ofs(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
struct super_block *sb = inode->i_sb;
@@ -999,7 +1003,7 @@ const struct file_operations affs_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.open = affs_file_open,
.release = affs_file_release,
.fsync = affs_file_fsync,
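
As in the adfs hunks earlier, the write_begin/write_end address_space operations now take a "const struct kiocb *" rather than a "struct file *", and pass the kiocb straight through to cont_write_begin()/generic_write_end(). A hedged sketch of the converted shape; recovering the file via iocb->ki_filp is an assumption here (none of the hunks above need it), and myfs_get_block/MYFS_I are hypothetical names:

    static int myfs_write_begin(const struct kiocb *iocb,
                                struct address_space *mapping,
                                loff_t pos, unsigned len,
                                struct folio **foliop, void **fsdata)
    {
            /* assumption: ki_filp recovers the file when one is needed */
            struct file *file = iocb ? iocb->ki_filp : NULL;

            (void)file;
            return cont_write_begin(iocb, mapping, pos, len, foliop, fsdata,
                                    myfs_get_block,
                                    &MYFS_I(mapping->host)->mmu_private);
    }
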
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 2fa40337776d..44f8aa883100 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -500,9 +500,9 @@ got_root:
return PTR_ERR(root_inode);
if (affs_test_opt(AFFS_SB(sb)->s_flags, SF_INTL))
- sb->s_d_op = &affs_intl_dentry_operations;
+ set_default_d_op(sb, &affs_intl_dentry_operations);
else
- sb->s_d_op = &affs_dentry_operations;
+ set_default_d_op(sb, &affs_dentry_operations);
sb->s_root = d_make_root(root_inode);
if (!sb->s_root) {
diff --git a/fs/afs/addr_prefs.c b/fs/afs/addr_prefs.c
index c0384201b8fe..133736412c3d 100644
--- a/fs/afs/addr_prefs.c
+++ b/fs/afs/addr_prefs.c
@@ -48,7 +48,7 @@ static int afs_split_string(char **pbuf, char *strv[], unsigned int maxstrv)
strv[count++] = p;
/* Skip over word */
- while (!isspace(*p))
+ while (!isspace(*p) && *p)
p++;
if (!*p)
break;
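
The added "&& *p" is the whole fix: isspace('\0') is false, so when the final word of the buffer has no trailing whitespace the old loop walked straight past the terminating NUL. A standalone illustration of the corrected scan:

    #include <ctype.h>

    /* Scanning "last" (no trailing whitespace): the old condition
     * `!isspace(*p)` never becomes false at the NUL, so p runs off the
     * end of the buffer; testing *p as well stops at the terminator. */
    static const char *skip_word(const char *p)
    {
            while (!isspace((unsigned char)*p) && *p)
                    p++;
            return p;       /* stops at whitespace or at the NUL, never beyond */
    }
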
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 0168bbf53fe0..f31359922e98 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -177,6 +177,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
VL_SERVICE, AFS_VL_PORT);
if (IS_ERR(vllist)) {
ret = PTR_ERR(vllist);
+ vllist = NULL;
goto parse_failed;
}
diff --git a/fs/afs/file.c b/fs/afs/file.c
index fc15497608c6..f66a92294284 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -19,7 +19,7 @@
#include <trace/events/netfs.h>
#include "internal.h"
-static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
+static int afs_file_mmap_prepare(struct vm_area_desc *desc);
static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter);
static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos,
@@ -35,7 +35,7 @@ const struct file_operations afs_file_operations = {
.llseek = generic_file_llseek,
.read_iter = afs_file_read_iter,
.write_iter = netfs_file_write_iter,
- .mmap = afs_file_mmap,
+ .mmap_prepare = afs_file_mmap_prepare,
.splice_read = afs_file_splice_read,
.splice_write = iter_file_splice_write,
.fsync = afs_fsync,
@@ -492,16 +492,16 @@ static void afs_drop_open_mmap(struct afs_vnode *vnode)
/*
* Handle setting up a memory mapping on an AFS file.
*/
-static int afs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int afs_file_mmap_prepare(struct vm_area_desc *desc)
{
- struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
+ struct afs_vnode *vnode = AFS_FS_I(file_inode(desc->file));
int ret;
afs_add_open_mmap(vnode);
- ret = generic_file_mmap(file, vma);
+ ret = generic_file_mmap_prepare(desc);
if (ret == 0)
- vma->vm_ops = &afs_vm_ops;
+ desc->vm_ops = &afs_vm_ops;
else
afs_drop_open_mmap(vnode);
return ret;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 25b306db6992..da407f2d6f0d 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -483,9 +483,9 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
goto error;
if (as->dyn_root) {
- sb->s_d_op = &afs_dynroot_dentry_operations;
+ set_default_d_op(sb, &afs_dynroot_dentry_operations);
} else {
- sb->s_d_op = &afs_fs_dentry_operations;
+ set_default_d_op(sb, &afs_fs_dentry_operations);
rcu_assign_pointer(as->volume->sb, sb);
}
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 18b0a9f1615e..2e7526ea883a 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -120,17 +120,17 @@ static void afs_issue_write_worker(struct work_struct *work)
#if 0 // Error injection
if (subreq->debug_index == 3)
- return netfs_write_subrequest_terminated(subreq, -ENOANO, false);
+ return netfs_write_subrequest_terminated(subreq, -ENOANO);
if (!subreq->retry_count) {
set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
- return netfs_write_subrequest_terminated(subreq, -EAGAIN, false);
+ return netfs_write_subrequest_terminated(subreq, -EAGAIN);
}
#endif
op = afs_alloc_operation(wreq->netfs_priv, vnode->volume);
if (IS_ERR(op))
- return netfs_write_subrequest_terminated(subreq, -EAGAIN, false);
+ return netfs_write_subrequest_terminated(subreq, -EAGAIN);
afs_op_set_vnode(op, 0, vnode);
op->file[0].dv_delta = 1;
@@ -166,7 +166,7 @@ static void afs_issue_write_worker(struct work_struct *work)
break;
}
- netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len, false);
+ netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len);
}
void afs_issue_write(struct netfs_io_subrequest *subreq)
@@ -202,6 +202,7 @@ void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *st
case NETFS_READ_GAPS:
case NETFS_READ_SINGLE:
case NETFS_READ_FOR_WRITE:
+ case NETFS_UNBUFFERED_READ:
case NETFS_DIO_READ:
return;
default:
diff --git a/fs/aio.c b/fs/aio.c
index 793b7b15ec4b..7fc7b6221312 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -392,15 +392,15 @@ static const struct vm_operations_struct aio_ring_vm_ops = {
#endif
};
-static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
+static int aio_ring_mmap_prepare(struct vm_area_desc *desc)
{
- vm_flags_set(vma, VM_DONTEXPAND);
- vma->vm_ops = &aio_ring_vm_ops;
+ desc->vm_flags |= VM_DONTEXPAND;
+ desc->vm_ops = &aio_ring_vm_ops;
return 0;
}
static const struct file_operations aio_ring_fops = {
- .mmap = aio_ring_mmap,
+ .mmap_prepare = aio_ring_mmap_prepare,
};
#if IS_ENABLED(CONFIG_MIGRATION)
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index e51e7d88980a..1d847a939f29 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -98,14 +98,25 @@ static struct file_system_type anon_inode_fs_type = {
.kill_sb = kill_anon_super,
};
-static struct inode *anon_inode_make_secure_inode(
- const char *name,
- const struct inode *context_inode)
+/**
+ * anon_inode_make_secure_inode - allocate an anonymous inode with security context
+ * @sb: [in] Superblock to allocate from
+ * @name: [in] Name of the class of the newfile (e.g., "secretmem")
+ * @context_inode:
+ * [in] Optional parent inode for security inheritance
+ *
+ * The function ensures proper security initialization through the LSM hook
+ * security_inode_init_security_anon().
+ *
+ * Return: Pointer to new inode on success, ERR_PTR on failure.
+ */
+struct inode *anon_inode_make_secure_inode(struct super_block *sb, const char *name,
+ const struct inode *context_inode)
{
struct inode *inode;
int error;
- inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
+ inode = alloc_anon_inode(sb);
if (IS_ERR(inode))
return inode;
inode->i_flags &= ~S_PRIVATE;
@@ -118,6 +129,7 @@ static struct inode *anon_inode_make_secure_inode(
}
return inode;
}
+EXPORT_SYMBOL_GPL_FOR_MODULES(anon_inode_make_secure_inode, "kvm");
static struct file *__anon_inode_getfile(const char *name,
const struct file_operations *fops,
@@ -132,7 +144,8 @@ static struct file *__anon_inode_getfile(const char *name,
return ERR_PTR(-ENOENT);
if (make_inode) {
- inode = anon_inode_make_secure_inode(name, context_inode);
+ inode = anon_inode_make_secure_inode(anon_inode_mnt->mnt_sb,
+ name, context_inode);
if (IS_ERR(inode)) {
file = ERR_CAST(inode);
goto err;
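
With the superblock now a parameter and the restricted export in place (EXPORT_SYMBOL_GPL_FOR_MODULES limits it to the "kvm" module), an external caller can mint secure anonymous inodes on its own mount. A hedged usage sketch; the mount variable and the "kvm-gmem" class name are illustrative, not taken from this patch:

    static struct inode *my_alloc_secure_inode(struct vfsmount *mnt)
    {
            struct inode *inode;

            inode = anon_inode_make_secure_inode(mnt->mnt_sb, "kvm-gmem", NULL);
            if (IS_ERR(inode))
                    return inode;   /* LSM refused or allocation failed */
            /* ... bind the inode to a struct file, set i_size, etc ... */
            return inode;
    }
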
diff --git a/fs/attr.c b/fs/attr.c
index 9caf63d20d03..5425c1dbbff9 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -230,7 +230,7 @@ EXPORT_SYMBOL(setattr_prepare);
* @inode: the inode to be truncated
* @offset: the new size to assign to the inode
*
- * inode_newsize_ok must be called with i_mutex held.
+ * inode_newsize_ok must be called with i_rwsem held exclusively.
*
* inode_newsize_ok will check filesystem limits and ulimits to check that the
* new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
@@ -318,7 +318,7 @@ static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr)
* @inode: the inode to be updated
* @attr: the new attributes
*
- * setattr_copy must be called with i_mutex held.
+ * setattr_copy must be called with i_rwsem held exclusively.
*
* setattr_copy updates the inode's metadata with that specified
* in attr on idmapped mounts. Necessary permission checks to determine
@@ -403,13 +403,13 @@ EXPORT_SYMBOL(may_setattr);
* @attr: new attributes
* @delegated_inode: returns inode, if the inode is delegated
*
- * The caller must hold the i_mutex on the affected object.
+ * The caller must hold the i_rwsem exclusively on the affected object.
*
* If notify_change discovers a delegation in need of breaking,
* it will return -EWOULDBLOCK and return a reference to the inode in
* delegated_inode. The caller should then break the delegation and
* retry. Because breaking a delegation may take a long time, the
- * caller should drop the i_mutex before doing so.
+ * caller should drop the i_rwsem before doing so.
*
* Alternatively, a caller may pass NULL for delegated_inode. This may
* be appropriate for callers that expect the underlying filesystem not
@@ -456,7 +456,7 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
- /* Flag setting protected by i_mutex */
+ /* Flag setting protected by i_rwsem */
if (is_sxid(attr->ia_mode))
inode->i_flags &= ~S_NOSEC;
}
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index ee2edccaef70..f5c16ffba013 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -311,7 +311,7 @@ static int autofs_fill_super(struct super_block *s, struct fs_context *fc)
s->s_blocksize_bits = 10;
s->s_magic = AUTOFS_SUPER_MAGIC;
s->s_op = &autofs_sops;
- s->s_d_op = &autofs_dentry_operations;
+ set_default_d_op(s, &autofs_dentry_operations);
s->s_time_gran = 1;
/*
diff --git a/fs/backing-file.c b/fs/backing-file.c
index 763fbe9b72b2..15a7f8031084 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -41,7 +41,7 @@ struct file *backing_file_open(const struct path *user_path, int flags,
return f;
path_get(user_path);
- *backing_file_user_path(f) = *user_path;
+ backing_file_set_user_path(f, user_path);
error = vfs_open(real_path, f);
if (error) {
fput(f);
@@ -65,7 +65,7 @@ struct file *backing_tmpfile_open(const struct path *user_path, int flags,
return f;
path_get(user_path);
- *backing_file_user_path(f) = *user_path;
+ backing_file_set_user_path(f, user_path);
error = vfs_tmpfile(real_idmap, real_parentpath, f, mode);
if (error) {
fput(f);
@@ -333,13 +333,13 @@ int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
return -EIO;
- if (!file->f_op->mmap)
+ if (!can_mmap_file(file))
return -ENODEV;
vma_set_file(vma, file);
old_cred = override_creds(ctx->cred);
- ret = call_mmap(vma->vm_file, vma);
+ ret = vfs_mmap(vma->vm_file, vma);
revert_creds(old_cred);
if (ctx->accessed)
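
Replacing the raw "file->f_op->mmap" test and call_mmap() with can_mmap_file() and vfs_mmap() ties into the ".mmap_prepare" migration running through this diff: a file may now implement either hook, and an open-coded check against ->mmap alone would wrongly reject converted filesystems. The wrappers are not defined in this excerpt; a plausible sketch of the check:

    /* Plausible shape only -- accepts either the legacy or the new hook;
     * vfs_mmap() then presumably dispatches to whichever one exists. */
    static inline bool can_mmap_file(struct file *file)
    {
            return file->f_op->mmap_prepare || file->f_op->mmap;
    }
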
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 173e81c2bbcb..66de46318620 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -21,7 +21,6 @@
#include "error.h"
#include "lru.h"
#include "recovery.h"
-#include "trace.h"
#include "varint.h"
#include <linux/kthread.h>
@@ -337,11 +336,10 @@ void bch2_alloc_v4_swab(struct bkey_s k)
a->stripe_sectors = swab32(a->stripe_sectors);
}
-void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+static inline void __bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c,
+ unsigned dev, const struct bch_alloc_v4 *a)
{
- struct bch_alloc_v4 _a;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
- struct bch_dev *ca = c ? bch2_dev_bucket_tryget_noerror(c, k.k->p) : NULL;
+ struct bch_dev *ca = c ? bch2_dev_tryget_noerror(c, dev) : NULL;
prt_newline(out);
printbuf_indent_add(out, 2);
@@ -369,6 +367,19 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
bch2_dev_put(ca);
}
+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bch_alloc_v4 _a;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
+
+ __bch2_alloc_v4_to_text(out, c, k.k->p.inode, a);
+}
+
+void bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ __bch2_alloc_v4_to_text(out, c, k.k->p.inode, bkey_s_c_to_alloc_v4(k).v);
+}
+
void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
{
if (k.k->type == KEY_TYPE_alloc_v4) {
@@ -697,8 +708,8 @@ static int __need_discard_or_freespace_err(struct btree_trans *trans,
set ? "" : "un",
bch2_btree_id_str(btree),
buf.buf);
- if (ret == -BCH_ERR_fsck_ignore ||
- ret == -BCH_ERR_fsck_errors_not_fixed)
+ if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) ||
+ bch2_err_matches(ret, BCH_ERR_fsck_errors_not_fixed))
ret = 0;
printbuf_exit(&buf);
@@ -854,7 +865,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
if (!ca)
- return -BCH_ERR_trigger_alloc;
+ return bch_err_throw(c, trigger_alloc);
struct bch_alloc_v4 old_a_convert;
const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
@@ -988,14 +999,11 @@ int bch2_trigger_alloc(struct btree_trans *trans,
}
if (new_a->gen != old_a->gen) {
- rcu_read_lock();
+ guard(rcu)();
u8 *gen = bucket_gen(ca, new.k->p.offset);
- if (unlikely(!gen)) {
- rcu_read_unlock();
+ if (unlikely(!gen))
goto invalid_bucket;
- }
*gen = new_a->gen;
- rcu_read_unlock();
}
#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; })
@@ -1021,15 +1029,12 @@ int bch2_trigger_alloc(struct btree_trans *trans,
}
if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) {
- rcu_read_lock();
+ guard(rcu)();
struct bucket *g = gc_bucket(ca, new.k->p.offset);
- if (unlikely(!g)) {
- rcu_read_unlock();
+ if (unlikely(!g))
goto invalid_bucket;
- }
g->gen_valid = 1;
g->gen = new_a->gen;
- rcu_read_unlock();
}
err:
fsck_err:
@@ -1039,7 +1044,7 @@ fsck_err:
invalid_bucket:
bch2_fs_inconsistent(c, "reference to invalid bucket\n%s",
(bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
- ret = -BCH_ERR_trigger_alloc;
+ ret = bch_err_throw(c, trigger_alloc);
goto err;
}
@@ -1105,13 +1110,12 @@ static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *buck
bucket->offset = 0;
}
- rcu_read_lock();
+ guard(rcu)();
*ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
if (*ca) {
*bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket);
bch2_dev_get(*ca);
}
- rcu_read_unlock();
return *ca != NULL;
}
@@ -1402,6 +1406,9 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite
: BCH_DATA_free;
struct printbuf buf = PRINTBUF;
+ unsigned fsck_flags = (async_repair ? FSCK_ERR_NO_LOG : 0)|
+ FSCK_CAN_FIX|FSCK_CAN_IGNORE;
+
struct bpos bucket = iter->pos;
bucket.offset &= ~(~0ULL << 56);
u64 genbits = iter->pos.offset & (~0ULL << 56);
@@ -1415,9 +1422,10 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite
return ret;
if (!bch2_dev_bucket_exists(c, bucket)) {
- if (fsck_err(trans, need_discard_freespace_key_to_invalid_dev_bucket,
- "entry in %s btree for nonexistant dev:bucket %llu:%llu",
- bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset))
+ if (__fsck_err(trans, fsck_flags,
+ need_discard_freespace_key_to_invalid_dev_bucket,
+ "entry in %s btree for nonexistant dev:bucket %llu:%llu",
+ bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset))
goto delete;
ret = 1;
goto out;
@@ -1429,7 +1437,8 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite
if (a->data_type != state ||
(state == BCH_DATA_free &&
genbits != alloc_freespace_genbits(*a))) {
- if (fsck_err(trans, need_discard_freespace_key_bad,
+ if (__fsck_err(trans, fsck_flags,
+ need_discard_freespace_key_bad,
"%s\nincorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
bch2_btree_id_str(iter->btree_id),
@@ -1454,7 +1463,7 @@ delete:
ret = bch2_btree_bit_mod_iter(trans, iter, false) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc) ?:
- -BCH_ERR_transaction_restart_commit;
+ bch_err_throw(c, transaction_restart_commit);
goto out;
} else {
/*
@@ -1777,14 +1786,16 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress)
{
+ struct bch_fs *c = ca->fs;
int ret;
mutex_lock(&ca->discard_buckets_in_flight_lock);
- darray_for_each(ca->discard_buckets_in_flight, i)
- if (i->bucket == bucket) {
- ret = -BCH_ERR_EEXIST_discard_in_flight_add;
- goto out;
- }
+ struct discard_in_flight *i =
+ darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket);
+ if (i) {
+ ret = bch_err_throw(c, EEXIST_discard_in_flight_add);
+ goto out;
+ }
ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) {
.in_progress = in_progress,
@@ -1798,14 +1809,11 @@ out:
static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket)
{
mutex_lock(&ca->discard_buckets_in_flight_lock);
- darray_for_each(ca->discard_buckets_in_flight, i)
- if (i->bucket == bucket) {
- BUG_ON(!i->in_progress);
- darray_remove_item(&ca->discard_buckets_in_flight, i);
- goto found;
- }
- BUG();
-found:
+ struct discard_in_flight *i =
+ darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket);
+ BUG_ON(!i || !i->in_progress);
+
+ darray_remove_item(&ca->discard_buckets_in_flight, i);
mutex_unlock(&ca->discard_buckets_in_flight_lock);
}
@@ -2504,7 +2512,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
lockdep_assert_held(&c->state_lock);
- rcu_read_lock();
+ guard(rcu)();
for_each_member_device_rcu(c, ca, NULL) {
struct block_device *bdev = READ_ONCE(ca->disk_sb.bdev);
if (bdev)
@@ -2549,7 +2557,6 @@ void bch2_recalc_capacity(struct bch_fs *c)
bucket_size_max = max_t(unsigned, bucket_size_max,
ca->mi.bucket_size);
}
- rcu_read_unlock();
bch2_set_ra_pages(c, ra_pages);
@@ -2574,10 +2581,9 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *c)
{
u64 ret = U64_MAX;
- rcu_read_lock();
+ guard(rcu)();
for_each_rw_member_rcu(c, ca)
ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
- rcu_read_unlock();
return ret;
}
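
The two discard_in_flight hunks above replace open-coded search loops with darray_find_p(). Its definition is not in this excerpt (the diffstat shows fs/bcachefs/darray.h changing), but from the call sites it is evidently a statement expression that yields a pointer to the first element matching the condition, or NULL. A presumed shape, inferred from the usage:

    #define darray_find_p(_d, _i, _cond)                            \
    ({                                                              \
            typeof((_d).data) _i;                                   \
            darray_for_each(_d, _i)                                 \
                    if (_cond)                                      \
                            break;                                  \
            _i != (_d).data + (_d).nr ? _i : NULL;                  \
    })
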
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index 4f94c6a661bf..0cc5adc55b6f 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -13,11 +13,9 @@
static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos)
{
- rcu_read_lock();
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode);
- bool ret = ca && bucket_valid(ca, pos.offset);
- rcu_read_unlock();
- return ret;
+ return ca && bucket_valid(ca, pos.offset);
}
static inline u64 bucket_to_u64(struct bpos bucket)
@@ -253,6 +251,7 @@ int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c,
struct bkey_validate_context);
void bch2_alloc_v4_swab(struct bkey_s);
void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+void bch2_alloc_v4_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_alloc ((struct bkey_ops) { \
.key_validate = bch2_alloc_v1_validate, \
@@ -277,7 +276,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \
.key_validate = bch2_alloc_v4_validate, \
- .val_to_text = bch2_alloc_to_text, \
+ .val_to_text = bch2_alloc_v4_to_text, \
.swab = bch2_alloc_v4_swab, \
.trigger = bch2_trigger_alloc, \
.min_val_size = 48, \
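
Throughout these bcachefs hunks, rcu_read_lock()/rcu_read_unlock() pairs become guard(rcu)() and scoped_guard(rcu), the scope-based helpers from <linux/cleanup.h>: the read-side lock is dropped automatically when the enclosing scope exits, which is what lets the rewritten bch2_dev_bucket_exists() above return its result directly. Before/after shapes of the conversion:

    static bool exists_old(struct bch_fs *c, struct bpos pos)
    {
            rcu_read_lock();
            struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode);
            bool ret = ca && bucket_valid(ca, pos.offset);
            rcu_read_unlock();      /* must be paired on every path */
            return ret;
    }

    static bool exists_new(struct bch_fs *c, struct bpos pos)
    {
            guard(rcu)();           /* unlocked automatically at scope exit */
            struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode);
            return ca && bucket_valid(ca, pos.offset);
    }
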
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 1a52c12c51ae..b58525ec7b4d 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -69,10 +69,9 @@ const char * const bch2_watermarks[] = {
void bch2_reset_alloc_cursors(struct bch_fs *c)
{
- rcu_read_lock();
+ guard(rcu)();
for_each_member_device_rcu(c, ca, NULL)
memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor));
- rcu_read_unlock();
}
static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
@@ -166,9 +165,8 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
ARRAY_SIZE(c->open_buckets_partial));
spin_lock(&c->freelist_lock);
- rcu_read_lock();
- bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++;
- rcu_read_unlock();
+ scoped_guard(rcu)
+ bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++;
ob->on_partial_list = true;
c->open_buckets_partial[c->open_buckets_partial_nr++] =
@@ -229,7 +227,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c,
track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
spin_unlock(&c->freelist_lock);
- return ERR_PTR(-BCH_ERR_open_buckets_empty);
+ return ERR_PTR(bch_err_throw(c, open_buckets_empty));
}
/* Recheck under lock: */
@@ -513,7 +511,8 @@ again:
bch2_dev_usage_read_fast(ca, &req->usage);
avail = dev_buckets_free(ca, req->usage, req->watermark);
- if (req->usage.buckets[BCH_DATA_need_discard] > avail)
+ if (req->usage.buckets[BCH_DATA_need_discard] >
+ min(avail, ca->mi.nbuckets >> 7))
bch2_dev_do_discards(ca);
if (req->usage.buckets[BCH_DATA_need_gc_gens] > avail)
@@ -535,7 +534,7 @@ again:
track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
- ob = ERR_PTR(-BCH_ERR_freelist_empty);
+ ob = ERR_PTR(bch_err_throw(c, freelist_empty));
goto err;
}
@@ -560,7 +559,7 @@ alloc:
}
err:
if (!ob)
- ob = ERR_PTR(-BCH_ERR_no_buckets_found);
+ ob = ERR_PTR(bch_err_throw(c, no_buckets_found));
if (!IS_ERR(ob))
ob->data_type = req->data_type;
@@ -603,18 +602,18 @@ static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
-struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
- struct dev_stripe_state *stripe,
- struct bch_devs_mask *devs)
+void bch2_dev_alloc_list(struct bch_fs *c,
+ struct dev_stripe_state *stripe,
+ struct bch_devs_mask *devs,
+ struct dev_alloc_list *ret)
{
- struct dev_alloc_list ret = { .nr = 0 };
- unsigned i;
+ ret->nr = 0;
+ unsigned i;
for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
- ret.data[ret.nr++] = i;
+ ret->data[ret->nr++] = i;
- bubble_sort(ret.data, ret.nr, dev_stripe_cmp);
- return ret;
+ bubble_sort(ret->data, ret->nr, dev_stripe_cmp);
}
static const u64 stripe_clock_hand_rescale = 1ULL << 62; /* trigger rescale at */
@@ -705,18 +704,19 @@ static int add_new_bucket(struct bch_fs *c,
return 0;
}
-int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
- struct alloc_request *req,
- struct dev_stripe_state *stripe,
- struct closure *cl)
+inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
+ struct alloc_request *req,
+ struct dev_stripe_state *stripe,
+ struct closure *cl)
{
struct bch_fs *c = trans->c;
- int ret = -BCH_ERR_insufficient_devices;
+ int ret = 0;
BUG_ON(req->nr_effective >= req->nr_replicas);
- struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc);
- darray_for_each(devs_sorted, i) {
+ bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc, &req->devs_sorted);
+
+ darray_for_each(req->devs_sorted, i) {
req->ca = bch2_dev_tryget_noerror(c, *i);
if (!req->ca)
continue;
@@ -739,13 +739,16 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
continue;
}
- if (add_new_bucket(c, req, ob)) {
- ret = 0;
+ ret = add_new_bucket(c, req, ob);
+ if (ret)
break;
- }
}
- return ret;
+ if (ret == 1)
+ return 0;
+ if (ret)
+ return ret;
+ return bch_err_throw(c, insufficient_devices);
}
/* Allocate from stripes: */
@@ -776,9 +779,9 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
if (!h)
return 0;
- struct dev_alloc_list devs_sorted =
- bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc);
- darray_for_each(devs_sorted, i)
+ bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc, &req->devs_sorted);
+
+ darray_for_each(req->devs_sorted, i)
for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
if (!h->s->blocks[ec_idx])
continue;
@@ -872,9 +875,8 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
i);
ob->on_partial_list = false;
- rcu_read_lock();
- bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
- rcu_read_unlock();
+ scoped_guard(rcu)
+ bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
ret = add_new_bucket(c, req, ob);
if (ret)
@@ -1056,9 +1058,8 @@ void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
ob->on_partial_list = false;
- rcu_read_lock();
- bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
- rcu_read_unlock();
+ scoped_guard(rcu)
+ bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
spin_unlock(&c->freelist_lock);
bch2_open_bucket_put(c, ob);
@@ -1086,14 +1087,11 @@ static struct write_point *__writepoint_find(struct hlist_head *head,
{
struct write_point *wp;
- rcu_read_lock();
+ guard(rcu)();
hlist_for_each_entry_rcu(wp, head, node)
if (wp->write_point == write_point)
- goto out;
- wp = NULL;
-out:
- rcu_read_unlock();
- return wp;
+ return wp;
+ return NULL;
}
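guard(rcu)() comes from <linux/cleanup.h>: it takes the RCU read lock and
arranges for rcu_read_unlock() to run automatically on every exit from the
enclosing scope, which is what lets the goto-out dance above collapse into
early returns. A minimal self-contained sketch of the same shape (struct foo
and lookup_foo() are hypothetical):

	#include <linux/cleanup.h>
	#include <linux/rculist.h>

	struct foo {
		struct hlist_node	node;
		unsigned long		key;
	};

	static struct foo *lookup_foo(struct hlist_head *head, unsigned long key)
	{
		struct foo *f;

		guard(rcu)();	/* rcu_read_unlock() runs at each return */
		hlist_for_each_entry_rcu(f, head, node)
			if (f->key == key)
				return f;
		return NULL;
	}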
static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
@@ -1104,7 +1102,7 @@ static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
return stranded * factor > free;
}
-static bool try_increase_writepoints(struct bch_fs *c)
+static noinline bool try_increase_writepoints(struct bch_fs *c)
{
struct write_point *wp;
@@ -1117,7 +1115,7 @@ static bool try_increase_writepoints(struct bch_fs *c)
return true;
}
-static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
+static noinline bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
{
struct bch_fs *c = trans->c;
struct write_point *wp;
@@ -1379,11 +1377,11 @@ err:
goto retry;
if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
- ret = -BCH_ERR_bucket_alloc_blocked;
+ ret = bch_err_throw(c, bucket_alloc_blocked);
if (cl && !(flags & BCH_WRITE_alloc_nowait) &&
bch2_err_matches(ret, BCH_ERR_freelist_empty))
- ret = -BCH_ERR_bucket_alloc_blocked;
+ ret = bch_err_throw(c, bucket_alloc_blocked);
return ret;
}
@@ -1637,19 +1635,16 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
bch2_printbuf_make_room(&buf, 4096);
- rcu_read_lock();
buf.atomic++;
-
- for_each_online_member_rcu(c, ca) {
- prt_printf(&buf, "Dev %u:\n", ca->dev_idx);
- printbuf_indent_add(&buf, 2);
- bch2_dev_alloc_debug_to_text(&buf, ca);
- printbuf_indent_sub(&buf, 2);
- prt_newline(&buf);
- }
-
+ scoped_guard(rcu)
+ for_each_online_member_rcu(c, ca) {
+ prt_printf(&buf, "Dev %u:\n", ca->dev_idx);
+ printbuf_indent_add(&buf, 2);
+ bch2_dev_alloc_debug_to_text(&buf, ca);
+ printbuf_indent_sub(&buf, 2);
+ prt_newline(&buf);
+ }
--buf.atomic;
- rcu_read_unlock();
prt_printf(&buf, "Copygc debug:\n");
printbuf_indent_add(&buf, 2);
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index 2e01c7b61ed1..1b3fc8460096 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -42,6 +42,7 @@ struct alloc_request {
struct bch_devs_mask devs_may_alloc;
/* bch2_bucket_alloc_set_trans(): */
+ struct dev_alloc_list devs_sorted;
struct bch_dev_usage usage;
/* bch2_bucket_alloc_trans(): */
@@ -71,9 +72,10 @@ struct alloc_request {
struct bch_devs_mask scratch_devs_may_alloc;
};
-struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
- struct dev_stripe_state *,
- struct bch_devs_mask *);
+void bch2_dev_alloc_list(struct bch_fs *,
+ struct dev_stripe_state *,
+ struct bch_devs_mask *,
+ struct dev_alloc_list *);
void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *);
static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob)
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index cde7dd115267..77d93beb3c8f 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -48,17 +48,19 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke
{
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode);
- if (ca) {
- u32 bucket_offset;
- struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset);
- rcu_read_unlock();
+ struct bch_dev *ca;
+ u32 bucket_offset;
+ struct bpos bucket;
+ scoped_guard(rcu) {
+ ca = bch2_dev_rcu_noerror(c, bp.k->p.inode);
+ if (ca)
+ bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset);
+ }
+
+ if (ca)
prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset);
- } else {
- rcu_read_unlock();
+ else
prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT);
- }
bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level);
prt_str(out, " data_type=");
@@ -140,7 +142,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
}
if (!will_check && __bch2_inconsistent_error(c, &buf))
- ret = -BCH_ERR_erofs_unfixed_errors;
+ ret = bch_err_throw(c, erofs_unfixed_errors);
bch_err(c, "%s", buf.buf);
printbuf_exit(&buf);
@@ -293,7 +295,7 @@ static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans,
return b;
if (btree_node_will_make_reachable(b)) {
- b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
+ b = ERR_PTR(bch_err_throw(c, backpointer_to_overwritten_btree_node));
} else {
int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key),
last_flushed, commit);
@@ -591,6 +593,7 @@ check_existing_bp:
bkey_for_each_ptr(other_extent_ptrs, ptr)
if (ptr->dev == bp->k.p.inode &&
dev_ptr_stale_rcu(ca, ptr)) {
+ rcu_read_unlock();
ret = drop_dev_and_update(trans, other_bp.v->btree_id,
other_extent, bp->k.p.inode);
if (ret)
@@ -648,7 +651,7 @@ check_existing_bp:
prt_newline(&buf);
bch2_bkey_val_to_text(&buf, c, other_extent);
bch_err(c, "%s", buf.buf);
- ret = -BCH_ERR_fsck_repair_unimplemented;
+ ret = bch_err_throw(c, fsck_repair_unimplemented);
goto err;
missing:
printbuf_reset(&buf);
@@ -679,26 +682,23 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
if (p.ptr.dev == BCH_SB_MEMBER_INVALID)
continue;
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
- if (!ca) {
- rcu_read_unlock();
- continue;
- }
+ bool empty;
+ {
+		/* scoped_guard() expands to a loop, so it would break the continues below (see the sketch after this hunk) */
+ guard(rcu)();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
+ if (!ca)
+ continue;
- if (p.ptr.cached && dev_ptr_stale_rcu(ca, &p.ptr)) {
- rcu_read_unlock();
- continue;
- }
+ if (p.ptr.cached && dev_ptr_stale_rcu(ca, &p.ptr))
+ continue;
- u64 b = PTR_BUCKET_NR(ca, &p.ptr);
- if (!bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b)) {
- rcu_read_unlock();
- continue;
- }
+ u64 b = PTR_BUCKET_NR(ca, &p.ptr);
+ if (!bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b))
+ continue;
- bool empty = bch2_bucket_bitmap_test(&ca->bucket_backpointer_empty, b);
- rcu_read_unlock();
+ empty = bch2_bucket_bitmap_test(&ca->bucket_backpointer_empty, b);
+ }
struct bkey_i_backpointer bp;
bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp);
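Why the plain { guard(rcu)(); ... } block instead of scoped_guard(rcu): a
sketch of the pitfall the in-code comment alludes to, assuming cleanup.h's
one-shot for-loop expansion of scoped_guard() (should_skip() is hypothetical):

	bkey_for_each_ptr(ptrs, ptr) {
		scoped_guard(rcu) {
			if (should_skip(ptr))
				continue;	/* continues the hidden guard
						 * loop, NOT bkey_for_each_ptr() */
		}
		/* ...so control still reaches here for a "skipped" ptr */
	}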
@@ -953,7 +953,7 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
sectors[ALLOC_cached] > a->cached_sectors ||
sectors[ALLOC_stripe] > a->stripe_sectors) {
ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?:
- -BCH_ERR_transaction_restart_nested;
+ bch_err_throw(c, transaction_restart_nested);
goto err;
}
@@ -981,7 +981,7 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k)
case KEY_TYPE_btree_ptr_v2: {
bool ret = false;
- rcu_read_lock();
+ guard(rcu)();
struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key;
while (pos.inode <= k.k->p.inode) {
if (pos.inode >= c->sb.nr_devices)
@@ -1009,7 +1009,6 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k)
next:
pos = SPOS(pos.inode + 1, 0, 0);
}
- rcu_read_unlock();
return ret;
}
@@ -1352,7 +1351,7 @@ static int bch2_bucket_bitmap_set(struct bch_dev *ca, struct bucket_bitmap *b, u
b->buckets = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets),
sizeof(unsigned long), GFP_KERNEL);
if (!b->buckets)
- return -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap;
+ return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap);
}
b->nr += !__test_and_set_bit(bit, b->buckets);
@@ -1361,7 +1360,8 @@ static int bch2_bucket_bitmap_set(struct bch_dev *ca, struct bucket_bitmap *b, u
return 0;
}
-int bch2_bucket_bitmap_resize(struct bucket_bitmap *b, u64 old_size, u64 new_size)
+int bch2_bucket_bitmap_resize(struct bch_dev *ca, struct bucket_bitmap *b,
+ u64 old_size, u64 new_size)
{
scoped_guard(mutex, &b->lock) {
if (!b->buckets)
@@ -1370,7 +1370,7 @@ int bch2_bucket_bitmap_resize(struct bucket_bitmap *b, u64 old_size, u64 new_siz
unsigned long *n = kvcalloc(BITS_TO_LONGS(new_size),
sizeof(unsigned long), GFP_KERNEL);
if (!n)
- return -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap;
+ return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap);
memcpy(n, b->buckets,
BITS_TO_LONGS(min(old_size, new_size)) * sizeof(unsigned long));
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index 6840561084ce..7e71afee1ac0 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -53,11 +53,10 @@ static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca,
static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket)
{
- rcu_read_lock();
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode);
if (ca)
*bucket = bp_pos_to_bucket(ca, bp_pos);
- rcu_read_unlock();
return ca != NULL;
}
@@ -195,7 +194,7 @@ static inline bool bch2_bucket_bitmap_test(struct bucket_bitmap *b, u64 i)
return bitmap && test_bit(i, bitmap);
}
-int bch2_bucket_bitmap_resize(struct bucket_bitmap *, u64, u64);
+int bch2_bucket_bitmap_resize(struct bch_dev *, struct bucket_bitmap *, u64, u64);
void bch2_bucket_bitmap_free(struct bucket_bitmap *);
#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 7824da2af9d0..ddfacad0f70c 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -183,6 +183,16 @@
#define pr_fmt(fmt) "%s() " fmt "\n", __func__
#endif
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define ENUMERATED_REF_DEBUG
+#endif
+
+#ifndef dynamic_fault
+#define dynamic_fault(...) 0
+#endif
+
+#define race_fault(...) dynamic_fault("bcachefs:race")
+
#include <linux/backing-dev-defs.h>
#include <linux/bug.h>
#include <linux/bio.h>
@@ -219,15 +229,30 @@
#include "time_stats.h"
#include "util.h"
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define ENUMERATED_REF_DEBUG
-#endif
-
-#ifndef dynamic_fault
-#define dynamic_fault(...) 0
-#endif
+#include "alloc_types.h"
+#include "async_objs_types.h"
+#include "btree_gc_types.h"
+#include "btree_types.h"
+#include "btree_node_scan_types.h"
+#include "btree_write_buffer_types.h"
+#include "buckets_types.h"
+#include "buckets_waiting_for_journal_types.h"
+#include "clock_types.h"
+#include "disk_groups_types.h"
+#include "ec_types.h"
+#include "enumerated_ref_types.h"
+#include "journal_types.h"
+#include "keylist_types.h"
+#include "quota_types.h"
+#include "rebalance_types.h"
+#include "recovery_passes_types.h"
+#include "replicas_types.h"
+#include "sb-members_types.h"
+#include "subvolume_types.h"
+#include "super_types.h"
+#include "thread_with_file_types.h"
-#define race_fault(...) dynamic_fault("bcachefs:race")
+#include "trace.h"
#define count_event(_c, _name) this_cpu_inc((_c)->counters[BCH_COUNTER_##_name])
@@ -271,7 +296,6 @@ do { \
#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
void bch2_print_str(struct bch_fs *, const char *, const char *);
-void bch2_print_str_nonblocking(struct bch_fs *, const char *, const char *);
__printf(2, 3)
void bch2_print_opts(struct bch_opts *, const char *, ...);
@@ -380,6 +404,14 @@ do { \
pr_info(fmt, ##__VA_ARGS__); \
} while (0)
+static inline int __bch2_err_trace(struct bch_fs *c, int err)
+{
+ trace_error_throw(c, err, _THIS_IP_);
+ return err;
+}
+
+#define bch_err_throw(_c, _err) __bch2_err_trace(_c, -BCH_ERR_##_err)
+
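bch_err_throw() returns the same negative BCH_ERR_* value callers previously
wrote by hand, but fires trace_error_throw() with the filesystem and call IP
first - which is why the conversions throughout this series are mechanical and
don't change control flow. Before/after, taken from the btree_cache.c hunk
below:

	if (!b->data)
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;		/* old */

	if (!b->data)
		return bch_err_throw(c, ENOMEM_btree_node_mem_alloc);	/* new */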
/* Parameters that are useful for debugging, but should always be compiled in: */
#define BCH_DEBUG_PARAMS_ALWAYS() \
BCH_DEBUG_PARAM(key_merging_disabled, \
@@ -486,29 +518,6 @@ enum bch_time_stats {
BCH_TIME_STAT_NR
};
-#include "alloc_types.h"
-#include "async_objs_types.h"
-#include "btree_gc_types.h"
-#include "btree_types.h"
-#include "btree_node_scan_types.h"
-#include "btree_write_buffer_types.h"
-#include "buckets_types.h"
-#include "buckets_waiting_for_journal_types.h"
-#include "clock_types.h"
-#include "disk_groups_types.h"
-#include "ec_types.h"
-#include "enumerated_ref_types.h"
-#include "journal_types.h"
-#include "keylist_types.h"
-#include "quota_types.h"
-#include "rebalance_types.h"
-#include "recovery_passes_types.h"
-#include "replicas_types.h"
-#include "sb-members_types.h"
-#include "subvolume_types.h"
-#include "super_types.h"
-#include "thread_with_file_types.h"
-
/* Number of nodes btree coalesce will try to coalesce at once */
#define GC_MERGE_NODES 4U
@@ -758,7 +767,8 @@ struct btree_trans_buf {
x(sysfs) \
x(btree_write_buffer) \
x(btree_node_scrub) \
- x(async_recovery_passes)
+ x(async_recovery_passes) \
+ x(ioctl_data)
enum bch_write_ref {
#define x(n) BCH_WRITE_REF_##n,
@@ -853,9 +863,7 @@ struct bch_fs {
DARRAY(enum bcachefs_metadata_version)
incompat_versions_requested;
-#ifdef CONFIG_UNICODE
struct unicode_map *cf_encoding;
-#endif
struct bch_sb_handle disk_sb;
@@ -1275,4 +1283,13 @@ static inline bool bch2_discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca
: ca->mi.discard;
}
+static inline bool bch2_fs_casefold_enabled(struct bch_fs *c)
+{
+#ifdef CONFIG_UNICODE
+ return !c->opts.casefold_disabled;
+#else
+ return false;
+#endif
+}
+
#endif /* _BCACHEFS_H */
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 8557cbd3d818..83c9860e6b82 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -85,7 +85,7 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
six_unlock_intent(&b->c.lock);
}
-static void __btree_node_data_free(struct btree_cache *bc, struct btree *b)
+void __btree_node_data_free(struct btree *b)
{
BUG_ON(!list_empty(&b->list));
BUG_ON(btree_node_hashed(b));
@@ -112,16 +112,17 @@ static void __btree_node_data_free(struct btree_cache *bc, struct btree *b)
munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
b->aux_data = NULL;
-
- btree_node_to_freedlist(bc, b);
}
static void btree_node_data_free(struct btree_cache *bc, struct btree *b)
{
BUG_ON(list_empty(&b->list));
list_del_init(&b->list);
+
+ __btree_node_data_free(b);
+
--bc->nr_freeable;
- __btree_node_data_free(bc, b);
+ btree_node_to_freedlist(bc, b);
}
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
@@ -149,7 +150,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
b->data = kvmalloc(btree_buf_bytes(b), gfp);
if (!b->data)
- return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
+ return bch_err_throw(c, ENOMEM_btree_node_mem_alloc);
#ifdef __KERNEL__
b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
#else
@@ -162,7 +163,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
if (!b->aux_data) {
kvfree(b->data);
b->data = NULL;
- return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
+ return bch_err_throw(c, ENOMEM_btree_node_mem_alloc);
}
return 0;
@@ -185,10 +186,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
-
- b = __btree_node_mem_alloc(c, GFP_KERNEL);
+ struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL);
if (!b)
return NULL;
@@ -198,8 +196,6 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
}
bch2_btree_lock_init(&b->c, 0, GFP_KERNEL);
-
- __bch2_btree_node_to_freelist(bc, b);
return b;
}
@@ -353,21 +349,21 @@ static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b,
if (btree_node_noevict(b)) {
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++;
- return -BCH_ERR_ENOMEM_btree_node_reclaim;
+ return bch_err_throw(c, ENOMEM_btree_node_reclaim);
}
if (btree_node_write_blocked(b)) {
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++;
- return -BCH_ERR_ENOMEM_btree_node_reclaim;
+ return bch_err_throw(c, ENOMEM_btree_node_reclaim);
}
if (btree_node_will_make_reachable(b)) {
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++;
- return -BCH_ERR_ENOMEM_btree_node_reclaim;
+ return bch_err_throw(c, ENOMEM_btree_node_reclaim);
}
if (btree_node_dirty(b)) {
if (!flush) {
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++;
- return -BCH_ERR_ENOMEM_btree_node_reclaim;
+ return bch_err_throw(c, ENOMEM_btree_node_reclaim);
}
if (locked) {
@@ -393,7 +389,7 @@ static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b,
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++;
else if (btree_node_write_in_flight(b))
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++;
- return -BCH_ERR_ENOMEM_btree_node_reclaim;
+ return bch_err_throw(c, ENOMEM_btree_node_reclaim);
}
if (locked)
@@ -424,13 +420,13 @@ retry_unlocked:
if (!six_trylock_intent(&b->c.lock)) {
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_intent]++;
- return -BCH_ERR_ENOMEM_btree_node_reclaim;
+ return bch_err_throw(c, ENOMEM_btree_node_reclaim);
}
if (!six_trylock_write(&b->c.lock)) {
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_write]++;
six_unlock_intent(&b->c.lock);
- return -BCH_ERR_ENOMEM_btree_node_reclaim;
+ return bch_err_throw(c, ENOMEM_btree_node_reclaim);
}
/* recheck under lock */
@@ -524,7 +520,8 @@ restart:
--touched;
} else if (!btree_node_reclaim(c, b)) {
__bch2_btree_node_hash_remove(bc, b);
- __btree_node_data_free(bc, b);
+ __btree_node_data_free(b);
+ btree_node_to_freedlist(bc, b);
freed++;
bc->nr_freed++;
@@ -652,9 +649,12 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
bch2_recalc_btree_reserve(c);
- for (i = 0; i < bc->nr_reserve; i++)
- if (!__bch2_btree_node_mem_alloc(c))
+ for (i = 0; i < bc->nr_reserve; i++) {
+ struct btree *b = __bch2_btree_node_mem_alloc(c);
+ if (!b)
goto err;
+ __bch2_btree_node_to_freelist(bc, b);
+ }
list_splice_init(&bc->live[0].list, &bc->freeable);
@@ -682,7 +682,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
return 0;
err:
- return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+ return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
}
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
@@ -727,7 +727,7 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure
if (!cl) {
trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
- return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
+ return bch_err_throw(c, ENOMEM_btree_cache_cannibalize_lock);
}
closure_wait(&bc->alloc_wait, cl);
@@ -741,7 +741,7 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure
}
trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
- return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
+ return bch_err_throw(c, btree_cache_cannibalize_lock_blocked);
success:
trace_and_count(c, btree_cache_cannibalize_lock, trans);
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index ca3c1b145330..be275f87a60e 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -30,6 +30,7 @@ void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsig
void bch2_btree_cache_cannibalize_unlock(struct btree_trans *);
int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *);
+void __btree_node_data_free(struct btree *);
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool);
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 91b6395421df..bac108e93823 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -150,7 +150,7 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
if (!new)
- return -BCH_ERR_ENOMEM_gc_repair_key;
+ return bch_err_throw(c, ENOMEM_gc_repair_key);
btree_ptr_to_v2(b, new);
b->data->min_key = new_min;
@@ -190,7 +190,7 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
if (!new)
- return -BCH_ERR_ENOMEM_gc_repair_key;
+ return bch_err_throw(c, ENOMEM_gc_repair_key);
btree_ptr_to_v2(b, new);
b->data->max_key = new_max;
@@ -397,7 +397,11 @@ again:
continue;
}
- ret = btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan);
+ ret = lockrestart_do(trans,
+ btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan));
+ if (ret < 0)
+ goto err;
+
if (ret == DID_FILL_FROM_SCAN) {
new_pass = true;
ret = 0;
@@ -438,7 +442,8 @@ again:
if (!ret && !IS_ERR_OR_NULL(prev)) {
BUG_ON(cur);
- ret = btree_repair_node_end(trans, b, prev, pulled_from_scan);
+ ret = lockrestart_do(trans,
+ btree_repair_node_end(trans, b, prev, pulled_from_scan));
if (ret == DID_FILL_FROM_SCAN) {
new_pass = true;
ret = 0;
@@ -498,8 +503,14 @@ again:
prt_newline(&buf);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ /*
+ * XXX: we're not passing the trans object here because we're not set up
+ * to handle a transaction restart - this code needs to be rewritten
+ * when we start doing online topology repair
+ */
+ bch2_trans_unlock_long(trans);
if (mustfix_fsck_err_on(!have_child,
- trans, btree_node_topology_interior_node_empty,
+ c, btree_node_topology_interior_node_empty,
"empty interior btree node at %s", buf.buf))
ret = DROP_THIS_NODE;
err:
@@ -519,49 +530,72 @@ fsck_err:
bch2_bkey_buf_exit(&prev_k, c);
bch2_bkey_buf_exit(&cur_k, c);
printbuf_exit(&buf);
+ bch_err_fn(c, ret);
return ret;
}
-int bch2_check_topology(struct bch_fs *c)
+static int bch2_check_root(struct btree_trans *trans, enum btree_id btree,
+ bool *reconstructed_root)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct bpos pulled_from_scan = POS_MIN;
+ struct bch_fs *c = trans->c;
+ struct btree_root *r = bch2_btree_id_root(c, btree);
struct printbuf buf = PRINTBUF;
int ret = 0;
- bch2_trans_srcu_unlock(trans);
+ bch2_btree_id_to_text(&buf, btree);
- for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
- struct btree_root *r = bch2_btree_id_root(c, i);
- bool reconstructed_root = false;
+ if (r->error) {
+ bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);
- printbuf_reset(&buf);
- bch2_btree_id_to_text(&buf, i);
+ ret = bch2_btree_has_scanned_nodes(c, btree);
+ if (ret < 0)
+ goto err;
- if (r->error) {
-reconstruct_root:
- bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);
+ if (!ret) {
+ __fsck_err(trans,
+ FSCK_CAN_FIX|(!btree_id_important(btree) ? FSCK_AUTOFIX : 0),
+ btree_root_unreadable_and_scan_found_nothing,
+ "no nodes found for btree %s, continue?", buf.buf);
r->alive = false;
r->error = 0;
+ bch2_btree_root_alloc_fake_trans(trans, btree, 0);
+ } else {
+ r->alive = false;
+ r->error = 0;
+ bch2_btree_root_alloc_fake_trans(trans, btree, 1);
- if (!bch2_btree_has_scanned_nodes(c, i)) {
- __fsck_err(trans,
- FSCK_CAN_FIX|(!btree_id_important(i) ? FSCK_AUTOFIX : 0),
- btree_root_unreadable_and_scan_found_nothing,
- "no nodes found for btree %s, continue?", buf.buf);
- bch2_btree_root_alloc_fake_trans(trans, i, 0);
- } else {
- bch2_btree_root_alloc_fake_trans(trans, i, 1);
- bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
- ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
- if (ret)
- break;
- }
-
- reconstructed_root = true;
+ bch2_shoot_down_journal_keys(c, btree, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ ret = bch2_get_scanned_nodes(c, btree, 0, POS_MIN, SPOS_MAX);
+ if (ret)
+ goto err;
}
+ *reconstructed_root = true;
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int bch2_check_topology(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct bpos pulled_from_scan = POS_MIN;
+ int ret = 0;
+
+ bch2_trans_srcu_unlock(trans);
+
+ for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
+ bool reconstructed_root = false;
+recover:
+ ret = lockrestart_do(trans, bch2_check_root(trans, i, &reconstructed_root));
+ if (ret)
+ break;
+
+ struct btree_root *r = bch2_btree_id_root(c, i);
struct btree *b = r->b;
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
@@ -575,17 +609,21 @@ reconstruct_root:
r->b = NULL;
- if (!reconstructed_root)
- goto reconstruct_root;
+ if (!reconstructed_root) {
+ r->error = -EIO;
+ goto recover;
+ }
+ struct printbuf buf = PRINTBUF;
+ bch2_btree_id_to_text(&buf, i);
bch_err(c, "empty btree root %s", buf.buf);
+ printbuf_exit(&buf);
bch2_btree_root_alloc_fake_trans(trans, i, 0);
r->alive = false;
ret = 0;
}
}
-fsck_err:
- printbuf_exit(&buf);
+
bch2_trans_put(trans);
return ret;
}
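lockrestart_do(trans, expr) re-evaluates expr after bch2_trans_begin()
whenever it returns a transaction_restart error, which is what lets
btree_check_node_boundaries() and bch2_check_root() be restarted safely here.
A condensed sketch of the semantics (not the actual macro body):

	int ret;
	do {
		bch2_trans_begin(trans);
		ret = (expr);
	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));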
@@ -935,7 +973,7 @@ static int bch2_gc_alloc_start(struct bch_fs *c)
ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL);
if (ret) {
bch2_dev_put(ca);
- ret = -BCH_ERR_ENOMEM_gc_alloc_start;
+ ret = bch_err_throw(c, ENOMEM_gc_alloc_start);
break;
}
}
@@ -1093,42 +1131,41 @@ static int gc_btree_gens_key(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bkey_i *u;
- int ret;
if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
return -EROFS;
- rcu_read_lock();
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- if (!ca)
- continue;
+ bool too_stale = false;
+ scoped_guard(rcu) {
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ca)
+ continue;
- if (dev_ptr_stale(ca, ptr) > 16) {
- rcu_read_unlock();
- goto update;
+ too_stale |= dev_ptr_stale(ca, ptr) > 16;
}
+
+ if (!too_stale)
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ca)
+ continue;
+
+ u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
+ if (gen_after(*gen, ptr->gen))
+ *gen = ptr->gen;
+ }
}
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- if (!ca)
- continue;
+ if (too_stale) {
+ struct bkey_i *u = bch2_bkey_make_mut(trans, iter, &k, 0);
+ int ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ return ret;
- u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
- if (gen_after(*gen, ptr->gen))
- *gen = ptr->gen;
+ bch2_extent_normalize(c, bkey_i_to_s(u));
}
- rcu_read_unlock();
- return 0;
-update:
- u = bch2_bkey_make_mut(trans, iter, &k, 0);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- return ret;
- bch2_extent_normalize(c, bkey_i_to_s(u));
return 0;
}
@@ -1181,7 +1218,7 @@ int bch2_gc_gens(struct bch_fs *c)
ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
if (!ca->oldest_gen) {
bch2_dev_put(ca);
- ret = -BCH_ERR_ENOMEM_gc_gens;
+ ret = bch_err_throw(c, ENOMEM_gc_gens);
goto err;
}
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 34018296053a..590cd29f3e86 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -557,7 +557,9 @@ static int __btree_err(int ret,
const char *fmt, ...)
{
if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
- return -BCH_ERR_fsck_fix;
+ return ret == -BCH_ERR_btree_node_read_err_fixable
+ ? bch_err_throw(c, fsck_fix)
+ : ret;
bool have_retry = false;
int ret2;
@@ -566,15 +568,15 @@ static int __btree_err(int ret,
bch2_mark_btree_validate_failure(failed, ca->dev_idx);
struct extent_ptr_decoded pick;
- have_retry = !bch2_bkey_pick_read_device(c,
+ have_retry = bch2_bkey_pick_read_device(c,
bkey_i_to_s_c(&b->key),
- failed, &pick, -1);
+ failed, &pick, -1) == 1;
}
if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
- ret = -BCH_ERR_btree_node_read_err_fixable;
+ ret = bch_err_throw(c, btree_node_read_err_fixable);
if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
- ret = -BCH_ERR_btree_node_read_err_bad_node;
+ ret = bch_err_throw(c, btree_node_read_err_bad_node);
bch2_sb_error_count(c, err_type);
@@ -602,18 +604,17 @@ static int __btree_err(int ret,
switch (ret) {
case -BCH_ERR_btree_node_read_err_fixable:
ret2 = bch2_fsck_err_opt(c, FSCK_CAN_FIX, err_type);
- if (ret2 != -BCH_ERR_fsck_fix &&
- ret2 != -BCH_ERR_fsck_ignore) {
+ if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) &&
+ !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) {
ret = ret2;
goto fsck_err;
}
if (!have_retry)
- ret = -BCH_ERR_fsck_fix;
+ ret = bch_err_throw(c, fsck_fix);
goto out;
case -BCH_ERR_btree_node_read_err_bad_node:
prt_str(&out, ", ");
- ret = __bch2_topology_error(c, &out);
break;
}
@@ -631,18 +632,17 @@ static int __btree_err(int ret,
switch (ret) {
case -BCH_ERR_btree_node_read_err_fixable:
ret2 = __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf);
- if (ret2 != -BCH_ERR_fsck_fix &&
- ret2 != -BCH_ERR_fsck_ignore) {
+ if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) &&
+ !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) {
ret = ret2;
goto fsck_err;
}
if (!have_retry)
- ret = -BCH_ERR_fsck_fix;
+ ret = bch_err_throw(c, fsck_fix);
goto out;
case -BCH_ERR_btree_node_read_err_bad_node:
prt_str(&out, ", ");
- ret = __bch2_topology_error(c, &out);
break;
}
print:
@@ -660,7 +660,7 @@ fsck_err:
failed, err_msg, \
msg, ##__VA_ARGS__); \
\
- if (_ret != -BCH_ERR_fsck_fix) { \
+ if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix)) { \
ret = _ret; \
goto fsck_err; \
} \
@@ -723,12 +723,11 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)
static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
struct btree *b, struct bset *i,
- unsigned offset, unsigned sectors, int write,
+ unsigned offset, int write,
struct bch_io_failures *failed,
struct printbuf *err_msg)
{
unsigned version = le16_to_cpu(i->version);
- unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
struct printbuf buf1 = PRINTBUF;
struct printbuf buf2 = PRINTBUF;
int ret = 0;
@@ -741,16 +740,22 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
BCH_VERSION_MAJOR(version),
BCH_VERSION_MINOR(version));
- if (btree_err_on(version < c->sb.version_min,
+ if (c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes &&
+ btree_err_on(version < c->sb.version_min,
-BCH_ERR_btree_node_read_err_fixable,
c, NULL, b, i, NULL,
btree_node_bset_older_than_sb_min,
"bset version %u older than superblock version_min %u",
version, c->sb.version_min)) {
- mutex_lock(&c->sb_lock);
- c->disk_sb.sb->version_min = cpu_to_le16(version);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
+ if (bch2_version_compatible(version)) {
+ mutex_lock(&c->sb_lock);
+ c->disk_sb.sb->version_min = cpu_to_le16(version);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ } else {
+ /* We have no idea what's going on: */
+ i->version = cpu_to_le16(c->sb.version);
+ }
}
if (btree_err_on(BCH_VERSION_MAJOR(version) >
@@ -772,15 +777,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
btree_node_unsupported_version,
"BSET_SEPARATE_WHITEOUTS no longer supported");
- if (!write &&
- btree_err_on(offset + sectors > (ptr_written ?: btree_sectors(c)),
- -BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i, NULL,
- bset_past_end_of_btree_node,
- "bset past end of btree node (offset %u len %u but written %zu)",
- offset, sectors, ptr_written ?: btree_sectors(c)))
- i->u64s = 0;
-
btree_err_on(offset && !i->u64s,
-BCH_ERR_btree_node_read_err_fixable,
c, ca, b, i, NULL,
@@ -1045,6 +1041,7 @@ got_good_key:
le16_add_cpu(&i->u64s, -next_good_key);
memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k);
set_btree_node_need_rewrite(b);
+ set_btree_node_need_rewrite_error(b);
}
fsck_err:
printbuf_exit(&buf);
@@ -1144,6 +1141,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
"unknown checksum type %llu", BSET_CSUM_TYPE(i));
if (first) {
+ sectors = vstruct_sectors(b->data, c->block_bits);
+ if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i, NULL,
+ bset_past_end_of_btree_node,
+ "bset past end of btree node (offset %u len %u but written %zu)",
+ b->written, sectors, ptr_written ?: btree_sectors(c)))
+ i->u64s = 0;
if (good_csum_type) {
struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
bool csum_bad = bch2_crc_cmp(b->data->csum, csum);
@@ -1171,9 +1176,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
c, NULL, b, NULL, NULL,
btree_node_unsupported_version,
"btree node does not have NEW_EXTENT_OVERWRITE set");
-
- sectors = vstruct_sectors(b->data, c->block_bits);
} else {
+ sectors = vstruct_sectors(bne, c->block_bits);
+ if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i, NULL,
+ bset_past_end_of_btree_node,
+ "bset past end of btree node (offset %u len %u but written %zu)",
+ b->written, sectors, ptr_written ?: btree_sectors(c)))
+ i->u64s = 0;
if (good_csum_type) {
struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
bool csum_bad = bch2_crc_cmp(bne->csum, csum);
@@ -1194,14 +1205,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
"decrypting btree node: %s", bch2_err_str(ret)))
goto fsck_err;
}
-
- sectors = vstruct_sectors(bne, c->block_bits);
}
b->version_ondisk = min(b->version_ondisk,
le16_to_cpu(i->version));
- ret = validate_bset(c, ca, b, i, b->written, sectors, READ, failed, err_msg);
+ ret = validate_bset(c, ca, b, i, b->written, READ, failed, err_msg);
if (ret)
goto fsck_err;
@@ -1286,9 +1295,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);
- if (updated_range)
- bch2_btree_node_drop_keys_outside_node(b);
-
i = &b->data->keys;
for (k = i->start; k != vstruct_last(i);) {
struct bkey tmp;
@@ -1305,6 +1311,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
(u64 *) vstruct_end(i) - (u64 *) k);
set_btree_bset_end(b, b->set);
set_btree_node_need_rewrite(b);
+ set_btree_node_need_rewrite_error(b);
continue;
}
if (ret)
@@ -1325,17 +1332,50 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_node_reset_sib_u64s(b);
- rcu_read_lock();
- bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
- struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
+ if (updated_range)
+ bch2_btree_node_drop_keys_outside_node(b);
- if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw)
- set_btree_node_need_rewrite(b);
+ /*
+ * XXX:
+ *
+ * We deadlock if too many btree updates require node rewrites while
+ * we're still in journal replay.
+ *
+	 * This is because btree node rewrites generate more updates from the
+	 * interior update path (alloc, backpointers), and if those updates touch
+ * new nodes and generate more rewrites - well, you see the problem.
+ *
+	 * The biggest cause is that we don't use the btree write buffer (for
+	 * the backpointer updates) - this needs some real thought on locking
+	 * in order to fix.
+ *
+ * The problem with this workaround (not doing the rewrite for degraded
+ * nodes in journal replay) is that those degraded nodes persist, and we
+ * don't want that (this is a real bug when a btree node write completes
+ * with fewer replicas than we wanted and leaves a degraded node due to
+ * device _removal_, i.e. the device went away mid write).
+ *
+ * It's less of a bug here, but still a problem because we don't yet
+	 * have a way of tracking degraded data - we need another index (all
+ * extents/btree nodes, by replicas entry) in order to fix properly
+ * (re-replicate degraded data at the earliest possible time).
+ */
+ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) {
+ scoped_guard(rcu)
+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
+ struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
+
+ if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
+ set_btree_node_need_rewrite(b);
+ set_btree_node_need_rewrite_degraded(b);
+ }
+ }
}
- rcu_read_unlock();
- if (!ptr_written)
+ if (!ptr_written) {
set_btree_node_need_rewrite(b);
+ set_btree_node_need_rewrite_ptr_written_zero(b);
+ }
fsck_err:
mempool_free(iter, &c->fill_iter);
printbuf_exit(&buf);
@@ -1366,7 +1406,7 @@ static void btree_node_read_work(struct work_struct *work)
ret = bch2_bkey_pick_read_device(c,
bkey_i_to_s_c(&b->key),
&failed, &rb->pick, -1);
- if (ret) {
+ if (ret <= 0) {
set_btree_node_read_error(b);
break;
}
@@ -1688,7 +1728,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
ra = kzalloc(sizeof(*ra), GFP_NOFS);
if (!ra)
- return -BCH_ERR_ENOMEM_btree_node_read_all_replicas;
+ return bch_err_throw(c, ENOMEM_btree_node_read_all_replicas);
closure_init(&ra->cl, NULL);
ra->c = c;
@@ -1870,7 +1910,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_unlock(&c->btree_cache.lock);
- ret = -BCH_ERR_btree_node_read_error;
+ ret = bch_err_throw(c, btree_node_read_error);
goto err;
}
@@ -1971,28 +2011,12 @@ static void btree_node_scrub_work(struct work_struct *work)
prt_newline(&err);
if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) {
- struct btree_trans *trans = bch2_trans_get(c);
-
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, scrub->btree,
- scrub->key.k->k.p, 0, scrub->level - 1, 0);
-
- struct btree *b;
- int ret = lockrestart_do(trans,
- PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(trans, &iter)));
- if (ret)
- goto err;
-
- if (bkey_i_to_btree_ptr_v2(&b->key)->v.seq == scrub->seq) {
- bch_err(c, "error validating btree node during scrub on %s at btree %s",
- scrub->ca->name, err.buf);
-
- ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0);
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_begin(trans);
- bch2_trans_put(trans);
+ int ret = bch2_trans_do(c,
+ bch2_btree_node_rewrite_key(trans, scrub->btree, scrub->level - 1,
+ scrub->key.k, 0));
+ if (!bch2_err_matches(ret, ENOENT) &&
+ !bch2_err_matches(ret, EROFS))
+ bch_err_fn_ratelimited(c, ret);
}
printbuf_exit(&err);
@@ -2020,7 +2044,7 @@ int bch2_btree_node_scrub(struct btree_trans *trans,
struct bch_fs *c = trans->c;
if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_node_scrub))
- return -BCH_ERR_erofs_no_writes;
+ return bch_err_throw(c, erofs_no_writes);
struct extent_ptr_decoded pick;
int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev);
@@ -2030,7 +2054,7 @@ int bch2_btree_node_scrub(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
BCH_DEV_READ_REF_btree_node_scrub);
if (!ca) {
- ret = -BCH_ERR_device_offline;
+ ret = bch_err_throw(c, device_offline);
goto err;
}
@@ -2167,7 +2191,7 @@ static void btree_node_write_work(struct work_struct *work)
bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
- ret = -BCH_ERR_btree_node_write_all_failed;
+ ret = bch_err_throw(c, btree_node_write_all_failed);
goto err;
}
@@ -2256,7 +2280,7 @@ static void btree_node_write_endio(struct bio *bio)
}
static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
- struct bset *i, unsigned sectors)
+ struct bset *i)
{
int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key),
(struct bkey_validate_context) {
@@ -2271,7 +2295,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
}
ret = validate_bset_keys(c, b, i, WRITE, NULL, NULL) ?:
- validate_bset(c, NULL, b, i, b->written, sectors, WRITE, NULL, NULL);
+ validate_bset(c, NULL, b, i, b->written, WRITE, NULL, NULL);
if (ret) {
bch2_inconsistent_error(c);
dump_stack();
@@ -2464,7 +2488,7 @@ do_write:
/* if we're going to be encrypting, check metadata validity first: */
if (validate_before_checksum &&
- validate_bset_for_write(c, b, i, sectors_to_write))
+ validate_bset_for_write(c, b, i))
goto err;
ret = bset_encrypt(c, i, b->written << 9);
@@ -2481,7 +2505,7 @@ do_write:
/* if we're not encrypting, check metadata after checksumming: */
if (!validate_before_checksum &&
- validate_bset_for_write(c, b, i, sectors_to_write))
+ validate_bset_for_write(c, b, i))
goto err;
/*
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index b4bf4217a3fa..f8829b667ad3 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -890,8 +890,7 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
struct btree_path *path,
- unsigned flags,
- struct bkey_buf *out)
+ unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_path_level *l = path_l(path);
@@ -915,7 +914,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
goto err;
}
- bch2_bkey_buf_reassemble(out, c, k);
+ bkey_reassemble(&trans->btree_path_down, k);
if ((flags & BTREE_ITER_prefetch) &&
c->opts.btree_node_prefetch)
@@ -926,6 +925,22 @@ err:
return ret;
}
+static noinline_for_stack int btree_node_missing_err(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "node not found at pos ");
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_str(&buf, " within parent node ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&path_l(path)->b->key));
+
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ return bch_err_throw(c, btree_need_topology_repair);
+}
+
static __always_inline int btree_path_down(struct btree_trans *trans,
struct btree_path *path,
unsigned flags,
@@ -936,51 +951,38 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
struct btree *b;
unsigned level = path->level - 1;
enum six_lock_type lock_type = __btree_lock_want(path, level);
- struct bkey_buf tmp;
int ret;
EBUG_ON(!btree_node_locked(path, path->level));
- bch2_bkey_buf_init(&tmp);
-
if (unlikely(trans->journal_replay_not_finished)) {
- ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
+ ret = btree_node_iter_and_journal_peek(trans, path, flags);
if (ret)
- goto err;
+ return ret;
} else {
struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b);
- if (!k) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "node not found at pos ");
- bch2_bpos_to_text(&buf, path->pos);
- prt_str(&buf, " within parent node ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key));
-
- bch2_fs_fatal_error(c, "%s", buf.buf);
- printbuf_exit(&buf);
- ret = -BCH_ERR_btree_need_topology_repair;
- goto err;
- }
+ if (unlikely(!k))
+ return btree_node_missing_err(trans, path);
- bch2_bkey_buf_unpack(&tmp, c, l->b, k);
+ bch2_bkey_unpack(l->b, &trans->btree_path_down, k);
- if ((flags & BTREE_ITER_prefetch) &&
+ if (unlikely((flags & BTREE_ITER_prefetch)) &&
c->opts.btree_node_prefetch) {
ret = btree_path_prefetch(trans, path);
if (ret)
- goto err;
+ return ret;
}
}
- b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
+ b = bch2_btree_node_get(trans, path, &trans->btree_path_down,
+ level, lock_type, trace_ip);
ret = PTR_ERR_OR_ZERO(b);
if (unlikely(ret))
- goto err;
+ return ret;
- if (likely(!trans->journal_replay_not_finished &&
- tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
- unlikely(b != btree_node_mem_ptr(tmp.k)))
+ if (unlikely(b != btree_node_mem_ptr(&trans->btree_path_down)) &&
+ likely(!trans->journal_replay_not_finished &&
+ trans->btree_path_down.k.type == KEY_TYPE_btree_ptr_v2))
btree_node_mem_ptr_set(trans, path, level + 1, b);
if (btree_node_read_locked(path, level + 1))
@@ -992,9 +994,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
bch2_btree_path_level_init(trans, path, b);
bch2_btree_path_verify_locks(trans, path);
-err:
- bch2_bkey_buf_exit(&tmp, c);
- return ret;
+ return 0;
}
static int bch2_btree_path_traverse_all(struct btree_trans *trans)
@@ -1006,7 +1006,7 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans)
int ret = 0;
if (trans->in_traverse_all)
- return -BCH_ERR_transaction_restart_in_traverse_all;
+ return bch_err_throw(c, transaction_restart_in_traverse_all);
trans->in_traverse_all = true;
retry_all:
@@ -2076,14 +2076,14 @@ inline bool bch2_btree_iter_rewind(struct btree_trans *trans, struct btree_iter
static noinline
void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c *k)
+ struct bpos search_key, struct bkey_s_c *k)
{
struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key;
trans_for_each_update(trans, i)
if (!i->key_cache_already_flushed &&
i->btree_id == iter->btree_id &&
- bpos_le(i->k->k.p, iter->pos) &&
+ bpos_le(i->k->k.p, search_key) &&
bpos_ge(i->k->k.p, k->k ? k->k->p : end)) {
iter->k = i->k->k;
*k = bkey_i_to_s_c(i->k);
@@ -2092,6 +2092,7 @@ void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_
static noinline
void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos search_key,
struct bkey_s_c *k)
{
struct btree_path *path = btree_iter_path(trans, iter);
@@ -2100,7 +2101,7 @@ void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter
trans_for_each_update(trans, i)
if (!i->key_cache_already_flushed &&
i->btree_id == iter->btree_id &&
- bpos_ge(i->k->k.p, path->pos) &&
+ bpos_ge(i->k->k.p, search_key) &&
bpos_le(i->k->k.p, k->k ? k->k->p : end)) {
iter->k = i->k->k;
*k = bkey_i_to_s_c(i->k);
@@ -2122,13 +2123,14 @@ void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_
static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
struct btree_iter *iter,
+ struct bpos search_pos,
struct bpos end_pos)
{
struct btree_path *path = btree_iter_path(trans, iter);
return bch2_journal_keys_peek_max(trans->c, iter->btree_id,
path->level,
- path->pos,
+ search_pos,
end_pos,
&iter->journal_idx);
}
@@ -2138,7 +2140,7 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
struct btree_iter *iter)
{
struct btree_path *path = btree_iter_path(trans, iter);
- struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos);
+ struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos, path->pos);
if (k) {
iter->k = k->k;
@@ -2151,11 +2153,12 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
static noinline
void btree_trans_peek_journal(struct btree_trans *trans,
struct btree_iter *iter,
+ struct bpos search_key,
struct bkey_s_c *k)
{
struct btree_path *path = btree_iter_path(trans, iter);
struct bkey_i *next_journal =
- bch2_btree_journal_peek(trans, iter,
+ bch2_btree_journal_peek(trans, iter, search_key,
k->k ? k->k->p : path_l(path)->b->key.k.p);
if (next_journal) {
iter->k = next_journal->k;
@@ -2165,13 +2168,14 @@ void btree_trans_peek_journal(struct btree_trans *trans,
static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
struct btree_iter *iter,
+ struct bpos search_key,
struct bpos end_pos)
{
struct btree_path *path = btree_iter_path(trans, iter);
return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id,
path->level,
- path->pos,
+ search_key,
end_pos,
&iter->journal_idx);
}
@@ -2179,12 +2183,13 @@ static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
static noinline
void btree_trans_peek_prev_journal(struct btree_trans *trans,
struct btree_iter *iter,
+ struct bpos search_key,
struct bkey_s_c *k)
{
struct btree_path *path = btree_iter_path(trans, iter);
struct bkey_i *next_journal =
- bch2_btree_journal_peek_prev(trans, iter,
- k->k ? k->k->p : path_l(path)->b->key.k.p);
+ bch2_btree_journal_peek_prev(trans, iter, search_key,
+ k->k ? k->k->p : path_l(path)->b->data->min_key);
if (next_journal) {
iter->k = next_journal->k;
@@ -2292,11 +2297,11 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct
}
if (unlikely(iter->flags & BTREE_ITER_with_journal))
- btree_trans_peek_journal(trans, iter, &k);
+ btree_trans_peek_journal(trans, iter, search_key, &k);
if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
trans->nr_updates))
- bch2_btree_trans_peek_updates(trans, iter, &k);
+ bch2_btree_trans_peek_updates(trans, iter, search_key, &k);
if (k.k && bkey_deleted(k.k)) {
/*
@@ -2326,6 +2331,20 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct
}
bch2_btree_iter_verify(trans, iter);
+
+ if (trace___btree_iter_peek_enabled()) {
+ CLASS(printbuf, buf)();
+
+ int ret = bkey_err(k);
+ if (ret)
+ prt_str(&buf, bch2_err_str(ret));
+ else if (k.k)
+ bch2_bkey_val_to_text(&buf, trans->c, k);
+ else
+ prt_str(&buf, "(null)");
+ trace___btree_iter_peek(trans->c, buf.buf);
+ }
+
return k;
}
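The trace_<event>_enabled() test compiles down to a static-key branch, so the
printbuf formatting below it costs nothing while the tracepoint is off; the
same guard pattern repeats for peek_max, peek_prev_min and peek_slot in the
following hunks. A condensed sketch (assuming standard tracepoint static
keys):

	if (trace___btree_iter_peek_enabled()) {	/* ~free when disabled */
		CLASS(printbuf, buf)();		/* printbuf_exit() on scope exit */
		bch2_bkey_val_to_text(&buf, trans->c, k);
		trace___btree_iter_peek(trans->c, buf.buf);
	}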
@@ -2484,6 +2503,19 @@ out_no_locked:
bch2_btree_iter_verify_entry_exit(iter);
+ if (trace_btree_iter_peek_max_enabled()) {
+ CLASS(printbuf, buf)();
+
+ int ret = bkey_err(k);
+ if (ret)
+ prt_str(&buf, bch2_err_str(ret));
+ else if (k.k)
+ bch2_bkey_val_to_text(&buf, trans->c, k);
+ else
+ prt_str(&buf, "(null)");
+ trace_btree_iter_peek_max(trans->c, buf.buf);
+ }
+
return k;
end:
bch2_btree_iter_set_pos(trans, iter, end);
@@ -2557,11 +2589,11 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, st
}
if (unlikely(iter->flags & BTREE_ITER_with_journal))
- btree_trans_peek_prev_journal(trans, iter, &k);
+ btree_trans_peek_prev_journal(trans, iter, search_key, &k);
if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
trans->nr_updates))
- bch2_btree_trans_peek_prev_updates(trans, iter, &k);
+ bch2_btree_trans_peek_prev_updates(trans, iter, search_key, &k);
if (likely(k.k && !bkey_deleted(k.k))) {
break;
@@ -2724,6 +2756,19 @@ out_no_locked:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(trans, iter);
+
+ if (trace_btree_iter_peek_prev_min_enabled()) {
+ CLASS(printbuf, buf)();
+
+ int ret = bkey_err(k);
+ if (ret)
+ prt_str(&buf, bch2_err_str(ret));
+ else if (k.k)
+ bch2_bkey_val_to_text(&buf, trans->c, k);
+ else
+ prt_str(&buf, "(null)");
+ trace_btree_iter_peek_prev_min(trans->c, buf.buf);
+ }
return k;
end:
bch2_btree_iter_set_pos(trans, iter, end);
@@ -2767,8 +2812,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre
/* extents can't span inode numbers: */
if ((iter->flags & BTREE_ITER_is_extents) &&
unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
- if (iter->pos.inode == KEY_INODE_MAX)
- return bkey_s_c_null;
+ if (iter->pos.inode == KEY_INODE_MAX) {
+ k = bkey_s_c_null;
+ goto out2;
+ }
bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos));
}
@@ -2785,8 +2832,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre
}
struct btree_path *path = btree_iter_path(trans, iter);
- if (unlikely(!btree_path_node(path, path->level)))
- return bkey_s_c_null;
+ if (unlikely(!btree_path_node(path, path->level))) {
+ k = bkey_s_c_null;
+ goto out2;
+ }
btree_path_set_should_be_locked(trans, path);
@@ -2879,7 +2928,20 @@ out:
bch2_btree_iter_verify(trans, iter);
ret = bch2_btree_iter_verify_ret(trans, iter, k);
if (unlikely(ret))
- return bkey_s_c_err(ret);
+ k = bkey_s_c_err(ret);
+out2:
+ if (trace_btree_iter_peek_slot_enabled()) {
+ CLASS(printbuf, buf)();
+
+ int ret = bkey_err(k);
+ if (ret)
+ prt_str(&buf, bch2_err_str(ret));
+ else if (k.k)
+ bch2_bkey_val_to_text(&buf, trans->c, k);
+ else
+ prt_str(&buf, "(null)");
+ trace_btree_iter_peek_slot(trans->c, buf.buf);
+ }
return k;
}
@@ -3132,6 +3194,10 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long
if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) {
#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+ prt_printf(&buf, "bump allocator exceeded BTREE_TRANS_MEM_MAX (%u)\n",
+ BTREE_TRANS_MEM_MAX);
+
bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace);
bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
@@ -3159,46 +3225,32 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long
mutex_unlock(&s->lock);
}
- if (trans->used_mempool) {
- if (trans->mem_bytes >= new_bytes)
- goto out_change_top;
-
- /* No more space from mempool item, need malloc new one */
- new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN);
- if (unlikely(!new_mem)) {
- bch2_trans_unlock(trans);
-
- new_mem = kmalloc(new_bytes, GFP_KERNEL);
- if (!new_mem)
- return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
+ if (trans->used_mempool || new_bytes > BTREE_TRANS_MEM_MAX) {
+ EBUG_ON(trans->mem_bytes >= new_bytes);
+ return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
+ }
- ret = bch2_trans_relock(trans);
- if (ret) {
- kfree(new_mem);
- return ERR_PTR(ret);
- }
- }
- memcpy(new_mem, trans->mem, trans->mem_top);
- trans->used_mempool = false;
- mempool_free(trans->mem, &c->btree_trans_mem_pool);
- goto out_new_mem;
+ if (old_bytes) {
+ trans->realloc_bytes_required = new_bytes;
+ trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
+ return ERR_PTR(btree_trans_restart_ip(trans,
+ BCH_ERR_transaction_restart_mem_realloced, _RET_IP_));
}
- new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
+ EBUG_ON(trans->mem);
+
+ new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN);
if (unlikely(!new_mem)) {
bch2_trans_unlock(trans);
- new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
+ new_mem = kmalloc(new_bytes, GFP_KERNEL);
if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
new_bytes = BTREE_TRANS_MEM_MAX;
- memcpy(new_mem, trans->mem, trans->mem_top);
trans->used_mempool = true;
- kfree(trans->mem);
}
- if (!new_mem)
- return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
+ EBUG_ON(!new_mem);
trans->mem = new_mem;
trans->mem_bytes = new_bytes;
@@ -3207,18 +3259,10 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long
if (ret)
return ERR_PTR(ret);
}
-out_new_mem:
+
trans->mem = new_mem;
trans->mem_bytes = new_bytes;
- if (old_bytes) {
- trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
- return ERR_PTR(btree_trans_restart_ip(trans,
- BCH_ERR_transaction_restart_mem_realloced, _RET_IP_));
- }
-out_change_top:
- bch2_trans_kmalloc_trace(trans, size, ip);
-
p = trans->mem + trans->mem_top;
trans->mem_top += size;
memset(p, 0, size);
@@ -3279,6 +3323,27 @@ u32 bch2_trans_begin(struct btree_trans *trans)
trans->restart_count++;
trans->mem_top = 0;
+ if (trans->restarted == BCH_ERR_transaction_restart_mem_realloced) {
+ EBUG_ON(!trans->mem || !trans->mem_bytes);
+ unsigned new_bytes = trans->realloc_bytes_required;
+ void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
+ if (unlikely(!new_mem)) {
+ bch2_trans_unlock(trans);
+ new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
+
+ EBUG_ON(new_bytes > BTREE_TRANS_MEM_MAX);
+
+ if (!new_mem) {
+ new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
+ new_bytes = BTREE_TRANS_MEM_MAX;
+ trans->used_mempool = true;
+ kfree(trans->mem);
+ }
+ }
+ trans->mem = new_mem;
+ trans->mem_bytes = new_bytes;
+ }
+
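Growing the bump allocator is now two-phase; a sketch of the protocol as
inferred from this hunk and the __bch2_trans_kmalloc() hunk above:

	/*
	 * __bch2_trans_kmalloc(trans, big):
	 *	trans->realloc_bytes_required = new_bytes;
	 *	restart with -BCH_ERR_transaction_restart_mem_realloced
	 *
	 * caller's lockrestart_do() loop:
	 *	bch2_trans_begin() kreallocs trans->mem to the recorded size
	 *	while nothing holds pointers into the old buffer, then the
	 *	transaction body re-runs with the larger allocation.
	 */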
trans_for_each_path(trans, path, i) {
path->should_be_locked = false;
@@ -3568,13 +3633,12 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
struct btree_bkey_cached_common *b)
{
struct six_lock_count c = six_lock_counts(&b->lock);
- struct task_struct *owner;
pid_t pid;
- rcu_read_lock();
- owner = READ_ONCE(b->lock.owner);
- pid = owner ? owner->pid : 0;
- rcu_read_unlock();
+ scoped_guard(rcu) {
+ struct task_struct *owner = READ_ONCE(b->lock.owner);
+ pid = owner ? owner->pid : 0;
+ }
prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b');
bch2_btree_id_to_text(out, b->btree_id);
@@ -3603,7 +3667,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn);
/* trans->paths is rcu protected vs. freeing */
- rcu_read_lock();
+ guard(rcu)();
out->atomic++;
struct btree_path *paths = rcu_dereference(trans->paths);
@@ -3646,7 +3710,6 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
}
out:
--out->atomic;
- rcu_read_unlock();
}
void bch2_fs_btree_iter_exit(struct bch_fs *c)
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 2cabb5f0f484..09dd3e52622e 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -963,16 +963,6 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *,
_p; \
})
-#define bch2_trans_run(_c, _do) \
-({ \
- struct btree_trans *trans = bch2_trans_get(_c); \
- int _ret = (_do); \
- bch2_trans_put(trans); \
- _ret; \
-})
-
-#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do))
-
struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
void bch2_trans_put(struct btree_trans *);
@@ -990,6 +980,27 @@ unsigned bch2_trans_get_fn_idx(const char *);
__bch2_trans_get(_c, trans_fn_idx); \
})
+/*
+ * We don't use DEFINE_CLASS() because using a function for the constructor
+ * breaks bch2_trans_get()'s use of __func__
+ */
+typedef struct btree_trans * class_btree_trans_t;
+static inline void class_btree_trans_destructor(struct btree_trans **p)
+{
+ struct btree_trans *trans = *p;
+ bch2_trans_put(trans);
+}
+
+#define class_btree_trans_constructor(_c) bch2_trans_get(_c)
+
+#define bch2_trans_run(_c, _do) \
+({ \
+ CLASS(btree_trans, trans)(_c); \
+ (_do); \
+})
+
+#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do))
+
void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *);
void bch2_fs_btree_iter_exit(struct bch_fs *);
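
The comment above is worth unpacking: bch2_trans_get() is a macro that records __func__, so every transaction is tagged with the function that created it. DEFINE_CLASS() would route construction through a helper function, where __func__ would expand to the helper's own name instead of the caller's. A small standalone sketch of the distinction, using hypothetical names:

#include <stdio.h>

static const char *get_named(const char *fn)
{
	return fn;			/* the real code stashes this for debugging */
}

/* Macro: __func__ expands at the call site, naming the caller. */
#define get_by_macro()		get_named(__func__)

/* Function: __func__ expands here, always naming "get_by_func". */
static const char *get_by_func(void)
{
	return get_named(__func__);
}

int main(void)
{
	printf("%s vs %s\n", get_by_macro(), get_by_func());
	/* prints "main vs get_by_func" */
	return 0;
}
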
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index ade3b5addd75..ea839560a136 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -137,12 +137,15 @@ struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id b
struct journal_key *k;
BUG_ON(*idx > keys->nr);
+
+ if (!keys->nr)
+ return NULL;
search:
if (!*idx)
*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
- while (*idx &&
- __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
+ while (*idx < keys->nr &&
+ __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx)) >= 0) {
(*idx)++;
iters++;
if (iters == 10) {
@@ -151,18 +154,23 @@ search:
}
}
+ if (*idx == keys->nr)
+ --(*idx);
+
struct bkey_i *ret = NULL;
rcu_read_lock(); /* for overwritten_ranges */
- while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
+ while (true) {
+ k = idx_to_key(keys, *idx);
if (__journal_key_cmp(btree_id, level, end_pos, k) > 0)
break;
if (k->overwritten) {
if (k->overwritten_range)
- *idx = rcu_dereference(k->overwritten_range)->start - 1;
- else
- *idx -= 1;
+ *idx = rcu_dereference(k->overwritten_range)->start;
+ if (!*idx)
+ break;
+ --(*idx);
continue;
}
@@ -171,6 +179,8 @@ search:
break;
}
+ if (!*idx)
+ break;
--(*idx);
iters++;
if (iters == 10) {
@@ -292,7 +302,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
if (!new_keys.data) {
bch_err(c, "%s: error allocating new key array (size %zu)",
__func__, new_keys.size);
- return -BCH_ERR_ENOMEM_journal_key_insert;
+ return bch_err_throw(c, ENOMEM_journal_key_insert);
}
/* Since @keys was full, there was no gap: */
@@ -331,7 +341,7 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
if (!n)
- return -BCH_ERR_ENOMEM_journal_key_insert;
+ return bch_err_throw(c, ENOMEM_journal_key_insert);
bkey_copy(n, k);
ret = bch2_journal_key_insert_take(c, id, level, n);
@@ -457,11 +467,9 @@ static void bch2_journal_iter_advance(struct journal_iter *iter)
static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
- struct bkey_s_c ret = bkey_s_c_null;
-
journal_iter_verify(iter);
- rcu_read_lock();
+ guard(rcu)();
while (iter->idx < iter->keys->size) {
struct journal_key *k = iter->keys->data + iter->idx;
@@ -470,19 +478,16 @@ static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
break;
BUG_ON(cmp);
- if (!k->overwritten) {
- ret = bkey_i_to_s_c(k->k);
- break;
- }
+ if (!k->overwritten)
+ return bkey_i_to_s_c(k->k);
if (k->overwritten_range)
iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end);
else
bch2_journal_iter_advance(iter);
}
- rcu_read_unlock();
- return ret;
+ return bkey_s_c_null;
}
static void bch2_journal_iter_exit(struct journal_iter *iter)
@@ -646,10 +651,11 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
{
const struct journal_key *l = _l;
const struct journal_key *r = _r;
+ int rewind = l->rewind && r->rewind ? -1 : 1;
return journal_key_cmp(l, r) ?:
- cmp_int(l->journal_seq, r->journal_seq) ?:
- cmp_int(l->journal_offset, r->journal_offset);
+ ((cmp_int(l->journal_seq, r->journal_seq) ?:
+ cmp_int(l->journal_offset, r->journal_offset)) * rewind);
}
void bch2_journal_keys_put(struct bch_fs *c)
@@ -718,6 +724,8 @@ int bch2_journal_keys_sort(struct bch_fs *c)
struct journal_keys *keys = &c->journal_keys;
size_t nr_read = 0;
+ u64 rewind_seq = c->opts.journal_rewind ?: U64_MAX;
+
genradix_for_each(&c->journal_entries, iter, _i) {
i = *_i;
@@ -726,28 +734,43 @@ int bch2_journal_keys_sort(struct bch_fs *c)
cond_resched();
- for_each_jset_key(k, entry, &i->j) {
- struct journal_key n = (struct journal_key) {
- .btree_id = entry->btree_id,
- .level = entry->level,
- .k = k,
- .journal_seq = le64_to_cpu(i->j.seq),
- .journal_offset = k->_data - i->j._data,
- };
-
- if (darray_push(keys, n)) {
- __journal_keys_sort(keys);
-
- if (keys->nr * 8 > keys->size * 7) {
- bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
- keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
- return -BCH_ERR_ENOMEM_journal_keys_sort;
+ vstruct_for_each(&i->j, entry) {
+ bool rewind = !entry->level &&
+ !btree_id_is_alloc(entry->btree_id) &&
+ le64_to_cpu(i->j.seq) >= rewind_seq;
+
+ if (entry->type != (rewind
+ ? BCH_JSET_ENTRY_overwrite
+ : BCH_JSET_ENTRY_btree_keys))
+ continue;
+
+ if (!rewind && le64_to_cpu(i->j.seq) < c->journal_replay_seq_start)
+ continue;
+
+ jset_entry_for_each_key(entry, k) {
+ struct journal_key n = (struct journal_key) {
+ .btree_id = entry->btree_id,
+ .level = entry->level,
+ .rewind = rewind,
+ .k = k,
+ .journal_seq = le64_to_cpu(i->j.seq),
+ .journal_offset = k->_data - i->j._data,
+ };
+
+ if (darray_push(keys, n)) {
+ __journal_keys_sort(keys);
+
+ if (keys->nr * 8 > keys->size * 7) {
+ bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
+ keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
+ return bch_err_throw(c, ENOMEM_journal_keys_sort);
+ }
+
+ BUG_ON(darray_push(keys, n));
}
- BUG_ON(darray_push(keys, n));
+ nr_read++;
}
-
- nr_read++;
}
}
diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h
index 8b773823704f..86aacb254fb2 100644
--- a/fs/bcachefs/btree_journal_iter_types.h
+++ b/fs/bcachefs/btree_journal_iter_types.h
@@ -11,8 +11,9 @@ struct journal_key {
u32 journal_offset;
enum btree_id btree_id:8;
unsigned level:8;
- bool allocated;
- bool overwritten;
+ bool allocated:1;
+ bool overwritten:1;
+ bool rewind:1;
struct journal_key_range_overwritten __rcu *
overwritten_range;
struct bkey_i *k;
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 9da950e7eb7d..d96188b92db2 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -187,27 +187,23 @@ lock:
static struct bkey_cached *
bkey_cached_reuse(struct btree_key_cache *c)
{
- struct bucket_table *tbl;
+
+ guard(rcu)();
+ struct bucket_table *tbl = rht_dereference_rcu(c->table.tbl, &c->table);
struct rhash_head *pos;
struct bkey_cached *ck;
- unsigned i;
- rcu_read_lock();
- tbl = rht_dereference_rcu(c->table.tbl, &c->table);
- for (i = 0; i < tbl->size; i++)
+ for (unsigned i = 0; i < tbl->size; i++)
rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
bkey_cached_lock_for_evict(ck)) {
if (bkey_cached_evict(c, ck))
- goto out;
+ return ck;
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
}
}
- ck = NULL;
-out:
- rcu_read_unlock();
- return ck;
+ return NULL;
}
static int btree_key_cache_create(struct btree_trans *trans,
@@ -242,7 +238,7 @@ static int btree_key_cache_create(struct btree_trans *trans,
if (unlikely(!ck)) {
bch_err(c, "error allocating memory for key cache item, btree %s",
bch2_btree_id_str(ck_path->btree_id));
- return -BCH_ERR_ENOMEM_btree_key_cache_create;
+ return bch_err_throw(c, ENOMEM_btree_key_cache_create);
}
}
@@ -260,7 +256,7 @@ static int btree_key_cache_create(struct btree_trans *trans,
if (unlikely(!new_k)) {
bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
bch2_btree_id_str(ck->key.btree_id), key_u64s);
- ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
+ ret = bch_err_throw(c, ENOMEM_btree_key_cache_fill);
} else if (ret) {
kfree(new_k);
goto err;
@@ -826,20 +822,20 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
bc->nr_pending = alloc_percpu(size_t);
if (!bc->nr_pending)
- return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+ return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) ||
rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free))
- return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+ return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
- return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+ return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
bc->table_init_done = true;
shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
if (!shrink)
- return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+ return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
bc->shrink = shrink;
shrink->count_objects = bch2_btree_key_cache_count;
shrink->scan_objects = bch2_btree_key_cache_scan;
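
A large fraction of this patch converts bare "return -BCH_ERR_foo;" statements to bch_err_throw(c, foo). The win is that the throw site has the filesystem in hand, giving one central place to count or trace where errors originate. A hedged sketch of the general shape, with hypothetical names (err_throw(), struct fs), not bcachefs's actual macro:

#include <stdio.h>

struct fs { unsigned long err_count; };

#define err_throw(fs, errcode) ({				\
	(fs)->err_count++;					\
	fprintf(stderr, "%s: throwing %s\n", __func__, #errcode); \
	-(errcode);						\
})

enum { ENOMEM_example = 12 };

static int alloc_thing(struct fs *fs)
{
	return err_throw(fs, ENOMEM_example);
}

int main(void)
{
	struct fs fs = { 0 };

	return alloc_thing(&fs) == -ENOMEM_example ? 0 : 1;
}
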
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index 2f2aed0c9916..bed2b4b6ffb9 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -194,6 +194,30 @@ static int btree_trans_abort_preference(struct btree_trans *trans)
return 3;
}
+static noinline __noreturn void break_cycle_fail(struct lock_graph *g)
+{
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+
+ prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
+
+ for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++) {
+ struct btree_trans *trans = i->trans;
+
+ bch2_btree_trans_to_text(&buf, trans);
+
+ prt_printf(&buf, "backtrace:\n");
+ printbuf_indent_add(&buf, 2);
+ bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
+ printbuf_indent_sub(&buf, 2);
+ prt_newline(&buf);
+ }
+
+ bch2_print_str(g->g->trans->c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ BUG();
+}
+
static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle,
struct trans_waiting_for_lock *from)
{
@@ -219,28 +243,8 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle,
}
}
- if (unlikely(!best)) {
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
-
- prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
-
- for (i = g->g; i < g->g + g->nr; i++) {
- struct btree_trans *trans = i->trans;
-
- bch2_btree_trans_to_text(&buf, trans);
-
- prt_printf(&buf, "backtrace:\n");
- printbuf_indent_add(&buf, 2);
- bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
- printbuf_indent_sub(&buf, 2);
- prt_newline(&buf);
- }
-
- bch2_print_str_nonblocking(g->g->trans->c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- BUG();
- }
+ if (unlikely(!best))
+ break_cycle_fail(g);
ret = abort_lock(g, abort);
out:
@@ -255,15 +259,14 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
struct printbuf *cycle)
{
struct btree_trans *orig_trans = g->g->trans;
- struct trans_waiting_for_lock *i;
- for (i = g->g; i < g->g + g->nr; i++)
+ for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++)
if (i->trans == trans) {
closure_put(&trans->ref);
return break_cycle(g, cycle, i);
}
- if (g->nr == ARRAY_SIZE(g->g)) {
+ if (unlikely(g->nr == ARRAY_SIZE(g->g))) {
closure_put(&trans->ref);
if (orig_trans->lock_may_not_fail)
@@ -308,7 +311,7 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
lock_graph_down(&g, trans);
/* trans->paths is rcu protected vs. freeing */
- rcu_read_lock();
+ guard(rcu)();
if (cycle)
cycle->atomic++;
next:
@@ -406,7 +409,6 @@ up:
out:
if (cycle)
--cycle->atomic;
- rcu_read_unlock();
return ret;
}
@@ -769,7 +771,7 @@ static inline void __bch2_trans_unlock(struct btree_trans *trans)
}
static noinline __cold void bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path,
- struct get_locks_fail *f, bool trace)
+ struct get_locks_fail *f, bool trace, ulong ip)
{
if (!trace)
goto out;
@@ -794,7 +796,7 @@ static noinline __cold void bch2_trans_relock_fail(struct btree_trans *trans, st
prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
}
- trace_trans_restart_relock(trans, _RET_IP_, buf.buf);
+ trace_trans_restart_relock(trans, ip, buf.buf);
printbuf_exit(&buf);
}
@@ -804,7 +806,7 @@ out:
bch2_trans_verify_locks(trans);
}
-static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace)
+static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace, ulong ip)
{
bch2_trans_verify_locks(trans);
@@ -823,7 +825,7 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace)
if (path->should_be_locked &&
(ret = btree_path_get_locks(trans, path, false, &f,
BCH_ERR_transaction_restart_relock))) {
- bch2_trans_relock_fail(trans, path, &f, trace);
+ bch2_trans_relock_fail(trans, path, &f, trace, ip);
return ret;
}
}
@@ -836,12 +838,12 @@ out:
int bch2_trans_relock(struct btree_trans *trans)
{
- return __bch2_trans_relock(trans, true);
+ return __bch2_trans_relock(trans, true, _RET_IP_);
}
int bch2_trans_relock_notrace(struct btree_trans *trans)
{
- return __bch2_trans_relock(trans, false);
+ return __bch2_trans_relock(trans, false, _RET_IP_);
}
void bch2_trans_unlock(struct btree_trans *trans)
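
Threading "ulong ip" through __bch2_trans_relock() fixes the tracepoint's attribution: taking _RET_IP_ inside the shared helper blamed every relock failure on the same place, while taking it in the exported wrappers records the real call site. A minimal userspace sketch of the idea, using the kernel's definition of _RET_IP_:

#include <stdio.h>

#define _RET_IP_ ((unsigned long) __builtin_return_address(0))

static int relock_common(int trace, unsigned long ip)
{
	if (trace)
		printf("relock failed, caller ip %#lx\n", ip);
	return 0;
}

/* Take the return address in the wrappers, so 'ip' identifies the
 * function that called relock(), not this shared helper. */
int relock(void)         { return relock_common(1, _RET_IP_); }
int relock_notrace(void) { return relock_common(0, _RET_IP_); }
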
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index 9adca77e2580..f2173a3316f4 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -417,8 +417,10 @@ static inline void btree_path_set_should_be_locked(struct btree_trans *trans, st
EBUG_ON(!btree_node_locked(path, path->level));
EBUG_ON(path->uptodate);
- path->should_be_locked = true;
- trace_btree_path_should_be_locked(trans, path);
+ if (!path->should_be_locked) {
+ path->should_be_locked = true;
+ trace_btree_path_should_be_locked(trans, path);
+ }
}
static inline void __btree_path_set_level_up(struct btree_trans *trans,
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 5a97a6b8a757..a3fb07c60e25 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -75,39 +75,6 @@ static inline u64 bkey_journal_seq(struct bkey_s_c k)
}
}
-static bool found_btree_node_is_readable(struct btree_trans *trans,
- struct found_btree_node *f)
-{
- struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
-
- found_btree_node_to_key(&tmp.k, f);
-
- struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false);
- bool ret = !IS_ERR_OR_NULL(b);
- if (!ret)
- return ret;
-
- f->sectors_written = b->written;
- f->journal_seq = le64_to_cpu(b->data->keys.journal_seq);
-
- struct bkey_s_c k;
- struct bkey unpacked;
- struct btree_node_iter iter;
- for_each_btree_node_key_unpack(b, k, &iter, &unpacked)
- f->journal_seq = max(f->journal_seq, bkey_journal_seq(k));
-
- six_unlock_read(&b->c.lock);
-
- /*
- * We might update this node's range; if that happens, we need the node
- * to be re-read so the read path can trim keys that are no longer in
- * this node
- */
- if (b != btree_node_root(trans->c, b))
- bch2_btree_node_evict(trans, &tmp.k);
- return ret;
-}
-
static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
{
const struct found_btree_node *l = _l;
@@ -159,17 +126,17 @@ static const struct min_heap_callbacks found_btree_node_heap_cbs = {
};
static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
- struct bio *bio, struct btree_node *bn, u64 offset)
+ struct btree *b, struct bio *bio, u64 offset)
{
struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
+ struct btree_node *bn = b->data;
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
bio->bi_iter.bi_sector = offset;
- bch2_bio_map(bio, bn, PAGE_SIZE);
+ bch2_bio_map(bio, b->data, c->opts.block_size);
u64 submit_time = local_clock();
submit_bio_wait(bio);
-
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
if (bio->bi_status) {
@@ -217,7 +184,28 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
};
rcu_read_unlock();
- if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
+ bio->bi_iter.bi_sector = offset;
+ bch2_bio_map(bio, b->data, c->opts.btree_node_size);
+
+ submit_time = local_clock();
+ submit_bio_wait(bio);
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
+
+ found_btree_node_to_key(&b->key, &n);
+
+ CLASS(printbuf, buf)();
+ if (!bch2_btree_node_read_done(c, ca, b, NULL, &buf)) {
+ /* read_done will swap out b->data for another buffer */
+ bn = b->data;
+ /*
+ * Grab journal_seq here because we want the max journal_seq of
+ * any bset; read_done sorts down to a single set and picks the
+ * max journal_seq
+ */
+ n.journal_seq = le64_to_cpu(bn->keys.journal_seq);
+ n.sectors_written = b->written;
+
mutex_lock(&f->lock);
if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
bch_err(c, "try_read_btree_node() can't handle endian conversion");
@@ -237,12 +225,20 @@ static int read_btree_nodes_worker(void *p)
struct find_btree_nodes_worker *w = p;
struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
struct bch_dev *ca = w->ca;
- void *buf = (void *) __get_free_page(GFP_KERNEL);
- struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
unsigned long last_print = jiffies;
+ struct btree *b = NULL;
+ struct bio *bio = NULL;
+
+ b = __bch2_btree_node_mem_alloc(c);
+ if (!b) {
+ bch_err(c, "read_btree_nodes_worker: error allocating buf");
+ w->f->ret = -ENOMEM;
+ goto err;
+ }
- if (!buf || !bio) {
- bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
+ bio = bio_alloc(NULL, buf_pages(b->data, c->opts.btree_node_size), 0, GFP_KERNEL);
+ if (!bio) {
+ bch_err(c, "read_btree_nodes_worker: error allocating bio");
w->f->ret = -ENOMEM;
goto err;
}
@@ -266,11 +262,13 @@ static int read_btree_nodes_worker(void *p)
!bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
continue;
- try_read_btree_node(w->f, ca, bio, buf, sector);
+ try_read_btree_node(w->f, ca, b, bio, sector);
}
err:
+ if (b)
+ __btree_node_data_free(b);
+ kfree(b);
bio_put(bio);
- free_page((unsigned long) buf);
enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
closure_put(w->cl);
kfree(w);
@@ -363,6 +361,8 @@ static int handle_overwrites(struct bch_fs *c,
min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
}
}
+
+ cond_resched();
}
return 0;
@@ -519,8 +519,12 @@ bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
return false;
}
-bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
+int bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
{
+ int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ if (ret)
+ return ret;
+
struct found_btree_node search = {
.btree_id = btree,
.level = 0,
diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h
index 08687b209787..66e6f9ed19d0 100644
--- a/fs/bcachefs/btree_node_scan.h
+++ b/fs/bcachefs/btree_node_scan.h
@@ -4,7 +4,7 @@
int bch2_scan_for_btree_nodes(struct bch_fs *);
bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *);
-bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id);
+int bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id);
int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos);
void bch2_find_btree_nodes_exit(struct find_btree_nodes *);
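
bch2_btree_has_scanned_nodes() changes from bool to int because it can now fail: it may need to run the btree node scan recovery pass first, and that can return an error. The usual kernel convention applies: negative values are errors, zero/positive carries the boolean answer. An illustrative sketch with hypothetical stubs:

#include <stddef.h>

static int   ensure_scan_ran(void) { return 0;    }	/* stub: may fail */
static void *lookup(int btree)     { (void) btree; return NULL; }	/* stub */

/* Negative return = error; otherwise 0 or 1 as the predicate. */
static int has_scanned_nodes(int btree)
{
	int ret = ensure_scan_ran();
	if (ret)
		return ret;

	return lookup(btree) != NULL;
}
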
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 1c03c965d836..639ef75b3dbd 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -376,7 +376,7 @@ static inline int btree_key_can_insert(struct btree_trans *trans,
struct btree *b, unsigned u64s)
{
if (!bch2_btree_node_insert_fits(b, u64s))
- return -BCH_ERR_btree_insert_btree_node_full;
+ return bch_err_throw(trans->c, btree_insert_btree_node_full);
return 0;
}
@@ -394,9 +394,10 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
if (!new_k) {
- bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
+ struct bch_fs *c = trans->c;
+ bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
bch2_btree_id_str(path->btree_id), new_u64s);
- return -BCH_ERR_ENOMEM_btree_key_cache_insert;
+ return bch_err_throw(c, ENOMEM_btree_key_cache_insert);
}
ret = bch2_trans_relock(trans) ?:
@@ -432,7 +433,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
if (watermark < BCH_WATERMARK_reclaim &&
!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
bch2_btree_key_cache_must_wait(c))
- return -BCH_ERR_btree_insert_need_journal_reclaim;
+ return bch_err_throw(c, btree_insert_need_journal_reclaim);
/*
* bch2_varint_decode can read past the end of the buffer by at most 7
@@ -594,12 +595,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
int ret = 0;
bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
+#if 0
+ /* todo: bring back dynamic fault injection */
if (race_fault()) {
trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject);
}
-
+#endif
/*
* Check if the insert will fit in the leaf node with the write lock
* held, otherwise another thread could write the node changing the
@@ -756,6 +758,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
btree_trans_journal_entries_start(trans),
trans->journal_entries.u64s);
+ EBUG_ON(trans->journal_res.u64s < trans->journal_entries.u64s);
+
trans->journal_res.offset += trans->journal_entries.u64s;
trans->journal_res.u64s -= trans->journal_entries.u64s;
@@ -894,7 +898,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
*/
if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
watermark < BCH_WATERMARK_reclaim) {
- ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ ret = bch_err_throw(c, journal_reclaim_would_deadlock);
goto out;
}
@@ -966,13 +970,26 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
for (struct jset_entry *i = btree_trans_journal_entries_start(trans);
i != btree_trans_journal_entries_top(trans);
- i = vstruct_next(i))
+ i = vstruct_next(i)) {
if (i->type == BCH_JSET_ENTRY_btree_keys ||
i->type == BCH_JSET_ENTRY_write_buffer_keys) {
- int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->start);
- if (ret)
- return ret;
+ jset_entry_for_each_key(i, k) {
+ int ret = bch2_journal_key_insert(c, i->btree_id, i->level, k);
+ if (ret)
+ return ret;
+ }
+ }
+
+ if (i->type == BCH_JSET_ENTRY_btree_root) {
+ guard(mutex)(&c->btree_root_lock);
+
+ struct btree_root *r = bch2_btree_id_root(c, i->btree_id);
+
+ bkey_copy(&r->key, i->start);
+ r->level = i->level;
+ r->alive = true;
}
+ }
for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
i != btree_trans_subbuf_top(trans, &trans->accounting);
@@ -989,6 +1006,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
{
struct btree_insert_entry *errored_at = NULL;
struct bch_fs *c = trans->c;
+ unsigned journal_u64s = 0;
int ret = 0;
bch2_trans_verify_not_unlocked_or_in_restart(trans);
@@ -1011,16 +1029,16 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags)))
ret = do_bch2_trans_commit_to_journal_replay(trans);
else
- ret = -BCH_ERR_erofs_trans_commit;
+ ret = bch_err_throw(c, erofs_trans_commit);
goto out_reset;
}
EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
- trans->journal_u64s = trans->journal_entries.u64s + jset_u64s(trans->accounting.u64s);
+ journal_u64s = jset_u64s(trans->accounting.u64s);
trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
if (trans->journal_transaction_names)
- trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
+ journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
trans_for_each_update(trans, i) {
struct btree_path *path = trans->paths + i->path;
@@ -1040,11 +1058,11 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
continue;
/* we're going to journal the key being updated: */
- trans->journal_u64s += jset_u64s(i->k->k.u64s);
+ journal_u64s += jset_u64s(i->k->k.u64s);
/* and we're also going to log the overwrite: */
if (trans->journal_transaction_names)
- trans->journal_u64s += jset_u64s(i->old_k.u64s);
+ journal_u64s += jset_u64s(i->old_k.u64s);
}
if (trans->extra_disk_res) {
@@ -1062,6 +1080,8 @@ retry:
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
+ trans->journal_u64s = journal_u64s + trans->journal_entries.u64s;
+
ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_);
/* make sure we didn't drop or screw up locks: */
@@ -1093,7 +1113,7 @@ err:
* restart:
*/
if (flags & BCH_TRANS_COMMIT_no_journal_res) {
- ret = -BCH_ERR_transaction_restart_nested;
+ ret = bch_err_throw(c, transaction_restart_nested);
goto out;
}
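
Splitting trans->journal_u64s into a precomputed local journal_u64s plus trans->journal_entries.u64s re-added at the top of each retry matters because journal entries can be appended between restarts; recomputing the sum per iteration keeps the reservation in step with what will actually be written. A toy sketch of the shape (hypothetical try_commit()):

#include <stdio.h>

static unsigned entries_u64s;		/* can grow between retries */

static int try_commit(unsigned total)
{
	/* Pretend the first two attempts restart and append an entry: */
	if (entries_u64s < 2) {
		entries_u64s++;
		return -1;
	}
	printf("committed with reservation %u\n", total);
	return 0;
}

int main(void)
{
	unsigned fixed = 8;		/* accounting + transaction name, computed once */
	int ret;

	do {
		/* Recomputed every retry, like trans->journal_u64s: */
		unsigned total = fixed + entries_u64s;

		ret = try_commit(total);
	} while (ret);

	return 0;
}
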
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 9d641bf9d2a2..112170fd9c8f 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -497,6 +497,7 @@ struct btree_trans {
void *mem;
unsigned mem_top;
unsigned mem_bytes;
+ unsigned realloc_bytes_required;
#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
darray_trans_kmalloc_trace trans_kmalloc_trace;
#endif
@@ -555,6 +556,8 @@ struct btree_trans {
unsigned journal_u64s;
unsigned extra_disk_res; /* XXX kill */
+ __BKEY_PADDED(btree_path_down, BKEY_BTREE_PTR_VAL_U64s_MAX);
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
@@ -615,6 +618,9 @@ enum btree_write_type {
x(dying) \
x(fake) \
x(need_rewrite) \
+ x(need_rewrite_error) \
+ x(need_rewrite_degraded) \
+ x(need_rewrite_ptr_written_zero) \
x(never_write) \
x(pinned)
@@ -639,6 +645,32 @@ static inline void clear_btree_node_ ## flag(struct btree *b) \
BTREE_FLAGS()
#undef x
+#define BTREE_NODE_REWRITE_REASON() \
+ x(none) \
+ x(unknown) \
+ x(error) \
+ x(degraded) \
+ x(ptr_written_zero)
+
+enum btree_node_rewrite_reason {
+#define x(n) BTREE_NODE_REWRITE_##n,
+ BTREE_NODE_REWRITE_REASON()
+#undef x
+};
+
+static inline enum btree_node_rewrite_reason btree_node_rewrite_reason(struct btree *b)
+{
+ if (btree_node_need_rewrite_ptr_written_zero(b))
+ return BTREE_NODE_REWRITE_ptr_written_zero;
+ if (btree_node_need_rewrite_degraded(b))
+ return BTREE_NODE_REWRITE_degraded;
+ if (btree_node_need_rewrite_error(b))
+ return BTREE_NODE_REWRITE_error;
+ if (btree_node_need_rewrite(b))
+ return BTREE_NODE_REWRITE_unknown;
+ return BTREE_NODE_REWRITE_none;
+}
+
static inline struct btree_write *btree_current_write(struct btree *b)
{
return b->writes + btree_node_write_idx(b);
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index 5dac09c98026..ee657b9f4b96 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -123,65 +123,44 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans,
}
int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
- enum btree_id id,
- struct bpos old_pos,
- struct bpos new_pos)
+ enum btree_id btree, struct bpos pos,
+ snapshot_id_list *s)
{
- struct bch_fs *c = trans->c;
- struct btree_iter old_iter, new_iter = {};
- struct bkey_s_c old_k, new_k;
- snapshot_id_list s;
- struct bkey_i *update;
int ret = 0;
- if (!bch2_snapshot_has_children(c, old_pos.snapshot))
- return 0;
+ darray_for_each(*s, id) {
+ pos.snapshot = *id;
- darray_init(&s);
-
- bch2_trans_iter_init(trans, &old_iter, id, old_pos,
- BTREE_ITER_not_extents|
- BTREE_ITER_all_snapshots);
- while ((old_k = bch2_btree_iter_prev(trans, &old_iter)).k &&
- !(ret = bkey_err(old_k)) &&
- bkey_eq(old_pos, old_k.k->p)) {
- struct bpos whiteout_pos =
- SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
-
- if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
- snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
- continue;
-
- new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos,
- BTREE_ITER_not_extents|
- BTREE_ITER_intent);
- ret = bkey_err(new_k);
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, pos,
+ BTREE_ITER_not_extents|
+ BTREE_ITER_intent);
+ ret = bkey_err(k);
if (ret)
break;
- if (new_k.k->type == KEY_TYPE_deleted) {
- update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+ if (k.k->type == KEY_TYPE_deleted) {
+ struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
ret = PTR_ERR_OR_ZERO(update);
- if (ret)
+ if (ret) {
+ bch2_trans_iter_exit(trans, &iter);
break;
+ }
bkey_init(&update->k);
- update->k.p = whiteout_pos;
+ update->k.p = pos;
update->k.type = KEY_TYPE_whiteout;
- ret = bch2_trans_update(trans, &new_iter, update,
+ ret = bch2_trans_update(trans, &iter, update,
BTREE_UPDATE_internal_snapshot_node);
}
- bch2_trans_iter_exit(trans, &new_iter);
+ bch2_trans_iter_exit(trans, &iter);
- ret = snapshot_list_add(c, &s, old_k.k->p.snapshot);
if (ret)
break;
}
- bch2_trans_iter_exit(trans, &new_iter);
- bch2_trans_iter_exit(trans, &old_iter);
- darray_exit(&s);
+ darray_exit(s);
return ret;
}
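
The rewritten whiteout helper above takes a precomputed snapshot_id_list -- a darray -- and simply walks it. darray is bcachefs's growable-array idiom: darray_for_each() yields a pointer to each element and darray_exit() frees the storage. A minimal userspace sketch of that idiom (illustrative, not fs/bcachefs/darray.h):

#include <stdlib.h>

struct darray_u32 {
	unsigned	*data;
	size_t		nr, size;
};

#define darray_for_each(d, i) \
	for (typeof((d).data) i = (d).data; i < (d).data + (d).nr; i++)

static int darray_push(struct darray_u32 *d, unsigned v)
{
	if (d->nr == d->size) {
		size_t n = d->size ? d->size * 2 : 8;
		void *p = realloc(d->data, n * sizeof(d->data[0]));

		if (!p)
			return -1;
		d->data = p;
		d->size = n;
	}
	d->data[d->nr++] = v;
	return 0;
}

static void darray_exit(struct darray_u32 *d)
{
	free(d->data);
	d->data = NULL;
	d->nr = d->size = 0;
}
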
@@ -570,20 +549,26 @@ void *__bch2_trans_subbuf_alloc(struct btree_trans *trans,
unsigned u64s)
{
unsigned new_top = buf->u64s + u64s;
- unsigned old_size = buf->size;
+ unsigned new_size = buf->size;
- if (new_top > buf->size)
- buf->size = roundup_pow_of_two(new_top);
+ BUG_ON(roundup_pow_of_two(new_top) > U16_MAX);
- void *n = bch2_trans_kmalloc_nomemzero(trans, buf->size * sizeof(u64));
+ if (new_top > new_size)
+ new_size = roundup_pow_of_two(new_top);
+
+ void *n = bch2_trans_kmalloc_nomemzero(trans, new_size * sizeof(u64));
if (IS_ERR(n))
return n;
+ unsigned offset = (u64 *) n - (u64 *) trans->mem;
+ BUG_ON(offset > U16_MAX);
+
if (buf->u64s)
memcpy(n,
btree_trans_subbuf_base(trans, buf),
- old_size * sizeof(u64));
+ buf->size * sizeof(u64));
buf->base = (u64 *) n - (u64 *) trans->mem;
+ buf->size = new_size;
void *p = btree_trans_subbuf_top(trans, buf);
buf->u64s = new_top;
@@ -608,7 +593,7 @@ int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
BUG_ON(k.k->type != KEY_TYPE_deleted);
if (bkey_gt(k.k->p, end)) {
- ret = -BCH_ERR_ENOSPC_btree_slot;
+ ret = bch_err_throw(trans->c, ENOSPC_btree_slot);
goto err;
}
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index f907eaa8b185..0b98ab959719 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -4,6 +4,7 @@
#include "btree_iter.h"
#include "journal.h"
+#include "snapshot.h"
struct bch_fs;
struct btree;
@@ -74,7 +75,7 @@ static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans,
}
int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id,
- struct bpos, struct bpos);
+ struct bpos, snapshot_id_list *);
/*
* For use when splitting extents in existing snapshots:
@@ -88,11 +89,20 @@ static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
struct bpos old_pos,
struct bpos new_pos)
{
+ BUG_ON(old_pos.snapshot != new_pos.snapshot);
+
if (!btree_type_has_snapshots(btree) ||
bkey_eq(old_pos, new_pos))
return 0;
- return __bch2_insert_snapshot_whiteouts(trans, btree, old_pos, new_pos);
+ snapshot_id_list s;
+ int ret = bch2_get_snapshot_overwrites(trans, btree, old_pos, &s);
+ if (ret)
+ return ret;
+
+ return s.nr
+ ? __bch2_insert_snapshot_whiteouts(trans, btree, new_pos, &s)
+ : 0;
}
int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *,
@@ -160,8 +170,7 @@ bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *);
-int bch2_btree_write_buffer_insert_err(struct btree_trans *,
- enum btree_id, struct bkey_i *);
+int bch2_btree_write_buffer_insert_err(struct bch_fs *, enum btree_id, struct bkey_i *);
static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
enum btree_id btree,
@@ -172,7 +181,7 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr
EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
if (unlikely(!btree_type_uses_write_buffer(btree))) {
- int ret = bch2_btree_write_buffer_insert_err(trans, btree, k);
+ int ret = bch2_btree_write_buffer_insert_err(trans->c, btree, k);
dump_stack();
return ret;
}
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 74e65714fecd..553059b33bfd 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -57,8 +57,6 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
struct bkey_buf prev;
int ret = 0;
- printbuf_indent_add_nextline(&buf, 2);
-
BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
!bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
b->data->min_key));
@@ -69,20 +67,23 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
if (b == btree_node_root(c, b)) {
if (!bpos_eq(b->data->min_key, POS_MIN)) {
- ret = __bch2_topology_error(c, &buf);
-
+ bch2_log_msg_start(c, &buf);
+ prt_printf(&buf, "btree root with incorrect min_key: ");
bch2_bpos_to_text(&buf, b->data->min_key);
- log_fsck_err(trans, btree_root_bad_min_key,
- "btree root with incorrect min_key: %s", buf.buf);
- goto out;
+ prt_newline(&buf);
+
+ bch2_count_fsck_err(c, btree_root_bad_min_key, &buf);
+ goto err;
}
if (!bpos_eq(b->data->max_key, SPOS_MAX)) {
- ret = __bch2_topology_error(c, &buf);
+ bch2_log_msg_start(c, &buf);
+ prt_printf(&buf, "btree root with incorrect max_key: ");
bch2_bpos_to_text(&buf, b->data->max_key);
- log_fsck_err(trans, btree_root_bad_max_key,
- "btree root with incorrect max_key: %s", buf.buf);
- goto out;
+ prt_newline(&buf);
+
+ bch2_count_fsck_err(c, btree_root_bad_max_key, &buf);
+ goto err;
}
}
@@ -100,19 +101,15 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
: bpos_successor(prev.k->k.p);
if (!bpos_eq(expected_min, bp.v->min_key)) {
- ret = __bch2_topology_error(c, &buf);
-
- prt_str(&buf, "end of prev node doesn't match start of next node\nin ");
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_str(&buf, " node ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, "end of prev node doesn't match start of next node");
prt_str(&buf, "\nprev ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
prt_str(&buf, "\nnext ");
bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
- log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf);
- goto out;
+ bch2_count_fsck_err(c, btree_node_topology_bad_min_key, &buf);
+ goto err;
}
bch2_bkey_buf_reassemble(&prev, c, k);
@@ -120,32 +117,34 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
}
if (bkey_deleted(&prev.k->k)) {
- ret = __bch2_topology_error(c, &buf);
-
- prt_str(&buf, "empty interior node\nin ");
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_str(&buf, " node ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
- log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf);
- } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
- ret = __bch2_topology_error(c, &buf);
+ prt_printf(&buf, "empty interior node\n");
+ bch2_count_fsck_err(c, btree_node_topology_empty_interior_node, &buf);
+ goto err;
+ }
- prt_str(&buf, "last child node doesn't end at end of parent node\nin ");
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_str(&buf, " node ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- prt_str(&buf, "\nlast key ");
+ if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
+ prt_str(&buf, "last child node doesn't end at end of parent node\nchild: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
+ prt_newline(&buf);
- log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf);
+ bch2_count_fsck_err(c, btree_node_topology_bad_max_key, &buf);
+ goto err;
}
out:
-fsck_err:
bch2_btree_and_journal_iter_exit(&iter);
bch2_bkey_buf_exit(&prev, c);
printbuf_exit(&buf);
return ret;
+err:
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_char(&buf, ' ');
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_newline(&buf);
+
+ ret = __bch2_topology_error(c, &buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ BUG_ON(!ret);
+ goto out;
}
/* Calculate ideal packed bkey format for new btree nodes: */
@@ -685,12 +684,31 @@ static void btree_update_nodes_written(struct btree_update *as)
/*
* Wait for any in flight writes to finish before we free the old nodes
- * on disk:
+ * on disk. But we haven't pinned those old nodes in the btree cache,
+ * so they might have already been evicted.
+ *
+ * The update we're completing deleted references to those nodes from the
+ * btree, so we know if they've been evicted they can't be pulled back in.
+ * We just have to check if the nodes we have pointers to are still those
+ * old nodes, and haven't been reused.
+ *
+ * This can't be done locklessly because the data buffer might have been
+ * vmalloc allocated, and vmalloc allocations aren't RCU freed. We also need the
+ * __no_kmsan_checks annotation because even with the btree node read
+ * lock, nothing tells us that the data buffer has been initialized (if
+ * the btree node has been reused for a different node, and the data
+ * buffer swapped for a new data buffer).
*/
for (i = 0; i < as->nr_old_nodes; i++) {
b = as->old_nodes[i];
- if (btree_node_seq_matches(b, as->old_nodes_seq[i]))
+ bch2_trans_begin(trans);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+ bool seq_matches = btree_node_seq_matches(b, as->old_nodes_seq[i]);
+ six_unlock_read(&b->c.lock);
+ bch2_trans_unlock_long(trans);
+
+ if (seq_matches)
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
TASK_UNINTERRUPTIBLE);
}
@@ -1120,6 +1138,13 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
start_time);
}
+static const char * const btree_node_reawrite_reason_strs[] = {
+#define x(n) #n,
+ BTREE_NODE_REWRITE_REASON()
+#undef x
+ NULL,
+};
+
static struct btree_update *
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
unsigned level_start, bool split,
@@ -1214,6 +1239,15 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
list_add_tail(&as->list, &c->btree_interior_update_list);
mutex_unlock(&c->btree_interior_update_lock);
+ struct btree *b = btree_path_node(path, path->level);
+ as->node_start = b->data->min_key;
+ as->node_end = b->data->max_key;
+ as->node_needed_rewrite = btree_node_rewrite_reason(b);
+ as->node_written = b->written;
+ as->node_sectors = btree_buf_bytes(b) >> 9;
+ as->node_remaining = __bch2_btree_u64s_remaining(b,
+ btree_bkey_last(b, bset_tree_last(b)));
+
/*
* We don't want to allocate if we're in an error state, that can cause
* deadlock on emergency shutdown due to open buckets getting stuck in
@@ -1245,7 +1279,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
if (bch2_err_matches(ret, ENOSPC) &&
(flags & BCH_TRANS_COMMIT_journal_reclaim) &&
watermark < BCH_WATERMARK_reclaim) {
- ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ ret = bch_err_throw(c, journal_reclaim_would_deadlock);
goto err;
}
@@ -1253,10 +1287,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
do {
ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, &cl);
-
+ if (!bch2_err_matches(ret, BCH_ERR_operation_blocked))
+ break;
bch2_trans_unlock(trans);
bch2_wait_on_allocator(c, &cl);
- } while (bch2_err_matches(ret, BCH_ERR_operation_blocked));
+ } while (1);
}
if (ret) {
@@ -2090,6 +2125,9 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
if (ret)
goto err;
+ as->node_start = prev->data->min_key;
+ as->node_end = next->data->max_key;
+
trace_and_count(c, btree_node_merge, trans, b);
n = bch2_btree_node_alloc(as, trans, b->c.level);
@@ -2178,7 +2216,7 @@ static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter,
if (btree_iter_path(trans, iter)->l[b->c.level].b != b) {
/* node has been freed: */
BUG_ON(!btree_node_dying(b));
- ret = -BCH_ERR_btree_node_dying;
+ ret = bch_err_throw(trans->c, btree_node_dying);
goto err;
}
@@ -2256,9 +2294,9 @@ err:
goto out;
}
-static int bch2_btree_node_rewrite_key(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_i *k, unsigned flags)
+int bch2_btree_node_rewrite_key(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_i *k, unsigned flags)
{
struct btree_iter iter;
bch2_trans_node_iter_init(trans, &iter,
@@ -2330,9 +2368,8 @@ static void async_btree_node_rewrite_work(struct work_struct *work)
int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans,
a->btree_id, a->level, a->key.k, 0));
- if (ret != -ENOENT &&
- !bch2_err_matches(ret, EROFS) &&
- ret != -BCH_ERR_journal_shutdown)
+ if (!bch2_err_matches(ret, ENOENT) &&
+ !bch2_err_matches(ret, EROFS))
bch_err_fn_ratelimited(c, ret);
spin_lock(&c->btree_node_rewrites_lock);
@@ -2663,9 +2700,19 @@ static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update
prt_str(out, " ");
bch2_btree_id_to_text(out, as->btree_id);
- prt_printf(out, " l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
+ prt_printf(out, " l=%u-%u ",
as->update_level_start,
- as->update_level_end,
+ as->update_level_end);
+ bch2_bpos_to_text(out, as->node_start);
+ prt_char(out, ' ');
+ bch2_bpos_to_text(out, as->node_end);
+ prt_printf(out, "\nwritten %u/%u u64s_remaining %u need_rewrite %s",
+ as->node_written,
+ as->node_sectors,
+ as->node_remaining,
+ btree_node_reawrite_reason_strs[as->node_needed_rewrite]);
+
+ prt_printf(out, "\nmode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
bch2_btree_update_modes[as->mode],
as->nodes_written,
closure_nr_remaining(&as->cl),
@@ -2792,16 +2839,16 @@ int bch2_fs_btree_interior_update_init(struct bch_fs *c)
c->btree_interior_update_worker =
alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
if (!c->btree_interior_update_worker)
- return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
+ return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init);
c->btree_node_rewrite_worker =
alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
if (!c->btree_node_rewrite_worker)
- return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
+ return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init);
if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
sizeof(struct btree_update)))
- return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;
+ return bch_err_throw(c, ENOMEM_btree_interior_update_pool_init);
return 0;
}
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 7fe793788a79..ac04e45a8515 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -57,6 +57,13 @@ struct btree_update {
unsigned took_gc_lock:1;
enum btree_id btree_id;
+ struct bpos node_start;
+ struct bpos node_end;
+ enum btree_node_rewrite_reason node_needed_rewrite;
+ u16 node_written;
+ u16 node_sectors;
+ u16 node_remaining;
+
unsigned update_level_start;
unsigned update_level_end;
@@ -169,6 +176,9 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
struct btree *, unsigned, unsigned);
+int bch2_btree_node_rewrite_key(struct btree_trans *,
+ enum btree_id, unsigned,
+ struct bkey_i *, unsigned);
int bch2_btree_node_rewrite_pos(struct btree_trans *,
enum btree_id, unsigned,
struct bpos, unsigned, unsigned);
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index efb0c64d0aac..4b095235a0d2 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -267,10 +267,9 @@ out:
BUG_ON(wb->sorted.size < wb->flushing.keys.nr);
}
-int bch2_btree_write_buffer_insert_err(struct btree_trans *trans,
+int bch2_btree_write_buffer_insert_err(struct bch_fs *c,
enum btree_id btree, struct bkey_i *k)
{
- struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
prt_printf(&buf, "attempting to do write buffer update on non wb btree=");
@@ -332,7 +331,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
if (unlikely(!btree_type_uses_write_buffer(k->btree))) {
- ret = bch2_btree_write_buffer_insert_err(trans, k->btree, &k->k);
+ ret = bch2_btree_write_buffer_insert_err(trans->c, k->btree, &k->k);
goto err;
}
@@ -394,7 +393,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
bool accounting_accumulated = false;
do {
if (race_fault()) {
- ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ ret = bch_err_throw(c, journal_reclaim_would_deadlock);
break;
}
@@ -633,7 +632,7 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans)
struct bch_fs *c = trans->c;
if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer))
- return -BCH_ERR_erofs_no_writes;
+ return bch_err_throw(c, erofs_no_writes);
int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans);
enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer);
@@ -676,7 +675,10 @@ int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans,
goto err;
bch2_bkey_buf_copy(last_flushed, c, tmp.k);
- ret = -BCH_ERR_transaction_restart_write_buffer_flush;
+
+ /* can we avoid the unconditional restart? */
+ trace_and_count(c, trans_restart_write_buffer_flush, trans, _RET_IP_);
+ ret = bch_err_throw(c, transaction_restart_write_buffer_flush);
}
err:
bch2_bkey_buf_exit(&tmp, c);
diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h
index 05f56fd1eed0..c351d21aca0b 100644
--- a/fs/bcachefs/btree_write_buffer.h
+++ b/fs/bcachefs/btree_write_buffer.h
@@ -89,6 +89,12 @@ static inline int bch2_journal_key_to_wb(struct bch_fs *c,
struct journal_keys_to_wb *dst,
enum btree_id btree, struct bkey_i *k)
{
+ if (unlikely(!btree_type_uses_write_buffer(btree))) {
+ int ret = bch2_btree_write_buffer_insert_err(c, btree, k);
+ dump_stack();
+ return ret;
+ }
+
EBUG_ON(!dst->seq);
return k->k.type == KEY_TYPE_accounting
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 09eb5a543ae4..f25903c10e8a 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -221,6 +221,20 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
if (!p.ptr.cached &&
data_type == BCH_DATA_btree) {
+ switch (g->data_type) {
+ case BCH_DATA_sb:
+ bch_err(c, "btree and superblock in the same bucket - cannot repair");
+ ret = bch_err_throw(c, fsck_repair_unimplemented);
+ goto out;
+ case BCH_DATA_journal:
+ ret = bch2_dev_journal_bucket_delete(ca, PTR_BUCKET_NR(ca, &p.ptr));
+ bch_err_msg(c, ret, "error deleting journal bucket %zu",
+ PTR_BUCKET_NR(ca, &p.ptr));
+ if (ret)
+ goto out;
+ break;
+ }
+
g->data_type = data_type;
g->stripe_sectors = 0;
g->dirty_sectors = 0;
@@ -270,6 +284,9 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
struct printbuf buf = PRINTBUF;
int ret = 0;
+ /* We don't yet do btree key updates correctly for when we're RW */
+ BUG_ON(test_bit(BCH_FS_rw, &c->flags));
+
bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) {
ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update);
if (ret)
@@ -277,20 +294,13 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
}
if (do_update) {
- if (flags & BTREE_TRIGGER_is_root) {
- bch_err(c, "cannot update btree roots yet");
- ret = -EINVAL;
- goto err;
- }
-
struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
ret = PTR_ERR_OR_ZERO(new);
if (ret)
goto err;
- rcu_read_lock();
- bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev));
- rcu_read_unlock();
+ scoped_guard(rcu)
+ bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev));
if (level) {
/*
@@ -299,14 +309,11 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
* sort it out:
*/
struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
- rcu_read_lock();
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- struct bucket *g = PTR_GC_BUCKET(ca, ptr);
-
- ptr->gen = g->gen;
- }
- rcu_read_unlock();
+ scoped_guard(rcu)
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ ptr->gen = PTR_GC_BUCKET(ca, ptr)->gen;
+ }
} else {
struct bkey_ptrs ptrs;
union bch_extent_entry *entry;
@@ -370,19 +377,41 @@ found:
bch_info(c, "new key %s", buf.buf);
}
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
- BTREE_ITER_intent|BTREE_ITER_all_snapshots);
- ret = bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_trans_update(trans, &iter, new,
- BTREE_UPDATE_internal_snapshot_node|
- BTREE_TRIGGER_norun);
- bch2_trans_iter_exit(trans, &iter);
- if (ret)
- goto err;
+ if (!(flags & BTREE_TRIGGER_is_root)) {
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
+ BTREE_ITER_intent|BTREE_ITER_all_snapshots);
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
+ bch2_trans_update(trans, &iter, new,
+ BTREE_UPDATE_internal_snapshot_node|
+ BTREE_TRIGGER_norun);
+ bch2_trans_iter_exit(trans, &iter);
+ if (ret)
+ goto err;
+
+ if (level)
+ bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
+ } else {
+ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans,
+ jset_u64s(new->k.u64s));
+ ret = PTR_ERR_OR_ZERO(e);
+ if (ret)
+ goto err;
+
+ journal_entry_set(e,
+ BCH_JSET_ENTRY_btree_root,
+ btree, level - 1,
+ new, new->k.u64s);
- if (level)
- bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
+ /*
+ * no locking, we're single threaded and not rw yet, see
+ * the big assertion above that we repeat here:
+ */
+ BUG_ON(test_bit(BCH_FS_rw, &c->flags));
+
+ struct btree *b = bch2_btree_id_root(c, btree)->b;
+ bkey_copy(&b->key, new);
+ }
}
err:
printbuf_exit(&buf);
@@ -406,7 +435,15 @@ static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf
if (insert) {
bch2_trans_updates_to_text(buf, trans);
__bch2_inconsistent_error(c, buf);
- ret = -BCH_ERR_bucket_ref_update;
+ /*
+ * If we're in recovery, run_explicit_recovery_pass might give
+ * us an error code for rewinding recovery
+ */
+ if (!ret)
+ ret = bch_err_throw(c, bucket_ref_update);
+ } else {
+ /* Always ignore overwrite errors, so that deletion works */
+ ret = 0;
}
if (print || insert)
@@ -595,7 +632,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
if (unlikely(!ca)) {
if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID)
- ret = -BCH_ERR_trigger_pointer;
+ ret = bch_err_throw(c, trigger_pointer);
goto err;
}
@@ -603,7 +640,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
if (!bucket_valid(ca, bucket.offset)) {
if (insert) {
bch2_dev_bucket_missing(ca, bucket.offset);
- ret = -BCH_ERR_trigger_pointer;
+ ret = bch_err_throw(c, trigger_pointer);
}
goto err;
}
@@ -625,7 +662,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
p.ptr.dev,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = -BCH_ERR_trigger_pointer;
+ ret = bch_err_throw(c, trigger_pointer);
goto err;
}
@@ -651,6 +688,8 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
s64 sectors,
enum btree_iter_update_trigger_flags flags)
{
+ struct bch_fs *c = trans->c;
+
if (flags & BTREE_TRIGGER_transactional) {
struct btree_iter iter;
struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter,
@@ -668,7 +707,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
bch2_trans_inconsistent(trans,
"stripe pointer doesn't match stripe %llu",
(u64) p.ec.idx);
- ret = -BCH_ERR_trigger_stripe_pointer;
+ ret = bch_err_throw(c, trigger_stripe_pointer);
goto err;
}
@@ -688,13 +727,11 @@ err:
}
if (flags & BTREE_TRIGGER_gc) {
- struct bch_fs *c = trans->c;
-
struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL);
if (!m) {
bch_err(c, "error allocating memory for gc_stripes, idx %llu",
(u64) p.ec.idx);
- return -BCH_ERR_ENOMEM_mark_stripe_ptr;
+ return bch_err_throw(c, ENOMEM_mark_stripe_ptr);
}
gc_stripe_lock(m);
@@ -709,7 +746,7 @@ err:
__bch2_inconsistent_error(c, &buf);
bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
- return -BCH_ERR_trigger_stripe_pointer;
+ return bch_err_throw(c, trigger_stripe_pointer);
}
m->block_sectors[p.ec.block] += sectors;
@@ -732,8 +769,7 @@ err:
static int __trigger_extent(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k,
- enum btree_iter_update_trigger_flags flags,
- s64 *replicas_sectors)
+ enum btree_iter_update_trigger_flags flags)
{
bool gc = flags & BTREE_TRIGGER_gc;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -744,6 +780,8 @@ static int __trigger_extent(struct btree_trans *trans,
: BCH_DATA_user;
int ret = 0;
+ s64 replicas_sectors = 0;
+
struct disk_accounting_pos acc_replicas_key;
memset(&acc_replicas_key, 0, sizeof(acc_replicas_key));
acc_replicas_key.type = BCH_DISK_ACCOUNTING_replicas;
@@ -770,7 +808,7 @@ static int __trigger_extent(struct btree_trans *trans,
if (ret)
return ret;
} else if (!p.has_ec) {
- *replicas_sectors += disk_sectors;
+ replicas_sectors += disk_sectors;
replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev);
} else {
ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
@@ -808,13 +846,13 @@ static int __trigger_extent(struct btree_trans *trans,
}
if (acc_replicas_key.replicas.nr_devs) {
- ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, replicas_sectors, 1, gc);
+ ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc);
if (ret)
return ret;
}
if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) {
- ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, snapshot, k.k->p.snapshot);
+ ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, snapshot, k.k->p.snapshot);
if (ret)
return ret;
}
@@ -830,7 +868,7 @@ static int __trigger_extent(struct btree_trans *trans,
}
if (level) {
- ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, btree, btree_id);
+ ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, btree, btree_id);
if (ret)
return ret;
} else {
@@ -839,7 +877,7 @@ static int __trigger_extent(struct btree_trans *trans,
s64 v[3] = {
insert ? 1 : -1,
insert ? k.k->size : -((s64) k.k->size),
- *replicas_sectors,
+ replicas_sectors,
};
ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode);
if (ret)
@@ -871,20 +909,16 @@ int bch2_trigger_extent(struct btree_trans *trans,
return 0;
if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
- s64 old_replicas_sectors = 0, new_replicas_sectors = 0;
-
if (old.k->type) {
int ret = __trigger_extent(trans, btree, level, old,
- flags & ~BTREE_TRIGGER_insert,
- &old_replicas_sectors);
+ flags & ~BTREE_TRIGGER_insert);
if (ret)
return ret;
}
if (new.k->type) {
int ret = __trigger_extent(trans, btree, level, new.s_c,
- flags & ~BTREE_TRIGGER_overwrite,
- &new_replicas_sectors);
+ flags & ~BTREE_TRIGGER_overwrite);
if (ret)
return ret;
}
@@ -971,15 +1005,16 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
bch2_data_type_str(type),
bch2_data_type_str(type));
- bool print = bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf);
+ bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf);
- bch2_run_explicit_recovery_pass(c, &buf,
+ ret = bch2_run_explicit_recovery_pass(c, &buf,
BCH_RECOVERY_PASS_check_allocations, 0);
- if (print)
- bch2_print_str(c, KERN_ERR, buf.buf);
+ /* Always print, this is always fatal */
+ bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
- ret = -BCH_ERR_metadata_bucket_inconsistency;
+ if (!ret)
+ ret = bch_err_throw(c, metadata_bucket_inconsistency);
goto err;
}
@@ -1032,7 +1067,7 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *
err_unlock:
bucket_unlock(g);
err:
- return -BCH_ERR_metadata_bucket_inconsistency;
+ return bch_err_throw(c, metadata_bucket_inconsistency);
}
int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
@@ -1247,7 +1282,7 @@ recalculate:
ret = 0;
} else {
atomic64_set(&c->sectors_available, sectors_available);
- ret = -BCH_ERR_ENOSPC_disk_reservation;
+ ret = bch_err_throw(c, ENOSPC_disk_reservation);
}
mutex_unlock(&c->sectors_available_lock);
@@ -1276,7 +1311,7 @@ int bch2_buckets_nouse_alloc(struct bch_fs *c)
GFP_KERNEL|__GFP_ZERO);
if (!ca->buckets_nouse) {
bch2_dev_put(ca);
- return -BCH_ERR_ENOMEM_buckets_nouse;
+ return bch_err_throw(c, ENOMEM_buckets_nouse);
}
}
@@ -1301,12 +1336,12 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
lockdep_assert_held(&c->state_lock);
if (resize && ca->buckets_nouse)
- return -BCH_ERR_no_resize_with_buckets_nouse;
+ return bch_err_throw(c, no_resize_with_buckets_nouse);
bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets),
GFP_KERNEL|__GFP_ZERO);
if (!bucket_gens) {
- ret = -BCH_ERR_ENOMEM_bucket_gens;
+ ret = bch_err_throw(c, ENOMEM_bucket_gens);
goto err;
}
@@ -1325,9 +1360,9 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
sizeof(bucket_gens->b[0]) * copy);
}
- ret = bch2_bucket_bitmap_resize(&ca->bucket_backpointer_mismatch,
+ ret = bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_mismatch,
ca->mi.nbuckets, nbuckets) ?:
- bch2_bucket_bitmap_resize(&ca->bucket_backpointer_empty,
+ bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_empty,
ca->mi.nbuckets, nbuckets);
rcu_assign_pointer(ca->bucket_gens, bucket_gens);
@@ -1354,7 +1389,7 @@ int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
ca->usage = alloc_percpu(struct bch_dev_usage_full);
if (!ca->usage)
- return -BCH_ERR_ENOMEM_usage_init;
+ return bch_err_throw(c, ENOMEM_usage_init);
return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
}
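Everywhere in this series, bare return -BCH_ERR_foo; sites become return bch_err_throw(c, foo);. The macro itself is not part of these hunks; the point of the conversion is that the error is recorded against the specific struct bch_fs at the site that generates it, before the negative code is returned. A purely illustrative sketch of that shape (the counter array and its name are assumptions, not the real definition):

/*
 * Hypothetical sketch only -- the real bch_err_throw() is defined
 * elsewhere in bcachefs and may differ.  The shape: bump a per-fs,
 * per-errcode counter at the throw site, then yield the -BCH_ERR_*
 * value the old code returned directly.
 */
#define bch_err_throw(_c, _err) ({					\
	struct bch_fs *_fs = (_c);					\
	atomic64_inc(&_fs->err_counts[BCH_ERR_##_err - BCH_ERR_START]);\
	-BCH_ERR_##_err;						\
})

This is also why several helpers later in the series grow access to a struct bch_fs (ec_stripe_buf_init(), bch2_set_bucket_needs_journal_commit(), bch2_invalidate_stripe_to_dev()): every throw site needs a filesystem instance in scope.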
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index af1532de4a37..49a3807a5eab 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -84,10 +84,8 @@ static inline int bucket_gen_get_rcu(struct bch_dev *ca, size_t b)
static inline int bucket_gen_get(struct bch_dev *ca, size_t b)
{
- rcu_read_lock();
- int ret = bucket_gen_get_rcu(ca, b);
- rcu_read_unlock();
- return ret;
+ guard(rcu)();
+ return bucket_gen_get_rcu(ca, b);
}
static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
@@ -156,10 +154,8 @@ static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_
*/
static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
{
- rcu_read_lock();
- int ret = dev_ptr_stale_rcu(ca, ptr);
- rcu_read_unlock();
- return ret;
+ guard(rcu)();
+ return dev_ptr_stale_rcu(ca, ptr);
}
/* Device usage: */
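Both buckets.h helpers switch from paired rcu_read_lock()/rcu_read_unlock() calls to guard(rcu)() from the kernel's scope-based cleanup machinery (linux/cleanup.h; the rcu guard is defined alongside the RCU API). The guard takes the read lock at its declaration and drops it automatically when the scope exits, which is what lets the temporary ret variable disappear. The pattern in isolation, with illustrative names (cfg, live_cfg, read_cfg_value):

#include <linux/errno.h>
#include <linux/rcupdate.h>	/* rcu guard, rcu_dereference() */

struct cfg { int value; };
static struct cfg __rcu *live_cfg;	/* stand-in RCU-published pointer */

static int read_cfg_value(void)
{
	guard(rcu)();		/* rcu_read_lock(); unlock runs on any return */
	struct cfg *c = rcu_dereference(live_cfg);

	return c ? c->value : -ENOENT;	/* still inside the read section */
}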
diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c
index c8a488e6b7b8..832eff93acb6 100644
--- a/fs/bcachefs/buckets_waiting_for_journal.c
+++ b/fs/bcachefs/buckets_waiting_for_journal.c
@@ -108,7 +108,8 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
realloc:
n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL);
if (!n) {
- ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set;
+ struct bch_fs *c = container_of(b, struct bch_fs, buckets_waiting_for_journal);
+ ret = bch_err_throw(c, ENOMEM_buckets_waiting_for_journal_set);
goto out;
}
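The interesting detail in this hunk is recovering the filesystem from an embedded member: the function only has a pointer to buckets_waiting_for_journal, so container_of() walks back from the member to the enclosing struct bch_fs, and the allocation failure can then be thrown against the right instance without changing the function's signature. The derivation in miniature (outer/inner are illustrative types):

#include <linux/container_of.h>

struct inner { int x; };
struct outer {
	long		a;
	struct inner	member;
};

static struct outer *outer_of(struct inner *p)
{
	/* Subtracts offsetof(struct outer, member) from p */
	return container_of(p, struct outer, member);
}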
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 4066946b26bc..5ea89aa2b0c4 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -319,6 +319,7 @@ static int bch2_data_thread(void *arg)
ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done;
ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done;
}
+ enumerated_ref_put(&ctx->c->writes, BCH_WRITE_REF_ioctl_data);
return 0;
}
@@ -378,15 +379,24 @@ static long bch2_ioctl_data(struct bch_fs *c,
struct bch_data_ctx *ctx;
int ret;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_ioctl_data))
+ return -EROFS;
- if (arg.op >= BCH_DATA_OP_NR || arg.flags)
- return -EINVAL;
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto put_ref;
+ }
+
+ if (arg.op >= BCH_DATA_OP_NR || arg.flags) {
+ ret = -EINVAL;
+ goto put_ref;
+ }
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx)
- return -ENOMEM;
+ if (!ctx) {
+ ret = -ENOMEM;
+ goto put_ref;
+ }
ctx->c = c;
ctx->arg = arg;
@@ -395,11 +405,16 @@ static long bch2_ioctl_data(struct bch_fs *c,
&bcachefs_data_ops,
bch2_data_thread);
if (ret < 0)
- kfree(ctx);
+ goto cleanup;
+ return ret;
+cleanup:
+ kfree(ctx);
+put_ref:
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_ioctl_data);
return ret;
}
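bch2_ioctl_data() now pins c->writes with enumerated_ref_tryget() before doing any validation and funnels every failure through labelled cleanup, so the ref is dropped exactly once; on success the ref is handed to the worker, which puts it from bch2_data_thread() (first hunk of this file). The acquire-then-unwind skeleton, with try_get()/put() as stand-ins for the bcachefs-internal enumerated_ref helpers:

#include <linux/errno.h>
#include <linux/slab.h>

static bool try_get(void);	/* stand-in: enumerated_ref_tryget() */
static void put(void);		/* stand-in: enumerated_ref_put() */

static long ioctl_skeleton(void)
{
	long ret;

	if (!try_get())
		return -EROFS;		/* fs already going read-only */

	void *ctx = kzalloc(64, GFP_KERNEL);	/* stand-in allocation */
	if (!ctx) {
		ret = -ENOMEM;
		goto put_ref;		/* every failure drops the ref once */
	}

	/* ... hand ctx (and ref ownership) off to a worker thread ... */
	return 0;
put_ref:
	put();
	return ret;
}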
-static long bch2_ioctl_fs_usage(struct bch_fs *c,
+static noinline_for_stack long bch2_ioctl_fs_usage(struct bch_fs *c,
struct bch_ioctl_fs_usage __user *user_arg)
{
struct bch_ioctl_fs_usage arg = {};
@@ -469,7 +484,7 @@ err:
}
/* obsolete, didn't allow for new data types: */
-static long bch2_ioctl_dev_usage(struct bch_fs *c,
+static noinline_for_stack long bch2_ioctl_dev_usage(struct bch_fs *c,
struct bch_ioctl_dev_usage __user *user_arg)
{
struct bch_ioctl_dev_usage arg;
@@ -613,15 +628,12 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
if (!dev)
return -EINVAL;
- rcu_read_lock();
+ guard(rcu)();
for_each_online_member_rcu(c, ca)
- if (ca->dev == dev) {
- rcu_read_unlock();
+ if (ca->dev == dev)
return ca->dev_idx;
- }
- rcu_read_unlock();
- return -BCH_ERR_ENOENT_dev_idx_not_found;
+ return bch_err_throw(c, ENOENT_dev_idx_not_found);
}
static long bch2_ioctl_disk_resize(struct bch_fs *c,
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index d3e2e4f776c6..a6795e73f0b9 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -173,7 +173,7 @@ int bch2_encrypt(struct bch_fs *c, unsigned type,
if (bch2_fs_inconsistent_on(!c->chacha20_key_set,
c, "attempting to encrypt without encryption key"))
- return -BCH_ERR_no_encryption_key;
+ return bch_err_throw(c, no_encryption_key);
bch2_chacha20(&c->chacha20_key, nonce, data, len);
return 0;
@@ -262,7 +262,7 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
if (bch2_fs_inconsistent_on(!c->chacha20_key_set,
c, "attempting to encrypt without encryption key"))
- return -BCH_ERR_no_encryption_key;
+ return bch_err_throw(c, no_encryption_key);
bch2_chacha20_init(&chacha_state, &c->chacha20_key, nonce);
@@ -375,7 +375,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
prt_str(&buf, ")");
WARN_RATELIMIT(1, "%s", buf.buf);
printbuf_exit(&buf);
- return -BCH_ERR_recompute_checksum;
+ return bch_err_throw(c, recompute_checksum);
}
for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
@@ -659,7 +659,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
crypt = bch2_sb_field_resize(&c->disk_sb, crypt,
sizeof(*crypt) / sizeof(u64));
if (!crypt) {
- ret = -BCH_ERR_ENOSPC_sb_crypt;
+ ret = bch_err_throw(c, ENOSPC_sb_crypt);
goto err;
}
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
index f57f9f4774e6..8e9264b5a84e 100644
--- a/fs/bcachefs/clock.c
+++ b/fs/bcachefs/clock.c
@@ -53,7 +53,6 @@ void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
struct io_clock_wait {
struct io_timer io_timer;
- struct timer_list cpu_timer;
struct task_struct *task;
int expired;
};
@@ -67,15 +66,6 @@ static void io_clock_wait_fn(struct io_timer *timer)
wake_up_process(wait->task);
}
-static void io_clock_cpu_timeout(struct timer_list *timer)
-{
- struct io_clock_wait *wait = container_of(timer,
- struct io_clock_wait, cpu_timer);
-
- wait->expired = 1;
- wake_up_process(wait->task);
-}
-
void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until)
{
struct io_clock_wait wait = {
@@ -90,8 +80,8 @@ void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until)
bch2_io_timer_del(clock, &wait.io_timer);
}
-void bch2_kthread_io_clock_wait(struct io_clock *clock,
- u64 io_until, unsigned long cpu_timeout)
+unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *clock,
+ u64 io_until, unsigned long cpu_timeout)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct io_clock_wait wait = {
@@ -103,27 +93,26 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
bch2_io_timer_add(clock, &wait.io_timer);
- timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0);
-
- if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
- mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
-
- do {
- set_current_state(TASK_INTERRUPTIBLE);
- if (kthread && kthread_should_stop())
- break;
-
- if (wait.expired)
- break;
-
- schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!(kthread && kthread_should_stop())) {
+ cpu_timeout = schedule_timeout(cpu_timeout);
try_to_freeze();
- } while (0);
+ }
__set_current_state(TASK_RUNNING);
- timer_delete_sync(&wait.cpu_timer);
- timer_destroy_on_stack(&wait.cpu_timer);
bch2_io_timer_del(clock, &wait.io_timer);
+ return cpu_timeout;
+}
+
+void bch2_kthread_io_clock_wait(struct io_clock *clock,
+ u64 io_until, unsigned long cpu_timeout)
+{
+ bool kthread = (current->flags & PF_KTHREAD) != 0;
+
+ while (!(kthread && kthread_should_stop()) &&
+ cpu_timeout &&
+ atomic64_read(&clock->now) < io_until)
+ cpu_timeout = bch2_kthread_io_clock_wait_once(clock, io_until, cpu_timeout);
}
static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now)
diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h
index 82c79c8baf92..8769be2aa21e 100644
--- a/fs/bcachefs/clock.h
+++ b/fs/bcachefs/clock.h
@@ -4,6 +4,7 @@
void bch2_io_timer_add(struct io_clock *, struct io_timer *);
void bch2_io_timer_del(struct io_clock *, struct io_timer *);
+unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *, u64, unsigned long);
void bch2_kthread_io_clock_wait(struct io_clock *, u64, unsigned long);
void __bch2_increment_clock(struct io_clock *, u64);
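The clock rewrite deletes the second, on-stack CPU timer entirely: schedule_timeout() already implements "sleep until woken or until N jiffies pass" and returns the jiffies left over, so bch2_kthread_io_clock_wait_once() performs one such sleep (woken early by the io_timer callback's wake_up_process()) and the new outer loop repeats it until the IO clock reaches io_until, the CPU budget is exhausted, or the kthread is told to stop. The primitive in isolation:

#include <linux/sched.h>

/* Sleeps up to 'timeout' jiffies in TASK_INTERRUPTIBLE; a
 * wake_up_process() from elsewhere ends the sleep early, and the
 * return value is how many jiffies were left (0 on a full timeout). */
static signed long sleep_once(signed long timeout)
{
	set_current_state(TASK_INTERRUPTIBLE);
	return schedule_timeout(timeout);
}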
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 1bca61d17092..b37b1f325f0a 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -187,7 +187,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
__bch2_compression_types[crc.compression_type]))
ret = bch2_check_set_has_compressed_data(c, opt);
else
- ret = -BCH_ERR_compression_workspace_not_initialized;
+ ret = bch_err_throw(c, compression_workspace_not_initialized);
if (ret)
goto err;
}
@@ -200,7 +200,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data,
src_len, dst_len, dst_len);
if (ret2 != dst_len)
- ret = -BCH_ERR_decompress_lz4;
+ ret = bch_err_throw(c, decompress_lz4);
break;
case BCH_COMPRESSION_TYPE_gzip: {
z_stream strm = {
@@ -219,7 +219,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
mempool_free(workspace, workspace_pool);
if (ret2 != Z_STREAM_END)
- ret = -BCH_ERR_decompress_gzip;
+ ret = bch_err_throw(c, decompress_gzip);
break;
}
case BCH_COMPRESSION_TYPE_zstd: {
@@ -227,7 +227,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
size_t real_src_len = le32_to_cpup(src_data.b);
if (real_src_len > src_len - 4) {
- ret = -BCH_ERR_decompress_zstd_src_len_bad;
+ ret = bch_err_throw(c, decompress_zstd_src_len_bad);
goto err;
}
@@ -241,7 +241,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
mempool_free(workspace, workspace_pool);
if (ret2 != dst_len)
- ret = -BCH_ERR_decompress_zstd;
+ ret = bch_err_throw(c, decompress_zstd);
break;
}
default:
@@ -270,7 +270,7 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op,
bch2_write_op_error(op, op->pos.offset,
"extent too big to decompress (%u > %u)",
crc->uncompressed_size << 9, c->opts.encoded_extent_max);
- return -BCH_ERR_decompress_exceeded_max_encoded_extent;
+ return bch_err_throw(c, decompress_exceeded_max_encoded_extent);
}
data = __bounce_alloc(c, dst_len, WRITE);
@@ -314,7 +314,7 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max ||
crc.compressed_size << 9 > c->opts.encoded_extent_max)
- return -BCH_ERR_decompress_exceeded_max_encoded_extent;
+ return bch_err_throw(c, decompress_exceeded_max_encoded_extent);
dst_data = dst_len == dst_iter.bi_size
? __bio_map_or_bounce(c, dst, dst_iter, WRITE)
@@ -656,12 +656,12 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
if (!mempool_initialized(&c->compression_bounce[READ]) &&
mempool_init_kvmalloc_pool(&c->compression_bounce[READ],
1, c->opts.encoded_extent_max))
- return -BCH_ERR_ENOMEM_compression_bounce_read_init;
+ return bch_err_throw(c, ENOMEM_compression_bounce_read_init);
if (!mempool_initialized(&c->compression_bounce[WRITE]) &&
mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE],
1, c->opts.encoded_extent_max))
- return -BCH_ERR_ENOMEM_compression_bounce_write_init;
+ return bch_err_throw(c, ENOMEM_compression_bounce_write_init);
for (i = compression_types;
i < compression_types + ARRAY_SIZE(compression_types);
@@ -675,7 +675,7 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
if (mempool_init_kvmalloc_pool(
&c->compress_workspace[i->type],
1, i->compress_workspace))
- return -BCH_ERR_ENOMEM_compression_workspace_init;
+ return bch_err_throw(c, ENOMEM_compression_workspace_init);
}
return 0;
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
index 50ec3decfe8c..4080ee99aadd 100644
--- a/fs/bcachefs/darray.h
+++ b/fs/bcachefs/darray.h
@@ -8,6 +8,7 @@
* Inspired by CCAN's darray
*/
+#include <linux/cleanup.h>
#include <linux/slab.h>
#define DARRAY_PREALLOCATED(_type, _nr) \
@@ -87,7 +88,23 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
#define darray_remove_item(_d, _pos) \
array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
-#define __darray_for_each(_d, _i) \
+#define darray_find_p(_d, _i, cond) \
+({ \
+ typeof((_d).data) _ret = NULL; \
+ \
+ darray_for_each(_d, _i) \
+ if (cond) { \
+ _ret = _i; \
+ break; \
+ } \
+ _ret; \
+})
+
+#define darray_find(_d, _item) darray_find_p(_d, _i, *_i == _item)
+
+/* Iteration: */
+
+#define __darray_for_each(_d, _i) \
for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++)
#define darray_for_each(_d, _i) \
@@ -96,6 +113,8 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
#define darray_for_each_reverse(_d, _i) \
for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i)
+/* Init/exit */
+
#define darray_init(_d) \
do { \
(_d)->nr = 0; \
@@ -111,4 +130,29 @@ do { \
darray_init(_d); \
} while (0)
+#define DEFINE_DARRAY_CLASS(_type) \
+DEFINE_CLASS(_type, _type, darray_exit(&(_T)), (_type) {}, void)
+
+#define DEFINE_DARRAY(_type) \
+typedef DARRAY(_type) darray_##_type; \
+DEFINE_DARRAY_CLASS(darray_##_type)
+
+#define DEFINE_DARRAY_NAMED(_name, _type) \
+typedef DARRAY(_type) _name; \
+DEFINE_DARRAY_CLASS(_name)
+
+DEFINE_DARRAY_CLASS(darray_char);
+DEFINE_DARRAY_CLASS(darray_str)
+DEFINE_DARRAY_CLASS(darray_const_str)
+
+DEFINE_DARRAY_CLASS(darray_u8)
+DEFINE_DARRAY_CLASS(darray_u16)
+DEFINE_DARRAY_CLASS(darray_u32)
+DEFINE_DARRAY_CLASS(darray_u64)
+
+DEFINE_DARRAY_CLASS(darray_s8)
+DEFINE_DARRAY_CLASS(darray_s16)
+DEFINE_DARRAY_CLASS(darray_s32)
+DEFINE_DARRAY_CLASS(darray_s64)
+
#endif /* _BCACHEFS_DARRAY_H */
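DEFINE_DARRAY_CLASS() plugs each darray typedef into DEFINE_CLASS() from linux/cleanup.h, so a CLASS(darray_u64, d)() declaration gets darray_exit() run automatically when d goes out of scope, and darray_find()/darray_find_p() add linear search over the same arrays. A usage sketch (has_dev is illustrative; darray_push() is the existing helper from this header, not shown in the hunk):

static bool has_dev(void)
{
	CLASS(darray_u64, devs)();	/* darray_exit() runs at every return */

	if (darray_push(&devs, 3) ?:
	    darray_push(&devs, 7))
		return false;		/* allocation failure */

	u64 *i = darray_find(devs, 7);	/* NULL when not present */
	return i != NULL;
}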
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index c34e5b88ba9d..e848e210a9bf 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -66,37 +66,46 @@ static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k)
}
}
-static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_s_c k)
+static noinline_for_stack
+bool __bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs,
+ const struct bch_extent_ptr *start)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ if (!ctxt) {
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (ptr == start)
+ break;
+
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+ bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
+ }
+ return false;
+ }
- bkey_for_each_ptr(ptrs, ptr) {
+ __bkey_for_each_ptr(start, ptrs.end, ptr) {
struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
- if (ctxt) {
- bool locked;
-
- move_ctxt_wait_event(ctxt,
- (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
- list_empty(&ctxt->ios));
+ bool locked;
+ move_ctxt_wait_event(ctxt,
+ (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
+ list_empty(&ctxt->ios));
+ if (!locked)
+ bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
+ }
+ return true;
+}
- if (!locked)
- bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
- } else {
- if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) {
- bkey_for_each_ptr(ptrs, ptr2) {
- if (ptr2 == ptr)
- break;
+static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs)
+{
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
- ca = bch2_dev_have_ref(c, ptr2->dev);
- bucket = PTR_BUCKET_POS(ca, ptr2);
- bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
- }
- return false;
- }
- }
+ if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0))
+ return __bkey_nocow_lock(c, ctxt, ptrs, ptr);
}
+
return true;
}
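The refactor keeps bkey_nocow_lock() as a pure trylock fast path and moves both slow paths into the noinline_for_stack helper: with a moving_context it waits and takes the lock for real, without one it unwinds everything taken so far -- which is what the start parameter records. The trylock-all-or-unwind shape in miniature:

#include <linux/spinlock.h>

/* Sketch only: take every lock or none, never block. */
static bool lock_all(spinlock_t *locks, unsigned nr)
{
	for (unsigned i = 0; i < nr; i++)
		if (!spin_trylock(&locks[i])) {
			while (i--)		/* drop what we already took */
				spin_unlock(&locks[i]);
			return false;
		}
	return true;
}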
@@ -240,13 +249,14 @@ static int data_update_invalid_bkey(struct data_update *m,
bch2_bkey_val_to_text(&buf, c, k);
prt_str(&buf, "\nnew: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+ prt_newline(&buf);
bch2_fs_emergency_read_only2(c, &buf);
bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
- return -BCH_ERR_invalid_bkey;
+ return bch_err_throw(c, invalid_bkey);
}
static int __bch2_data_update_index_update(struct btree_trans *trans,
@@ -367,21 +377,21 @@ restart_drop_conflicting_replicas:
bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i));
/* Now, drop excess replicas: */
- rcu_read_lock();
+ scoped_guard(rcu) {
restart_drop_extra_replicas:
- bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
- unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
+ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
+ unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
- if (!p.ptr.cached &&
- durability - ptr_durability >= m->op.opts.data_replicas) {
- durability -= ptr_durability;
+ if (!p.ptr.cached &&
+ durability - ptr_durability >= m->op.opts.data_replicas) {
+ durability -= ptr_durability;
- bch2_extent_ptr_set_cached(c, &m->op.opts,
- bkey_i_to_s(insert), &entry->ptr);
- goto restart_drop_extra_replicas;
+ bch2_extent_ptr_set_cached(c, &m->op.opts,
+ bkey_i_to_s(insert), &entry->ptr);
+ goto restart_drop_extra_replicas;
+ }
}
}
- rcu_read_unlock();
/* Finally, add the pointers we just wrote: */
extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
@@ -523,8 +533,9 @@ void bch2_data_update_exit(struct data_update *update)
bch2_bkey_buf_exit(&update->k, c);
}
-static int bch2_update_unwritten_extent(struct btree_trans *trans,
- struct data_update *update)
+static noinline_for_stack
+int bch2_update_unwritten_extent(struct btree_trans *trans,
+ struct data_update *update)
{
struct bch_fs *c = update->op.c;
struct bkey_i_extent *e;
@@ -716,18 +727,10 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
-int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
- struct bch_io_opts *io_opts)
+static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
+ struct bch_io_opts *io_opts,
+ unsigned buf_bytes)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- /* write path might have to decompress data: */
- unsigned buf_bytes = 0;
- bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry)
- buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
-
unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);
m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL);
@@ -751,11 +754,26 @@ int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
return 0;
}
+int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
+ struct bch_io_opts *io_opts)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ /* write path might have to decompress data: */
+ unsigned buf_bytes = 0;
+ bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry)
+ buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
+
+ return __bch2_data_update_bios_init(m, c, io_opts, buf_bytes);
+}
+
static int can_write_extent(struct bch_fs *c, struct data_update *m)
{
if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
- return -BCH_ERR_data_update_done_would_block;
+ return bch_err_throw(c, data_update_done_would_block);
unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
? m->op.target
@@ -765,7 +783,8 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
darray_for_each(m->op.devs_have, i)
__clear_bit(*i, devs.d);
- rcu_read_lock();
+ guard(rcu)();
+
unsigned nr_replicas = 0, i;
for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
struct bch_dev *ca = bch2_dev_rcu_noerror(c, i);
@@ -782,12 +801,11 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
if (nr_replicas >= m->op.nr_replicas)
break;
}
- rcu_read_unlock();
if (!nr_replicas)
- return -BCH_ERR_data_update_done_no_rw_devs;
+ return bch_err_throw(c, data_update_done_no_rw_devs);
if (nr_replicas < m->op.nr_replicas)
- return -BCH_ERR_insufficient_devices;
+ return bch_err_throw(c, insufficient_devices);
return 0;
}
@@ -802,19 +820,21 @@ int bch2_data_update_init(struct btree_trans *trans,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned reserve_sectors = k.k->size * data_opts.extra_replicas;
int ret = 0;
- /*
- * fs is corrupt we have a key for a snapshot node that doesn't exist,
- * and we have to check for this because we go rw before repairing the
- * snapshots table - just skip it, we can move it later.
- */
- if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot)))
- return -BCH_ERR_data_update_done_no_snapshot;
+ if (k.k->p.snapshot) {
+ ret = bch2_check_key_has_snapshot(trans, iter, k);
+ if (bch2_err_matches(ret, BCH_ERR_recovery_will_run)) {
+ /* Can't repair yet, waiting on other recovery passes */
+ return bch_err_throw(c, data_update_done_no_snapshot);
+ }
+ if (ret < 0)
+ return ret;
+ if (ret) /* key was deleted */
+ return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ bch_err_throw(c, data_update_done_no_snapshot);
+ ret = 0;
+ }
bch2_bkey_buf_init(&m->k);
bch2_bkey_buf_reassemble(&m->k, c, k);
@@ -842,10 +862,17 @@ int bch2_data_update_init(struct btree_trans *trans,
unsigned durability_have = 0, durability_removing = 0;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned reserve_sectors = k.k->size * data_opts.extra_replicas;
+ unsigned buf_bytes = 0;
+ bool unwritten = false;
+
unsigned ptr_bit = 1;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
if (!p.ptr.cached) {
- rcu_read_lock();
+ guard(rcu)();
if (ptr_bit & m->data_opts.rewrite_ptrs) {
if (crc_is_compressed(p.crc))
reserve_sectors += k.k->size;
@@ -856,7 +883,6 @@ int bch2_data_update_init(struct btree_trans *trans,
bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
durability_have += bch2_extent_ptr_durability(c, &p);
}
- rcu_read_unlock();
}
/*
@@ -872,6 +898,9 @@ int bch2_data_update_init(struct btree_trans *trans,
if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
m->op.incompressible = true;
+ buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
+ unwritten |= p.ptr.unwritten;
+
ptr_bit <<= 1;
}
@@ -910,7 +939,7 @@ int bch2_data_update_init(struct btree_trans *trans,
if (iter)
ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts);
if (!ret)
- ret = -BCH_ERR_data_update_done_no_writes_needed;
+ ret = bch_err_throw(c, data_update_done_no_writes_needed);
goto out_bkey_buf_exit;
}
@@ -941,23 +970,25 @@ int bch2_data_update_init(struct btree_trans *trans,
}
if (!bkey_get_dev_refs(c, k)) {
- ret = -BCH_ERR_data_update_done_no_dev_refs;
+ ret = bch_err_throw(c, data_update_done_no_dev_refs);
goto out_put_disk_res;
}
if (c->opts.nocow_enabled &&
- !bkey_nocow_lock(c, ctxt, k)) {
- ret = -BCH_ERR_nocow_lock_blocked;
+ !bkey_nocow_lock(c, ctxt, ptrs)) {
+ ret = bch_err_throw(c, nocow_lock_blocked);
goto out_put_dev_refs;
}
- if (bkey_extent_is_unwritten(k)) {
+ if (unwritten) {
ret = bch2_update_unwritten_extent(trans, m) ?:
- -BCH_ERR_data_update_done_unwritten;
+ bch_err_throw(c, data_update_done_unwritten);
goto out_nocow_unlock;
}
- ret = bch2_data_update_bios_init(m, c, io_opts);
+ bch2_trans_unlock(trans);
+
+ ret = __bch2_data_update_bios_init(m, c, io_opts, buf_bytes);
if (ret)
goto out_nocow_unlock;
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 4fa70634c90e..07c2a0f73cc2 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -153,8 +153,6 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
c->verify_data = __bch2_btree_node_mem_alloc(c);
if (!c->verify_data)
goto out;
-
- list_del_init(&c->verify_data->list);
}
BUG_ON(b->nsets != 1);
@@ -492,6 +490,8 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *
prt_printf(out, "journal pin %px:\t%llu\n",
&b->writes[1].journal, b->writes[1].journal.seq);
+ prt_printf(out, "ob:\t%u\n", b->ob.nr);
+
printbuf_indent_sub(out, 2);
}
@@ -508,27 +508,27 @@ static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf,
i->ret = 0;
do {
- struct bucket_table *tbl;
- struct rhash_head *pos;
- struct btree *b;
-
ret = bch2_debugfs_flush_buf(i);
if (ret)
return ret;
- rcu_read_lock();
i->buf.atomic++;
- tbl = rht_dereference_rcu(c->btree_cache.table.tbl,
- &c->btree_cache.table);
- if (i->iter < tbl->size) {
- rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash)
- bch2_cached_btree_node_to_text(&i->buf, c, b);
- i->iter++;
- } else {
- done = true;
+ scoped_guard(rcu) {
+ struct bucket_table *tbl =
+ rht_dereference_rcu(c->btree_cache.table.tbl,
+ &c->btree_cache.table);
+ if (i->iter < tbl->size) {
+ struct rhash_head *pos;
+ struct btree *b;
+
+ rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash)
+ bch2_cached_btree_node_to_text(&i->buf, c, b);
+ i->iter++;
+ } else {
+ done = true;
+ }
}
--i->buf.atomic;
- rcu_read_unlock();
} while (!done);
if (i->buf.allocation_failure)
@@ -584,6 +584,8 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
i->ubuf = buf;
i->size = size;
i->ret = 0;
+
+ int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
restart:
seqmutex_lock(&c->btree_trans_lock);
list_sort(&c->btree_trans_list, list_ptr_order_cmp);
@@ -597,6 +599,11 @@ restart:
if (!closure_get_not_zero(&trans->ref))
continue;
+ if (!trans->srcu_held) {
+ closure_put(&trans->ref);
+ continue;
+ }
+
u32 seq = seqmutex_unlock(&c->btree_trans_lock);
bch2_btree_trans_to_text(&i->buf, trans);
@@ -618,6 +625,8 @@ restart:
}
seqmutex_unlock(&c->btree_trans_lock);
unlocked:
+ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+
if (i->buf.allocation_failure)
ret = -ENOMEM;
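The transaction dump now holds the btree_trans_barrier SRCU read lock across the walk and skips transactions that don't hold it themselves. As with all SRCU usage, the index handed out by srcu_read_lock() must be passed back to the matching srcu_read_unlock(). The basic shape:

#include <linux/srcu.h>

DEFINE_SRCU(my_srcu);			/* illustrative srcu_struct */

static void reader(void)
{
	int idx = srcu_read_lock(&my_srcu);
	/* ... access SRCU-protected state ... */
	srcu_read_unlock(&my_srcu, idx);	/* same idx handed back */
}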
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index d198001838f3..28875c5c86ad 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -13,12 +13,15 @@
#include <linux/dcache.h>
+#ifdef CONFIG_UNICODE
int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info,
const struct qstr *str, struct qstr *out_cf)
{
*out_cf = (struct qstr) QSTR_INIT(NULL, 0);
-#ifdef CONFIG_UNICODE
+ if (!bch2_fs_casefold_enabled(trans->c))
+ return -EOPNOTSUPP;
+
unsigned char *buf = bch2_trans_kmalloc(trans, BCH_NAME_MAX + 1);
int ret = PTR_ERR_OR_ZERO(buf);
if (ret)
@@ -30,10 +33,8 @@ int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info,
*out_cf = (struct qstr) QSTR_INIT(buf, ret);
return 0;
-#else
- return -EOPNOTSUPP;
-#endif
}
+#endif
static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
{
@@ -231,70 +232,66 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type));
}
-static struct bkey_i_dirent *dirent_alloc_key(struct btree_trans *trans,
- subvol_inum dir,
- u8 type,
- int name_len, int cf_name_len,
- u64 dst)
+int bch2_dirent_init_name(struct bch_fs *c,
+ struct bkey_i_dirent *dirent,
+ const struct bch_hash_info *hash_info,
+ const struct qstr *name,
+ const struct qstr *cf_name)
{
- struct bkey_i_dirent *dirent;
- unsigned u64s = BKEY_U64s + dirent_val_u64s(name_len, cf_name_len);
+ EBUG_ON(hash_info->cf_encoding == NULL && cf_name);
+ int cf_len = 0;
- BUG_ON(u64s > U8_MAX);
-
- dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
- if (IS_ERR(dirent))
- return dirent;
+ if (name->len > BCH_NAME_MAX)
+ return -ENAMETOOLONG;
- bkey_dirent_init(&dirent->k_i);
- dirent->k.u64s = u64s;
+ dirent->v.d_casefold = hash_info->cf_encoding != NULL;
- if (type != DT_SUBVOL) {
- dirent->v.d_inum = cpu_to_le64(dst);
+ if (!dirent->v.d_casefold) {
+ memcpy(&dirent->v.d_name[0], name->name, name->len);
+ memset(&dirent->v.d_name[name->len], 0,
+ bkey_val_bytes(&dirent->k) -
+ offsetof(struct bch_dirent, d_name) -
+ name->len);
} else {
- dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol);
- dirent->v.d_child_subvol = cpu_to_le32(dst);
- }
+ if (!bch2_fs_casefold_enabled(c))
+ return -EOPNOTSUPP;
- dirent->v.d_type = type;
- dirent->v.d_unused = 0;
- dirent->v.d_casefold = cf_name_len ? 1 : 0;
+#ifdef CONFIG_UNICODE
+ memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len);
- return dirent;
-}
+ char *cf_out = &dirent->v.d_cf_name_block.d_names[name->len];
-static void dirent_init_regular_name(struct bkey_i_dirent *dirent,
- const struct qstr *name)
-{
- EBUG_ON(dirent->v.d_casefold);
+ if (cf_name) {
+ cf_len = cf_name->len;
- memcpy(&dirent->v.d_name[0], name->name, name->len);
- memset(&dirent->v.d_name[name->len], 0,
- bkey_val_bytes(&dirent->k) -
- offsetof(struct bch_dirent, d_name) -
- name->len);
-}
+ memcpy(cf_out, cf_name->name, cf_name->len);
+ } else {
+ cf_len = utf8_casefold(hash_info->cf_encoding, name,
+ cf_out,
+ bkey_val_end(bkey_i_to_s(&dirent->k_i)) - (void *) cf_out);
+ if (cf_len <= 0)
+ return cf_len;
+ }
-static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent,
- const struct qstr *name,
- const struct qstr *cf_name)
-{
- EBUG_ON(!dirent->v.d_casefold);
- EBUG_ON(!cf_name->len);
-
- dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len);
- dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_name->len);
- memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len);
- memcpy(&dirent->v.d_cf_name_block.d_names[name->len], cf_name->name, cf_name->len);
- memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_name->len], 0,
- bkey_val_bytes(&dirent->k) -
- offsetof(struct bch_dirent, d_cf_name_block.d_names) -
- name->len + cf_name->len);
-
- EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_name->len);
+ memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_len], 0,
+ bkey_val_bytes(&dirent->k) -
+ offsetof(struct bch_dirent, d_cf_name_block.d_names) -
+ name->len + cf_len);
+
+ dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len);
+ dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_len);
+
+ EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_len);
+#endif
+ }
+
+ unsigned u64s = dirent_val_u64s(name->len, cf_len);
+ BUG_ON(u64s > bkey_val_u64s(&dirent->k));
+ set_bkey_val_u64s(&dirent->k, u64s);
+ return 0;
}
-static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
+struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *trans,
const struct bch_hash_info *hash_info,
subvol_inum dir,
u8 type,
@@ -302,31 +299,28 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
const struct qstr *cf_name,
u64 dst)
{
- struct bkey_i_dirent *dirent;
- struct qstr _cf_name;
-
- if (name->len > BCH_NAME_MAX)
- return ERR_PTR(-ENAMETOOLONG);
+ struct bkey_i_dirent *dirent = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64));
+ if (IS_ERR(dirent))
+ return dirent;
- if (hash_info->cf_encoding && !cf_name) {
- int ret = bch2_casefold(trans, hash_info, name, &_cf_name);
- if (ret)
- return ERR_PTR(ret);
+ bkey_dirent_init(&dirent->k_i);
+ dirent->k.u64s = BKEY_U64s_MAX;
- cf_name = &_cf_name;
+ if (type != DT_SUBVOL) {
+ dirent->v.d_inum = cpu_to_le64(dst);
+ } else {
+ dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol);
+ dirent->v.d_child_subvol = cpu_to_le32(dst);
}
- dirent = dirent_alloc_key(trans, dir, type, name->len, cf_name ? cf_name->len : 0, dst);
- if (IS_ERR(dirent))
- return dirent;
+ dirent->v.d_type = type;
+ dirent->v.d_unused = 0;
- if (cf_name)
- dirent_init_casefolded_name(dirent, name, cf_name);
- else
- dirent_init_regular_name(dirent, name);
+ int ret = bch2_dirent_init_name(trans->c, dirent, hash_info, name, cf_name);
+ if (ret)
+ return ERR_PTR(ret);
EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len);
-
return dirent;
}
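bch2_dirent_create_key() now allocates a maximum-size key (BKEY_U64s_MAX) up front and lets bch2_dirent_init_name() shrink it with set_bkey_val_u64s() once the casefolded length is actually known, instead of pre-computing the casefold in a separate pass just to size the key. The casefolding itself is the generic unicode layer's utf8_casefold(); its contract, stated as a thin wrapper (casefold_len is illustrative):

#include <linux/unicode.h>

/* Writes the casefolded form of 'name' into 'buf' and returns its
 * length, or <= 0 on error; 'um' comes from utf8_load(). */
static int casefold_len(const struct unicode_map *um,
			const struct qstr *name,
			unsigned char *buf, size_t buflen)
{
	return utf8_casefold(um, name, buf, buflen);
}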
@@ -341,7 +335,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
struct bkey_i_dirent *dirent;
int ret;
- dirent = dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum);
+ dirent = bch2_dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum);
ret = PTR_ERR_OR_ZERO(dirent);
if (ret)
return ret;
@@ -365,7 +359,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
struct bkey_i_dirent *dirent;
int ret;
- dirent = dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum);
+ dirent = bch2_dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum);
ret = PTR_ERR_OR_ZERO(dirent);
if (ret)
return ret;
@@ -402,8 +396,8 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
}
int bch2_dirent_rename(struct btree_trans *trans,
- subvol_inum src_dir, struct bch_hash_info *src_hash, u64 *src_dir_i_size,
- subvol_inum dst_dir, struct bch_hash_info *dst_hash, u64 *dst_dir_i_size,
+ subvol_inum src_dir, struct bch_hash_info *src_hash,
+ subvol_inum dst_dir, struct bch_hash_info *dst_hash,
const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset,
const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
enum bch_rename_mode mode)
@@ -470,8 +464,8 @@ int bch2_dirent_rename(struct btree_trans *trans,
*src_offset = dst_iter.pos.offset;
/* Create new dst key: */
- new_dst = dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name,
- dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0);
+ new_dst = bch2_dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name,
+ dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0);
ret = PTR_ERR_OR_ZERO(new_dst);
if (ret)
goto out;
@@ -481,8 +475,8 @@ int bch2_dirent_rename(struct btree_trans *trans,
/* Create new src key: */
if (mode == BCH_RENAME_EXCHANGE) {
- new_src = dirent_create_key(trans, src_hash, src_dir, 0, src_name,
- src_hash->cf_encoding ? &src_name_lookup : NULL, 0);
+ new_src = bch2_dirent_create_key(trans, src_hash, src_dir, 0, src_name,
+ src_hash->cf_encoding ? &src_name_lookup : NULL, 0);
ret = PTR_ERR_OR_ZERO(new_src);
if (ret)
goto out;
@@ -542,14 +536,6 @@ int bch2_dirent_rename(struct btree_trans *trans,
new_src->v.d_type == DT_SUBVOL)
new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol);
- if (old_dst.k)
- *dst_dir_i_size -= bkey_bytes(old_dst.k);
- *src_dir_i_size -= bkey_bytes(old_src.k);
-
- if (mode == BCH_RENAME_EXCHANGE)
- *src_dir_i_size += bkey_bytes(&new_src->k);
- *dst_dir_i_size += bkey_bytes(&new_dst->k);
-
ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
if (ret)
goto out;
@@ -656,7 +642,7 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol)
continue;
- ret = -BCH_ERR_ENOTEMPTY_dir_not_empty;
+ ret = bch_err_throw(trans->c, ENOTEMPTY_dir_not_empty);
break;
}
bch2_trans_iter_exit(trans, &iter);
@@ -692,7 +678,9 @@ static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subv
return !ret;
}
-int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
+int bch2_readdir(struct bch_fs *c, subvol_inum inum,
+ struct bch_hash_info *hash_info,
+ struct dir_context *ctx)
{
struct bkey_buf sk;
bch2_bkey_buf_init(&sk);
@@ -710,7 +698,11 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k);
subvol_inum target;
- int ret2 = bch2_dirent_read_target(trans, inum, dirent, &target);
+
+ bool need_second_pass = false;
+ int ret2 = bch2_str_hash_check_key(trans, NULL, &bch2_dirent_hash_desc,
+ hash_info, &iter, k, &need_second_pass) ?:
+ bch2_dirent_read_target(trans, inum, dirent, &target);
if (ret2 > 0)
continue;
@@ -740,7 +732,7 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
ret = bch2_inode_unpack(k, inode);
goto found;
}
- ret = -BCH_ERR_ENOENT_inode;
+ ret = bch_err_throw(trans->c, ENOENT_inode);
found:
bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
bch2_trans_iter_exit(trans, &iter);
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index d3e7ae669575..0417608c18d5 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -23,8 +23,16 @@ struct bch_fs;
struct bch_hash_info;
struct bch_inode_info;
+#ifdef CONFIG_UNICODE
int bch2_casefold(struct btree_trans *, const struct bch_hash_info *,
const struct qstr *, struct qstr *);
+#else
+static inline int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info,
+ const struct qstr *str, struct qstr *out_cf)
+{
+ return -EOPNOTSUPP;
+}
+#endif
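Moving the CONFIG_UNICODE check from the function body out to the declaration site is the standard kernel config-stub idiom: the real prototype lives under the ifdef, a static inline stub returning -EOPNOTSUPP under the else, so callers compile identically either way and the compiled-out branch folds to a constant. The generic shape (CONFIG_FOO/do_foo are illustrative):

#ifdef CONFIG_FOO
int do_foo(int arg);			/* real implementation in foo.c */
#else
static inline int do_foo(int arg)
{
	return -EOPNOTSUPP;		/* feature compiled out */
}
#endif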
static inline int bch2_maybe_casefold(struct btree_trans *trans,
const struct bch_hash_info *info,
@@ -38,7 +46,7 @@ static inline int bch2_maybe_casefold(struct btree_trans *trans,
}
}
-struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d);
+struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent);
static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len)
{
@@ -59,6 +67,15 @@ static inline void dirent_copy_target(struct bkey_i_dirent *dst,
dst->v.d_type = src.v->d_type;
}
+int bch2_dirent_init_name(struct bch_fs *,
+ struct bkey_i_dirent *,
+ const struct bch_hash_info *,
+ const struct qstr *,
+ const struct qstr *);
+struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *,
+ const struct bch_hash_info *, subvol_inum, u8,
+ const struct qstr *, const struct qstr *, u64);
+
int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
const struct bch_hash_info *, u8,
const struct qstr *, u64, u64 *,
@@ -80,8 +97,8 @@ enum bch_rename_mode {
};
int bch2_dirent_rename(struct btree_trans *,
- subvol_inum, struct bch_hash_info *, u64 *,
- subvol_inum, struct bch_hash_info *, u64 *,
+ subvol_inum, struct bch_hash_info *,
+ subvol_inum, struct bch_hash_info *,
const struct qstr *, subvol_inum *, u64 *,
const struct qstr *, subvol_inum *, u64 *,
enum bch_rename_mode);
@@ -95,7 +112,7 @@ u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32);
int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
-int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
+int bch2_readdir(struct bch_fs *, subvol_inum, struct bch_hash_info *, struct dir_context *);
int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos);
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index b3840ff7c407..f7528cd69c73 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -390,7 +390,7 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun
err:
free_percpu(n.v[1]);
free_percpu(n.v[0]);
- return -BCH_ERR_ENOMEM_disk_accounting;
+ return bch_err_throw(c, ENOMEM_disk_accounting);
}
int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
@@ -401,7 +401,7 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
if (mode != BCH_ACCOUNTING_read &&
accounting_to_replicas(&r.e, a.k->p) &&
!bch2_replicas_marked_locked(c, &r.e))
- return -BCH_ERR_btree_insert_need_mark_replicas;
+ return bch_err_throw(c, btree_insert_need_mark_replicas);
percpu_up_read(&c->mark_lock);
percpu_down_write(&c->mark_lock);
@@ -419,7 +419,7 @@ int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounti
if (mode != BCH_ACCOUNTING_read &&
accounting_to_replicas(&r.e, a.k->p) &&
!bch2_replicas_marked_locked(c, &r.e))
- return -BCH_ERR_btree_insert_need_mark_replicas;
+ return bch_err_throw(c, btree_insert_need_mark_replicas);
return __bch2_accounting_mem_insert(c, a);
}
@@ -559,7 +559,7 @@ int bch2_gc_accounting_start(struct bch_fs *c)
sizeof(u64), GFP_KERNEL);
if (!e->v[1]) {
bch2_accounting_free_counters(acc, true);
- ret = -BCH_ERR_ENOMEM_disk_accounting;
+ ret = bch_err_throw(c, ENOMEM_disk_accounting);
break;
}
}
@@ -618,7 +618,9 @@ int bch2_gc_accounting_done(struct bch_fs *c)
for (unsigned j = 0; j < nr; j++)
src_v[j] -= dst_v[j];
- if (fsck_err(trans, accounting_mismatch, "%s", buf.buf)) {
+ bch2_trans_unlock_long(trans);
+
+ if (fsck_err(c, accounting_mismatch, "%s", buf.buf)) {
percpu_up_write(&c->mark_lock);
ret = commit_do(trans, NULL, NULL, 0,
bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false));
@@ -737,7 +739,7 @@ invalid_device:
bch2_disk_accounting_mod(trans, acc, v, nr, false)) ?:
-BCH_ERR_remove_disk_accounting_entry;
} else {
- ret = -BCH_ERR_remove_disk_accounting_entry;
+ ret = bch_err_throw(c, remove_disk_accounting_entry);
}
goto fsck_err;
}
@@ -897,8 +899,8 @@ int bch2_accounting_read(struct bch_fs *c)
case BCH_DISK_ACCOUNTING_replicas:
fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]);
break;
- case BCH_DISK_ACCOUNTING_dev_data_type:
- rcu_read_lock();
+ case BCH_DISK_ACCOUNTING_dev_data_type: {
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev);
if (ca) {
struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type];
@@ -910,9 +912,9 @@ int bch2_accounting_read(struct bch_fs *c)
k.dev_data_type.data_type == BCH_DATA_journal)
usage->hidden += v[0] * ca->mi.bucket_size;
}
- rcu_read_unlock();
break;
}
+ }
}
preempt_enable();
fsck_err:
@@ -1006,19 +1008,18 @@ void bch2_verify_accounting_clean(struct bch_fs *c)
case BCH_DISK_ACCOUNTING_replicas:
fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]);
break;
- case BCH_DISK_ACCOUNTING_dev_data_type: {
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
- if (!ca) {
- rcu_read_unlock();
- continue;
+ case BCH_DISK_ACCOUNTING_dev_data_type:
+ {
+ guard(rcu)(); /* scoped guard is a loop, and doesn't play nicely with continue */
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
+ if (!ca)
+ continue;
+
+ v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets);
+ v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors);
+ v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented);
}
- v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets);
- v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors);
- v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented);
- rcu_read_unlock();
-
if (memcmp(a.v->d, v, 3 * sizeof(u64))) {
struct printbuf buf = PRINTBUF;
@@ -1032,7 +1033,6 @@ void bch2_verify_accounting_clean(struct bch_fs *c)
mismatch = true;
}
}
- }
0;
})));
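The in-line comment on the dev_data_type case deserves unpacking: scoped_guard() expands to a for statement whose body is the guarded region, so a continue inside it merely terminates the guard's own loop rather than continuing the enclosing one; a plain block with guard(rcu)() keeps continue meaning what it says. Roughly (simplified -- the exact expansion in linux/cleanup.h varies by kernel version):

/* Why scoped_guard() swallows 'continue': it is itself a for-loop. */
#define scoped_guard(_name, args...)					\
	for (CLASS(_name, scope)(args), *done = NULL; !done; done = (void *)1)

/*
 * for (...; i++) {
 *	scoped_guard(rcu) {
 *		if (skip)
 *			continue;	// exits the guard's for-loop,
 *					// NOT the outer iteration
 *	}
 * }
 */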
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h
index f6098e33ab30..d61abebf3e0b 100644
--- a/fs/bcachefs/disk_accounting.h
+++ b/fs/bcachefs/disk_accounting.h
@@ -174,17 +174,17 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
case BCH_DISK_ACCOUNTING_replicas:
fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]);
break;
- case BCH_DISK_ACCOUNTING_dev_data_type:
- rcu_read_lock();
+ case BCH_DISK_ACCOUNTING_dev_data_type: {
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
if (ca) {
this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]);
this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]);
this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]);
}
- rcu_read_unlock();
break;
}
+ }
}
unsigned idx;
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
index c20ecf5e5381..cde842ac1886 100644
--- a/fs/bcachefs/disk_groups.c
+++ b/fs/bcachefs/disk_groups.c
@@ -130,7 +130,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL);
if (!cpu_g)
- return -BCH_ERR_ENOMEM_disk_groups_to_cpu;
+ return bch_err_throw(c, ENOMEM_disk_groups_to_cpu);
cpu_g->nr = nr_groups;
@@ -170,36 +170,28 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
{
struct target t = target_decode(target);
- struct bch_devs_mask *devs;
- rcu_read_lock();
+ guard(rcu)();
switch (t.type) {
case TARGET_NULL:
- devs = NULL;
- break;
+ return NULL;
case TARGET_DEV: {
struct bch_dev *ca = t.dev < c->sb.nr_devices
? rcu_dereference(c->devs[t.dev])
: NULL;
- devs = ca ? &ca->self : NULL;
- break;
+ return ca ? &ca->self : NULL;
}
case TARGET_GROUP: {
struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
- devs = g && t.group < g->nr && !g->entries[t.group].deleted
+ return g && t.group < g->nr && !g->entries[t.group].deleted
? &g->entries[t.group].devs
: NULL;
- break;
}
default:
BUG();
}
-
- rcu_read_unlock();
-
- return devs;
}
bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
@@ -384,7 +376,7 @@ void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
bch2_printbuf_make_room(out, 4096);
out->atomic++;
- rcu_read_lock();
+ guard(rcu)();
struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
for (unsigned i = 0; i < (g ? g->nr : 0); i++) {
@@ -405,16 +397,14 @@ next:
prt_newline(out);
}
- rcu_read_unlock();
out->atomic--;
}
void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
{
out->atomic++;
- rcu_read_lock();
+ guard(rcu)();
__bch2_disk_path_to_text(out, rcu_dereference(c->disk_groups), v),
- rcu_read_unlock();
--out->atomic;
}
@@ -535,13 +525,11 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
switch (t.type) {
case TARGET_NULL:
prt_printf(out, "none");
- break;
+ return;
case TARGET_DEV: {
- struct bch_dev *ca;
-
out->atomic++;
- rcu_read_lock();
- ca = t.dev < c->sb.nr_devices
+ guard(rcu)();
+ struct bch_dev *ca = t.dev < c->sb.nr_devices
? rcu_dereference(c->devs[t.dev])
: NULL;
@@ -552,13 +540,12 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
else
prt_printf(out, "invalid device %u", t.dev);
- rcu_read_unlock();
out->atomic--;
- break;
+ return;
}
case TARGET_GROUP:
bch2_disk_path_to_text(out, c, t.group);
- break;
+ return;
default:
BUG();
}
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index c581426e3894..543dbba9b14f 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -213,7 +213,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
a->dirty_sectors,
a->stripe, s.k->p.offset,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -BCH_ERR_mark_stripe;
+ ret = bch_err_throw(c, mark_stripe);
goto err;
}
@@ -224,7 +224,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
a->dirty_sectors,
a->cached_sectors,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -BCH_ERR_mark_stripe;
+ ret = bch_err_throw(c, mark_stripe);
goto err;
}
} else {
@@ -234,7 +234,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
bucket.inode, bucket.offset, a->gen,
a->stripe,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -BCH_ERR_mark_stripe;
+ ret = bch_err_throw(c, mark_stripe);
goto err;
}
@@ -244,7 +244,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
bch2_data_type_str(a->data_type),
bch2_data_type_str(data_type),
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -BCH_ERR_mark_stripe;
+ ret = bch_err_throw(c, mark_stripe);
goto err;
}
@@ -256,7 +256,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
a->dirty_sectors,
a->cached_sectors,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -BCH_ERR_mark_stripe;
+ ret = bch_err_throw(c, mark_stripe);
goto err;
}
}
@@ -295,7 +295,7 @@ static int mark_stripe_bucket(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
if (unlikely(!ca)) {
if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite))
- ret = -BCH_ERR_mark_stripe;
+ ret = bch_err_throw(c, mark_stripe);
goto err;
}
@@ -325,7 +325,7 @@ static int mark_stripe_bucket(struct btree_trans *trans,
if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n%s",
ptr->dev,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -BCH_ERR_mark_stripe;
+ ret = bch_err_throw(c, mark_stripe);
goto err;
}
@@ -428,7 +428,7 @@ int bch2_trigger_stripe(struct btree_trans *trans,
gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
if (!gc) {
bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx);
- return -BCH_ERR_ENOMEM_mark_stripe;
+ return bch_err_throw(c, ENOMEM_mark_stripe);
}
/*
@@ -536,7 +536,8 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
}
/* XXX: this is a non-mempoolified memory allocation: */
-static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
+static int ec_stripe_buf_init(struct bch_fs *c,
+ struct ec_stripe_buf *buf,
unsigned offset, unsigned size)
{
struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
@@ -564,7 +565,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
return 0;
err:
ec_stripe_buf_exit(buf);
- return -BCH_ERR_ENOMEM_stripe_buf;
+ return bch_err_throw(c, ENOMEM_stripe_buf);
}
/* Checksumming: */
@@ -840,7 +841,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
buf = kzalloc(sizeof(*buf), GFP_NOFS);
if (!buf)
- return -BCH_ERR_ENOMEM_ec_read_extent;
+ return bch_err_throw(c, ENOMEM_ec_read_extent);
ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
if (ret) {
@@ -861,7 +862,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
goto err;
}
- ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
+ ret = ec_stripe_buf_init(c, buf, offset, bio_sectors(&rbio->bio));
if (ret) {
msg = "-ENOMEM";
goto err;
@@ -894,7 +895,7 @@ err:
bch_err_ratelimited(c,
"error doing reconstruct read: %s\n %s", msg, msgbuf.buf);
printbuf_exit(&msgbuf);
- ret = -BCH_ERR_stripe_reconstruct;
+ ret = bch_err_throw(c, stripe_reconstruct);
goto out;
}
@@ -904,7 +905,7 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
{
if (c->gc_pos.phase != GC_PHASE_not_running &&
!genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
- return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
+ return bch_err_throw(c, ENOMEM_ec_stripe_mem_alloc);
return 0;
}
@@ -1129,7 +1130,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
bch2_fs_inconsistent(c, "%s", buf.buf);
printbuf_exit(&buf);
- return -BCH_ERR_erasure_coding_found_btree_node;
+ return bch_err_throw(c, erasure_coding_found_btree_node);
}
k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed);
@@ -1195,7 +1196,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
if (!ca)
- return -BCH_ERR_ENOENT_dev_not_found;
+ return bch_err_throw(c, ENOENT_dev_not_found);
struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
@@ -1256,7 +1257,7 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE,
BCH_DEV_WRITE_REF_ec_bucket_zero);
if (!ca) {
- s->err = -BCH_ERR_erofs_no_writes;
+ s->err = bch_err_throw(c, erofs_no_writes);
return;
}
@@ -1320,7 +1321,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (ec_do_recov(c, &s->existing_stripe)) {
bch_err(c, "error creating stripe: error reading existing stripe");
- ret = -BCH_ERR_ec_block_read;
+ ret = bch_err_throw(c, ec_block_read);
goto err;
}
@@ -1346,7 +1347,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (ec_nr_failed(&s->new_stripe)) {
bch_err(c, "error creating stripe: error writing redundancy buckets");
- ret = -BCH_ERR_ec_block_write;
+ ret = bch_err_throw(c, ec_block_write);
goto err;
}
@@ -1578,26 +1579,26 @@ static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_str
static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h)
{
struct bch_devs_mask devs = h->devs;
+ unsigned nr_devs, nr_devs_with_durability;
- rcu_read_lock();
- h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label
- ? group_to_target(h->disk_label - 1)
- : 0);
- unsigned nr_devs = dev_mask_nr(&h->devs);
+ scoped_guard(rcu) {
+ h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label
+ ? group_to_target(h->disk_label - 1)
+ : 0);
+ nr_devs = dev_mask_nr(&h->devs);
- for_each_member_device_rcu(c, ca, &h->devs)
- if (!ca->mi.durability)
- __clear_bit(ca->dev_idx, h->devs.d);
- unsigned nr_devs_with_durability = dev_mask_nr(&h->devs);
+ for_each_member_device_rcu(c, ca, &h->devs)
+ if (!ca->mi.durability)
+ __clear_bit(ca->dev_idx, h->devs.d);
+ nr_devs_with_durability = dev_mask_nr(&h->devs);
- h->blocksize = pick_blocksize(c, &h->devs);
+ h->blocksize = pick_blocksize(c, &h->devs);
- h->nr_active_devs = 0;
- for_each_member_device_rcu(c, ca, &h->devs)
- if (ca->mi.bucket_size == h->blocksize)
- h->nr_active_devs++;
-
- rcu_read_unlock();
+ h->nr_active_devs = 0;
+ for_each_member_device_rcu(c, ca, &h->devs)
+ if (ca->mi.bucket_size == h->blocksize)
+ h->nr_active_devs++;
+ }
/*
* If we only have redundancy + 1 devices, we're better off with just
@@ -1865,7 +1866,7 @@ static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new
s->nr_data = existing_v->nr_blocks -
existing_v->nr_redundant;
- int ret = ec_stripe_buf_init(&s->existing_stripe, 0, le16_to_cpu(existing_v->sectors));
+ int ret = ec_stripe_buf_init(c, &s->existing_stripe, 0, le16_to_cpu(existing_v->sectors));
if (ret) {
bch2_stripe_close(c, s);
return ret;
@@ -1925,7 +1926,7 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri
}
bch2_trans_iter_exit(trans, &lru_iter);
if (!ret)
- ret = -BCH_ERR_stripe_alloc_blocked;
+ ret = bch_err_throw(c, stripe_alloc_blocked);
if (ret == 1)
ret = 0;
if (ret)
@@ -1966,7 +1967,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st
continue;
}
- ret = -BCH_ERR_ENOSPC_stripe_create;
+ ret = bch_err_throw(c, ENOSPC_stripe_create);
break;
}
@@ -2024,7 +2025,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
if (!h->s) {
h->s = ec_new_stripe_alloc(c, h);
if (!h->s) {
- ret = -BCH_ERR_ENOMEM_ec_new_stripe_alloc;
+ ret = bch_err_throw(c, ENOMEM_ec_new_stripe_alloc);
bch_err(c, "failed to allocate new stripe");
goto err;
}
@@ -2089,7 +2090,7 @@ alloc_existing:
goto err;
allocate_buf:
- ret = ec_stripe_buf_init(&s->new_stripe, 0, h->blocksize);
+ ret = ec_stripe_buf_init(c, &s->new_stripe, 0, h->blocksize);
if (ret)
goto err;
@@ -2115,6 +2116,7 @@ int bch2_invalidate_stripe_to_dev(struct btree_trans *trans,
if (k.k->type != KEY_TYPE_stripe)
return 0;
+ struct bch_fs *c = trans->c;
struct bkey_i_stripe *s =
bch2_bkey_make_mut_typed(trans, iter, &k, 0, stripe);
int ret = PTR_ERR_OR_ZERO(s);
@@ -2141,23 +2143,22 @@ int bch2_invalidate_stripe_to_dev(struct btree_trans *trans,
unsigned nr_good = 0;
- rcu_read_lock();
- bkey_for_each_ptr(ptrs, ptr) {
- if (ptr->dev == dev_idx)
- ptr->dev = BCH_SB_MEMBER_INVALID;
+ scoped_guard(rcu)
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (ptr->dev == dev_idx)
+ ptr->dev = BCH_SB_MEMBER_INVALID;
- struct bch_dev *ca = bch2_dev_rcu(trans->c, ptr->dev);
- nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed;
- }
- rcu_read_unlock();
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed;
+ }
if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED))
- return -BCH_ERR_remove_would_lose_data;
+ return bch_err_throw(c, remove_would_lose_data);
unsigned nr_data = s->v.nr_blocks - s->v.nr_redundant;
if (nr_good < nr_data && !(flags & BCH_FORCE_IF_DATA_LOST))
- return -BCH_ERR_remove_would_lose_data;
+ return bch_err_throw(c, remove_would_lose_data);
sectors = -sectors;
@@ -2178,14 +2179,15 @@ static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, s
return 0;
if (a->stripe_sectors) {
- bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data");
- return -BCH_ERR_invalidate_stripe_to_dev;
+ struct bch_fs *c = trans->c;
+ bch_err(c, "trying to invalidate device in stripe when bucket has stripe data");
+ return bch_err_throw(c, invalidate_stripe_to_dev);
}
struct btree_iter iter;
struct bkey_s_c_stripe s =
bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
- BTREE_ITER_slots, stripe);
+ BTREE_ITER_slots, stripe);
int ret = bkey_err(s);
if (ret)
return ret;
diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c
index 43557bebd0f8..c39cf304c681 100644
--- a/fs/bcachefs/errcode.c
+++ b/fs/bcachefs/errcode.c
@@ -13,12 +13,13 @@ static const char * const bch2_errcode_strs[] = {
NULL
};
-static unsigned bch2_errcode_parents[] = {
+static const unsigned bch2_errcode_parents[] = {
#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class,
BCH_ERRCODES()
#undef x
};
+__attribute__((const))
const char *bch2_err_str(int err)
{
const char *errstr;
@@ -36,6 +37,7 @@ const char *bch2_err_str(int err)
return errstr ?: "(Invalid error)";
}
+__attribute__((const))
bool __bch2_err_matches(int err, int class)
{
err = abs(err);
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 62843e772b2c..acc3b7b67704 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -137,7 +137,6 @@
x(BCH_ERR_transaction_restart, transaction_restart_relock) \
x(BCH_ERR_transaction_restart, transaction_restart_relock_path) \
x(BCH_ERR_transaction_restart, transaction_restart_relock_path_intent) \
- x(BCH_ERR_transaction_restart, transaction_restart_relock_after_fill) \
x(BCH_ERR_transaction_restart, transaction_restart_too_many_iters) \
x(BCH_ERR_transaction_restart, transaction_restart_lock_node_reused) \
x(BCH_ERR_transaction_restart, transaction_restart_fill_relock) \
@@ -148,11 +147,8 @@
x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\
x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\
x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \
- x(BCH_ERR_transaction_restart, transaction_restart_key_cache_upgrade) \
x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \
x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \
- x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\
- x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \
x(BCH_ERR_transaction_restart, transaction_restart_split_race) \
x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \
x(BCH_ERR_transaction_restart, transaction_restart_nested) \
@@ -182,9 +178,12 @@
x(BCH_ERR_fsck, fsck_errors_not_fixed) \
x(BCH_ERR_fsck, fsck_repair_unimplemented) \
x(BCH_ERR_fsck, fsck_repair_impossible) \
- x(EINVAL, restart_recovery) \
- x(EINVAL, cannot_rewind_recovery) \
+ x(EINVAL, recovery_will_run) \
+ x(BCH_ERR_recovery_will_run, restart_recovery) \
+ x(BCH_ERR_recovery_will_run, cannot_rewind_recovery) \
+ x(BCH_ERR_recovery_will_run, recovery_pass_will_run) \
x(0, data_update_done) \
+ x(0, bkey_was_deleted) \
x(BCH_ERR_data_update_done, data_update_done_would_block) \
x(BCH_ERR_data_update_done, data_update_done_unwritten) \
x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \
@@ -211,6 +210,8 @@
x(EINVAL, remove_would_lose_data) \
x(EINVAL, no_resize_with_buckets_nouse) \
x(EINVAL, inode_unpack_error) \
+ x(EINVAL, inode_not_unlinked) \
+ x(EINVAL, inode_has_child_snapshot) \
x(EINVAL, varint_decode_error) \
x(EINVAL, erasure_coding_found_btree_node) \
x(EINVAL, option_negative) \
@@ -236,7 +237,6 @@
x(BCH_ERR_journal_res_blocked, journal_buf_enomem) \
x(BCH_ERR_journal_res_blocked, journal_stuck) \
x(BCH_ERR_journal_res_blocked, journal_retry_open) \
- x(BCH_ERR_journal_res_blocked, journal_preres_get_blocked) \
x(BCH_ERR_journal_res_blocked, bucket_alloc_blocked) \
x(BCH_ERR_journal_res_blocked, stripe_alloc_blocked) \
x(BCH_ERR_invalid, invalid_sb) \
@@ -282,7 +282,6 @@
x(EIO, sb_not_downgraded) \
x(EIO, btree_node_write_all_failed) \
x(EIO, btree_node_read_error) \
- x(EIO, btree_node_read_validate_error) \
x(EIO, btree_need_topology_repair) \
x(EIO, bucket_ref_update) \
x(EIO, trigger_alloc) \
@@ -357,9 +356,11 @@ enum bch_errcode {
BCH_ERR_MAX
};
-const char *bch2_err_str(int);
-bool __bch2_err_matches(int, int);
+__attribute__((const)) const char *bch2_err_str(int);
+__attribute__((const)) bool __bch2_err_matches(int, int);
+
+__attribute__((const))
static inline bool _bch2_err_matches(int err, int class)
{
return err < 0 && __bch2_err_matches(err, class);
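[editor's note] The table-generated bch2_errcode_parents[] array (made const above) gives every private errcode a parent class; matching a whole error family is then a walk up that chain. A reconstruction of the walk, consistent with the declarations above but hedged as a sketch:

	/* sketch: climb err's parent chain until it reaches @class or
	 * leaves the private errcode range entirely */
	bool __bch2_err_matches(int err, int class)
	{
		err   = abs(err);
		class = abs(class);

		while (err >= BCH_ERR_START && err != class)
			err = bch2_errcode_parents[err - BCH_ERR_START];

		return err == class;
	}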
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index c2cad28635bf..267e73d9d7e6 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -69,7 +69,7 @@ static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *tra
if (trans)
bch2_trans_updates_to_text(&buf, trans);
bool ret = __bch2_inconsistent_error(c, &buf);
- bch2_print_str_nonblocking(c, KERN_ERR, buf.buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
return ret;
@@ -100,10 +100,10 @@ int __bch2_topology_error(struct bch_fs *c, struct printbuf *out)
set_bit(BCH_FS_topology_error, &c->flags);
if (!test_bit(BCH_FS_in_recovery, &c->flags)) {
__bch2_inconsistent_error(c, out);
- return -BCH_ERR_btree_need_topology_repair;
+ return bch_err_throw(c, btree_need_topology_repair);
} else {
return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?:
- -BCH_ERR_btree_node_read_validate_error;
+ bch_err_throw(c, btree_need_topology_repair);
}
}
@@ -403,23 +403,23 @@ int bch2_fsck_err_opt(struct bch_fs *c,
if (test_bit(BCH_FS_in_fsck, &c->flags)) {
if (!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE)))
- return -BCH_ERR_fsck_repair_unimplemented;
+ return bch_err_throw(c, fsck_repair_unimplemented);
switch (c->opts.fix_errors) {
case FSCK_FIX_exit:
- return -BCH_ERR_fsck_errors_not_fixed;
+ return bch_err_throw(c, fsck_errors_not_fixed);
case FSCK_FIX_yes:
if (flags & FSCK_CAN_FIX)
- return -BCH_ERR_fsck_fix;
+ return bch_err_throw(c, fsck_fix);
fallthrough;
case FSCK_FIX_no:
if (flags & FSCK_CAN_IGNORE)
- return -BCH_ERR_fsck_ignore;
- return -BCH_ERR_fsck_errors_not_fixed;
+ return bch_err_throw(c, fsck_ignore);
+ return bch_err_throw(c, fsck_errors_not_fixed);
case FSCK_FIX_ask:
if (flags & FSCK_AUTOFIX)
- return -BCH_ERR_fsck_fix;
- return -BCH_ERR_fsck_ask;
+ return bch_err_throw(c, fsck_fix);
+ return bch_err_throw(c, fsck_ask);
default:
BUG();
}
@@ -427,12 +427,12 @@ int bch2_fsck_err_opt(struct bch_fs *c,
if ((flags & FSCK_AUTOFIX) &&
(c->opts.errors == BCH_ON_ERROR_continue ||
c->opts.errors == BCH_ON_ERROR_fix_safe))
- return -BCH_ERR_fsck_fix;
+ return bch_err_throw(c, fsck_fix);
if (c->opts.errors == BCH_ON_ERROR_continue &&
(flags & FSCK_CAN_IGNORE))
- return -BCH_ERR_fsck_ignore;
- return -BCH_ERR_fsck_errors_not_fixed;
+ return bch_err_throw(c, fsck_ignore);
+ return bch_err_throw(c, fsck_errors_not_fixed);
}
}
@@ -444,7 +444,7 @@ int __bch2_fsck_err(struct bch_fs *c,
{
va_list args;
struct printbuf buf = PRINTBUF, *out = &buf;
- int ret = -BCH_ERR_fsck_ignore;
+ int ret = 0;
const char *action_orig = "fix?", *action = action_orig;
might_sleep();
@@ -474,8 +474,8 @@ int __bch2_fsck_err(struct bch_fs *c,
if (test_bit(err, c->sb.errors_silent))
return flags & FSCK_CAN_FIX
- ? -BCH_ERR_fsck_fix
- : -BCH_ERR_fsck_ignore;
+ ? bch_err_throw(c, fsck_fix)
+ : bch_err_throw(c, fsck_ignore);
printbuf_indent_add_nextline(out, 2);
@@ -517,10 +517,10 @@ int __bch2_fsck_err(struct bch_fs *c,
prt_str(out, ", ");
if (flags & FSCK_CAN_FIX) {
prt_actioning(out, action);
- ret = -BCH_ERR_fsck_fix;
+ ret = bch_err_throw(c, fsck_fix);
} else {
prt_str(out, ", continuing");
- ret = -BCH_ERR_fsck_ignore;
+ ret = bch_err_throw(c, fsck_ignore);
}
goto print;
@@ -532,18 +532,18 @@ int __bch2_fsck_err(struct bch_fs *c,
"run fsck, and forward to devs so error can be marked for self-healing");
inconsistent = true;
print = true;
- ret = -BCH_ERR_fsck_errors_not_fixed;
+ ret = bch_err_throw(c, fsck_errors_not_fixed);
} else if (flags & FSCK_CAN_FIX) {
prt_str(out, ", ");
prt_actioning(out, action);
- ret = -BCH_ERR_fsck_fix;
+ ret = bch_err_throw(c, fsck_fix);
} else {
prt_str(out, ", continuing");
- ret = -BCH_ERR_fsck_ignore;
+ ret = bch_err_throw(c, fsck_ignore);
}
} else if (c->opts.fix_errors == FSCK_FIX_exit) {
prt_str(out, ", exiting");
- ret = -BCH_ERR_fsck_errors_not_fixed;
+ ret = bch_err_throw(c, fsck_errors_not_fixed);
} else if (flags & FSCK_CAN_FIX) {
int fix = s && s->fix
? s->fix
@@ -562,30 +562,37 @@ int __bch2_fsck_err(struct bch_fs *c,
: FSCK_FIX_yes;
ret = ret & 1
- ? -BCH_ERR_fsck_fix
- : -BCH_ERR_fsck_ignore;
+ ? bch_err_throw(c, fsck_fix)
+ : bch_err_throw(c, fsck_ignore);
} else if (fix == FSCK_FIX_yes ||
(c->opts.nochanges &&
!(flags & FSCK_CAN_IGNORE))) {
prt_str(out, ", ");
prt_actioning(out, action);
- ret = -BCH_ERR_fsck_fix;
+ ret = bch_err_throw(c, fsck_fix);
} else {
prt_str(out, ", not ");
prt_actioning(out, action);
+ ret = bch_err_throw(c, fsck_ignore);
+ }
+ } else {
+ if (flags & FSCK_CAN_IGNORE) {
+ prt_str(out, ", continuing");
+ ret = bch_err_throw(c, fsck_ignore);
+ } else {
+ prt_str(out, " (repair unimplemented)");
+ ret = bch_err_throw(c, fsck_repair_unimplemented);
}
- } else if (!(flags & FSCK_CAN_IGNORE)) {
- prt_str(out, " (repair unimplemented)");
}
- if (ret == -BCH_ERR_fsck_ignore &&
+ if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) &&
(c->opts.fix_errors == FSCK_FIX_exit ||
!(flags & FSCK_CAN_IGNORE)))
- ret = -BCH_ERR_fsck_errors_not_fixed;
+ ret = bch_err_throw(c, fsck_errors_not_fixed);
if (test_bit(BCH_FS_in_fsck, &c->flags) &&
- (ret != -BCH_ERR_fsck_fix &&
- ret != -BCH_ERR_fsck_ignore)) {
+ (!bch2_err_matches(ret, BCH_ERR_fsck_fix) &&
+ !bch2_err_matches(ret, BCH_ERR_fsck_ignore))) {
exiting = true;
print = true;
}
@@ -614,25 +621,32 @@ print:
if (s)
s->ret = ret;
+ if (trans &&
+ !(flags & FSCK_ERR_NO_LOG) &&
+ ret == -BCH_ERR_fsck_fix)
+ ret = bch2_trans_log_str(trans, bch2_sb_error_strs[err]) ?: ret;
+err_unlock:
+ mutex_unlock(&c->fsck_error_msgs_lock);
+err:
/*
* We don't yet track whether the filesystem currently has errors, for
* log_fsck_err()s: that would require us to track for every error type
* which recovery pass corrects it, to get the fsck exit status correct:
*/
- if (flags & FSCK_CAN_FIX) {
- if (ret == -BCH_ERR_fsck_fix) {
- set_bit(BCH_FS_errors_fixed, &c->flags);
- } else {
- set_bit(BCH_FS_errors_not_fixed, &c->flags);
- set_bit(BCH_FS_error, &c->flags);
- }
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ /* nothing */
+ } else if (bch2_err_matches(ret, BCH_ERR_fsck_fix)) {
+ set_bit(BCH_FS_errors_fixed, &c->flags);
+ } else {
+ set_bit(BCH_FS_errors_not_fixed, &c->flags);
+ set_bit(BCH_FS_error, &c->flags);
}
-err_unlock:
- mutex_unlock(&c->fsck_error_msgs_lock);
-err:
+
if (action != action_orig)
kfree(action);
printbuf_exit(&buf);
+
+ BUG_ON(!ret);
return ret;
}
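[editor's note] With the BUG_ON(!ret) added above, __bch2_fsck_err() now guarantees a nonzero return that is one of: a fsck_fix subclass, a fsck_ignore subclass, a transaction restart, or a hard error. Caller side, via the __fsck_err()/fsck_err_wrap() macros shown in the error.h hunk below (argument lists abbreviated, so treat this as a hedged sketch):

	if (__fsck_err(/* trans/c, flags, err type, message... */)) {
		/* returned fsck_fix: apply the repair and keep going */
	}
	/* returned fsck_ignore: condition is false, keep scanning */
	/* anything else: fsck_err_wrap() stored it in ret and jumped to
	 * the enclosing function's fsck_err label */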
@@ -650,12 +664,12 @@ int __bch2_bkey_fsck_err(struct bch_fs *c,
const char *fmt, ...)
{
if (from.flags & BCH_VALIDATE_silent)
- return -BCH_ERR_fsck_delete_bkey;
+ return bch_err_throw(c, fsck_delete_bkey);
unsigned fsck_flags = 0;
if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) {
if (test_bit(err, c->sb.errors_silent))
- return -BCH_ERR_fsck_delete_bkey;
+ return bch_err_throw(c, fsck_delete_bkey);
fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX;
}
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 5123d4c86770..0c3c3a24fc6f 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -105,13 +105,13 @@ void bch2_free_fsck_errs(struct bch_fs *);
#define fsck_err_wrap(_do) \
({ \
int _ret = _do; \
- if (_ret != -BCH_ERR_fsck_fix && \
- _ret != -BCH_ERR_fsck_ignore) { \
+ if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \
+ !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) { \
ret = _ret; \
goto fsck_err; \
} \
\
- _ret == -BCH_ERR_fsck_fix; \
+ bch2_err_matches(_ret, BCH_ERR_fsck_fix); \
})
#define __fsck_err(...) fsck_err_wrap(bch2_fsck_err(__VA_ARGS__))
@@ -170,10 +170,10 @@ do { \
int _ret = __bch2_bkey_fsck_err(c, k, from, \
BCH_FSCK_ERR_##_err_type, \
_err_msg, ##__VA_ARGS__); \
- if (_ret != -BCH_ERR_fsck_fix && \
- _ret != -BCH_ERR_fsck_ignore) \
+ if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \
+ !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) \
ret = _ret; \
- ret = -BCH_ERR_fsck_delete_bkey; \
+ ret = bch_err_throw(c, fsck_delete_bkey); \
goto fsck_err; \
} while (0)
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
index b899ee75f5b9..e76e58a568bf 100644
--- a/fs/bcachefs/extent_update.c
+++ b/fs/bcachefs/extent_update.c
@@ -139,6 +139,17 @@ int bch2_extent_trim_atomic(struct btree_trans *trans,
if (ret)
return ret;
- bch2_cut_back(end, k);
+	/* emit a tracepoint only when we actually trim the extent */
+
+ if (bpos_lt(end, k->k.p)) {
+ if (trace_extent_trim_atomic_enabled()) {
+ CLASS(printbuf, buf)();
+ bch2_bpos_to_text(&buf, end);
+ prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k));
+ trace_extent_trim_atomic(trans->c, buf.buf);
+ }
+ bch2_cut_back(end, k);
+ }
return 0;
}
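[editor's note] The new trim path uses a standard tracing idiom: trace_<event>_enabled() is a static-key test, so the printbuf formatting is only paid for when the tracepoint is live, and CLASS(printbuf, buf)() declares a printbuf that cleans itself up at end of scope. Schematically, with a hypothetical event name:

	if (trace_my_event_enabled()) {	/* static branch, ~free when off */
		CLASS(printbuf, buf)();	/* auto printbuf_exit() at scope end */
		prt_printf(&buf, "...");
		trace_my_event(c, buf.buf);
	}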
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 1ac9897f189d..83cbd77dcb9c 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -50,34 +50,34 @@ void bch2_io_failures_to_text(struct printbuf *out,
struct bch_io_failures *failed)
{
static const char * const error_types[] = {
- "io", "checksum", "ec reconstruct", NULL
+ "btree validate", "io", "checksum", "ec reconstruct", NULL
};
for (struct bch_dev_io_failures *f = failed->devs;
f < failed->devs + failed->nr;
f++) {
unsigned errflags =
- ((!!f->failed_io) << 0) |
- ((!!f->failed_csum_nr) << 1) |
- ((!!f->failed_ec) << 2);
-
- if (!errflags)
- continue;
+ ((!!f->failed_btree_validate) << 0) |
+ ((!!f->failed_io) << 1) |
+ ((!!f->failed_csum_nr) << 2) |
+ ((!!f->failed_ec) << 3);
bch2_printbuf_make_room(out, 1024);
- rcu_read_lock();
out->atomic++;
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev);
- if (ca)
- prt_str(out, ca->name);
- else
- prt_printf(out, "(invalid device %u)", f->dev);
+ scoped_guard(rcu) {
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev);
+ if (ca)
+ prt_str(out, ca->name);
+ else
+ prt_printf(out, "(invalid device %u)", f->dev);
+ }
--out->atomic;
- rcu_read_unlock();
prt_char(out, ' ');
- if (is_power_of_2(errflags)) {
+ if (!errflags) {
+ prt_str(out, "no error - confused");
+ } else if (is_power_of_2(errflags)) {
prt_bitflags(out, error_types, errflags);
prt_str(out, " error");
} else {
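[editor's note] The failure counts are folded into one bitmask so that is_power_of_2() can cheaply distinguish "exactly one error type" (singular wording, as in the branch above) from several (a flag list), with zero now reported as a confused state rather than silently skipped. The encoding, restated:

	/* one bit per failure type; !! collapses counts to 0 or 1 */
	unsigned errflags = ((!!f->failed_btree_validate) << 0) |
			    ((!!f->failed_io)             << 1) |
			    ((!!f->failed_csum_nr)        << 2) |
			    ((!!f->failed_ec)             << 3);

	if (errflags && is_power_of_2(errflags))
		/* exactly one error type is set */;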
@@ -193,7 +193,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
bool have_dirty_ptrs = false, have_pick = false;
if (k.k->type == KEY_TYPE_error)
- return -BCH_ERR_key_type_error;
+ return bch_err_throw(c, key_type_error);
rcu_read_lock();
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -286,17 +286,17 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
if (!have_dirty_ptrs)
return 0;
if (have_missing_devs)
- return -BCH_ERR_no_device_to_read_from;
+ return bch_err_throw(c, no_device_to_read_from);
if (have_csum_errors)
- return -BCH_ERR_data_read_csum_err;
+ return bch_err_throw(c, data_read_csum_err);
if (have_io_errors)
- return -BCH_ERR_data_read_io_err;
+ return bch_err_throw(c, data_read_io_err);
/*
* If we get here, we have pointers (bkey_ptrs_validate() ensures that),
* but they don't point to valid devices:
*/
- return -BCH_ERR_no_devices_valid;
+ return bch_err_throw(c, no_devices_valid);
}
/* KEY_TYPE_btree_ptr: */
@@ -407,6 +407,8 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
lp.crc = bch2_extent_crc_unpack(l.k, NULL);
rp.crc = bch2_extent_crc_unpack(r.k, NULL);
+ guard(rcu)();
+
while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) &&
__bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) {
if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size !=
@@ -418,10 +420,8 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
return false;
/* Extents may not straddle buckets: */
- rcu_read_lock();
struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev);
bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr);
- rcu_read_unlock();
if (!same_bucket)
return false;
@@ -838,11 +838,9 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
struct extent_ptr_decoded p;
unsigned durability = 0;
- rcu_read_lock();
+ guard(rcu)();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
durability += bch2_extent_ptr_durability(c, &p);
- rcu_read_unlock();
-
return durability;
}
@@ -853,12 +851,10 @@ static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
struct extent_ptr_decoded p;
unsigned durability = 0;
- rcu_read_lock();
+ guard(rcu)();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
durability += bch2_extent_ptr_durability(c, &p);
- rcu_read_unlock();
-
return durability;
}
@@ -1015,20 +1011,16 @@ bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct bch_dev *ca;
- bool ret = false;
- rcu_read_lock();
+ guard(rcu)();
bkey_for_each_ptr(ptrs, ptr)
if (bch2_dev_in_target(c, ptr->dev, target) &&
(ca = bch2_dev_rcu(c, ptr->dev)) &&
(!ptr->cached ||
- !dev_ptr_stale_rcu(ca, ptr))) {
- ret = true;
- break;
- }
- rcu_read_unlock();
+ !dev_ptr_stale_rcu(ca, ptr)))
+ return true;
- return ret;
+ return false;
}
bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
@@ -1142,7 +1134,7 @@ void bch2_extent_ptr_set_cached(struct bch_fs *c,
bool have_cached_ptr;
unsigned drop_dev = ptr->dev;
- rcu_read_lock();
+ guard(rcu)();
restart_drop_ptrs:
ptrs = bch2_bkey_ptrs(k);
have_cached_ptr = false;
@@ -1175,10 +1167,8 @@ restart_drop_ptrs:
goto drop;
ptr->cached = true;
- rcu_read_unlock();
return;
drop:
- rcu_read_unlock();
bch2_bkey_drop_ptr_noerror(k, ptr);
}
@@ -1194,12 +1184,11 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
{
struct bch_dev *ca;
- rcu_read_lock();
+ guard(rcu)();
bch2_bkey_drop_ptrs(k, ptr,
ptr->cached &&
(!(ca = bch2_dev_rcu(c, ptr->dev)) ||
dev_ptr_stale_rcu(ca, ptr) > 0));
- rcu_read_unlock();
return bkey_deleted(k.k);
}
@@ -1217,7 +1206,7 @@ bool bch2_extent_normalize_by_opts(struct bch_fs *c,
struct bkey_ptrs ptrs;
bool have_cached_ptr;
- rcu_read_lock();
+ guard(rcu)();
restart_drop_ptrs:
ptrs = bch2_bkey_ptrs(k);
have_cached_ptr = false;
@@ -1230,7 +1219,6 @@ restart_drop_ptrs:
}
have_cached_ptr = true;
}
- rcu_read_unlock();
return bkey_deleted(k.k);
}
@@ -1238,7 +1226,7 @@ restart_drop_ptrs:
void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
{
out->atomic++;
- rcu_read_lock();
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
if (!ca) {
prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
@@ -1262,7 +1250,6 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc
else if (stale)
prt_printf(out, " invalid");
}
- rcu_read_unlock();
--out->atomic;
}
@@ -1528,7 +1515,7 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
prt_printf(err, "invalid compression opt %u:%u",
opt.type, opt.level);
- return -BCH_ERR_invalid_bkey;
+ return bch_err_throw(c, invalid_bkey);
}
#endif
break;
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index e3a75dcca60c..1c54b9b5bd69 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -394,17 +394,9 @@ struct bch_writepage_state {
struct bch_io_opts opts;
struct bch_folio_sector *tmp;
unsigned tmp_sectors;
+ struct blk_plug plug;
};
-static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
- struct bch_inode_info *inode)
-{
- struct bch_writepage_state ret = { 0 };
-
- bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode);
- return ret;
-}
-
/*
* Determine when a writepage io is full. We have to limit writepage bios to a
* single page per bvec (i.e. 1MB with 4k pages) because that is the limit to
@@ -666,23 +658,23 @@ do_io:
int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct bch_fs *c = mapping->host->i_sb->s_fs_info;
- struct bch_writepage_state w =
- bch_writepage_state_init(c, to_bch_ei(mapping->host));
- struct blk_plug plug;
- int ret;
+ struct bch_writepage_state *w = kzalloc(sizeof(*w), GFP_NOFS|__GFP_NOFAIL);
- blk_start_plug(&plug);
- ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
- if (w.io)
- bch2_writepage_do_io(&w);
- blk_finish_plug(&plug);
- kfree(w.tmp);
+ bch2_inode_opts_get(&w->opts, c, &to_bch_ei(mapping->host)->ei_inode);
+
+ blk_start_plug(&w->plug);
+ int ret = write_cache_pages(mapping, wbc, __bch2_writepage, w);
+ if (w->io)
+ bch2_writepage_do_io(w);
+ blk_finish_plug(&w->plug);
+ kfree(w->tmp);
+ kfree(w);
return bch2_err_class(ret);
}
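[editor's note] With struct blk_plug now embedded in it, the writepage state moves off the stack into a GFP_NOFS|__GFP_NOFAIL allocation: NOFS so reclaim can't recurse into the filesystem, NOFAIL because writeback here has no sane failure path. The plug itself is the usual block-layer batching idiom:

	struct blk_plug plug;

	blk_start_plug(&plug);	/* queue bios locally instead of dispatching */
	/* ... submit writeback bios ... */
	blk_finish_plug(&plug);	/* flush the whole batch to the block layer */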
/* buffered writes: */
-int bch2_write_begin(struct file *file, struct address_space *mapping,
+int bch2_write_begin(const struct kiocb *iocb, struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
@@ -765,7 +757,7 @@ err_unlock:
return bch2_err_class(ret);
}
-int bch2_write_end(struct file *file, struct address_space *mapping,
+int bch2_write_end(const struct kiocb *iocb, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
{
diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h
index 3207ebbb4ab4..14de91c27656 100644
--- a/fs/bcachefs/fs-io-buffered.h
+++ b/fs/bcachefs/fs-io-buffered.h
@@ -10,9 +10,9 @@ int bch2_read_folio(struct file *, struct folio *);
int bch2_writepages(struct address_space *, struct writeback_control *);
void bch2_readahead(struct readahead_control *);
-int bch2_write_begin(struct file *, struct address_space *, loff_t pos,
+int bch2_write_begin(const struct kiocb *, struct address_space *, loff_t pos,
unsigned len, struct folio **, void **);
-int bch2_write_end(struct file *, struct address_space *, loff_t,
+int bch2_write_end(const struct kiocb *, struct address_space *, loff_t,
unsigned len, unsigned copied, struct folio *, void *);
ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
index fbae9c1de746..c2cc405822f2 100644
--- a/fs/bcachefs/fs-io-pagecache.c
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -447,7 +447,7 @@ static int __bch2_folio_reservation_get(struct bch_fs *c,
if (!reserved) {
bch2_disk_reservation_put(c, &disk_res);
- return -BCH_ERR_ENOSPC_disk_reservation;
+ return bch_err_throw(c, ENOSPC_disk_reservation);
}
break;
}
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index b1e9ee28fc0f..a233f45875e9 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -71,12 +71,12 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
- rcu_read_lock();
- ca = rcu_dereference(c->devs[dev]);
- if (ca && !enumerated_ref_tryget(&ca->io_ref[WRITE],
- BCH_DEV_WRITE_REF_nocow_flush))
- ca = NULL;
- rcu_read_unlock();
+ scoped_guard(rcu) {
+ ca = rcu_dereference(c->devs[dev]);
+ if (ca && !enumerated_ref_tryget(&ca->io_ref[WRITE],
+ BCH_DEV_WRITE_REF_nocow_flush))
+ ca = NULL;
+ }
if (!ca)
continue;
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 05361a793206..4e72e654da96 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -268,13 +268,13 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
}
if (dst_dentry->d_inode) {
- error = -BCH_ERR_EEXIST_subvolume_create;
+ error = bch_err_throw(c, EEXIST_subvolume_create);
goto err3;
}
dir = dst_path.dentry->d_inode;
if (IS_DEADDIR(dir)) {
- error = -BCH_ERR_ENOENT_directory_dead;
+ error = bch_err_throw(c, ENOENT_directory_dead);
goto err3;
}
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index ddfe89d84966..687af0eea0c2 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -124,8 +124,9 @@ retry:
goto err;
struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u);
+ bool rebalance_changed = memcmp(&old_r, &new_r, sizeof(new_r));
- if (memcmp(&old_r, &new_r, sizeof(new_r))) {
+ if (rebalance_changed) {
ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum);
if (ret)
goto err;
@@ -146,6 +147,9 @@ err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
+ if (rebalance_changed)
+ bch2_rebalance_wakeup(c);
+
bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
"%s: inode %llu:%llu not found when updating",
bch2_err_str(ret),
@@ -718,7 +722,6 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
if (IS_ERR(inode))
inode = NULL;
-#ifdef CONFIG_UNICODE
if (!inode && IS_CASEFOLDED(vdir)) {
/*
* Do not cache a negative dentry in casefolded directories
@@ -733,7 +736,6 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
*/
return NULL;
}
-#endif
return d_splice_alias(&inode->v, dentry);
}
@@ -1549,11 +1551,11 @@ static const struct vm_operations_struct bch_vm_ops = {
.page_mkwrite = bch2_page_mkwrite,
};
-static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
+static int bch2_mmap_prepare(struct vm_area_desc *desc)
{
- file_accessed(file);
+ file_accessed(desc->file);
- vma->vm_ops = &bch_vm_ops;
+ desc->vm_ops = &bch_vm_ops;
return 0;
}
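[editor's note] The ->mmap hook becomes ->mmap_prepare: the newer API hands the filesystem a struct vm_area_desc to fill in before the VMA is created, instead of a live vm_area_struct to mutate. A hedged sketch of the same conversion for a hypothetical filesystem:

	static int example_mmap_prepare(struct vm_area_desc *desc)
	{
		file_accessed(desc->file);
		desc->vm_ops = &example_vm_ops;	/* hypothetical ops table */
		return 0;
	}
	/* wired up as .mmap_prepare = example_mmap_prepare in file_operations */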
@@ -1569,11 +1571,12 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
if (!dir_emit_dots(file, ctx))
return 0;
- int ret = bch2_readdir(c, inode_inum(inode), ctx);
+ int ret = bch2_readdir(c, inode_inum(inode), &hash, ctx);
bch_err_fn(c, ret);
return bch2_err_class(ret);
@@ -1614,7 +1617,7 @@ static const __maybe_unused unsigned bch_flags_to_xflags[] = {
};
static int bch2_fileattr_get(struct dentry *dentry,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct bch_inode_info *inode = to_bch_ei(d_inode(dentry));
struct bch_fs *c = inode->v.i_sb->s_fs_info;
@@ -1677,7 +1680,7 @@ static int fssetxattr_inode_update_fn(struct btree_trans *trans,
static int bch2_fileattr_set(struct mnt_idmap *idmap,
struct dentry *dentry,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct bch_inode_info *inode = to_bch_ei(d_inode(dentry));
struct bch_fs *c = inode->v.i_sb->s_fs_info;
@@ -1727,7 +1730,8 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap,
bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
ATTR_CTIME);
mutex_unlock(&inode->ei_update_lock);
- return ret;
+
+ return bch2_err_class(ret);
}
static const struct file_operations bch_file_operations = {
@@ -1735,7 +1739,7 @@ static const struct file_operations bch_file_operations = {
.llseek = bch2_llseek,
.read_iter = bch2_read_iter,
.write_iter = bch2_write_iter,
- .mmap = bch2_mmap,
+ .mmap_prepare = bch2_mmap_prepare,
.get_unmapped_area = thp_get_unmapped_area,
.fsync = bch2_fsync,
.splice_read = filemap_splice_read,
@@ -2002,14 +2006,14 @@ retry:
goto err;
if (k.k->type != KEY_TYPE_dirent) {
- ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+ ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
goto err;
}
d = bkey_s_c_to_dirent(k);
ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
if (ret > 0)
- ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+ ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
if (ret)
goto err;
@@ -2175,7 +2179,13 @@ static void bch2_evict_inode(struct inode *vinode)
KEY_TYPE_QUOTA_WARN);
bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
KEY_TYPE_QUOTA_WARN);
- bch2_inode_rm(c, inode_inum(inode));
+ int ret = bch2_inode_rm(c, inode_inum(inode));
+ if (ret && !bch2_err_matches(ret, EROFS)) {
+ bch_err_msg(c, ret, "VFS incorrectly tried to delete inode %llu:%llu",
+ inode->ei_inum.subvol,
+ inode->ei_inum.inum);
+ bch2_sb_error_count(c, BCH_FSCK_ERR_vfs_bad_inode_rm);
+ }
/*
* If we are deleting, we need it present in the vfs hash table
@@ -2322,14 +2332,13 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
struct bch_fs *c = root->d_sb->s_fs_info;
bool first = true;
- rcu_read_lock();
+ guard(rcu)();
for_each_online_member_rcu(c, ca) {
if (!first)
seq_putc(seq, ':');
first = false;
seq_puts(seq, ca->disk_sb.sb_name);
}
- rcu_read_unlock();
return 0;
}
@@ -2480,6 +2489,14 @@ static int bch2_fs_get_tree(struct fs_context *fc)
if (ret)
goto err_stop_fs;
+ /*
+ * We might be doing a RO mount because other options required it, or we
+ * have no alloc info and it's a small image with no room to regenerate
+ * it
+ */
+ if (c->opts.read_only)
+ fc->sb_flags |= SB_RDONLY;
+
sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c);
ret = PTR_ERR_OR_ZERO(sb);
if (ret)
@@ -2526,16 +2543,16 @@ got_sb:
sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
- rcu_read_lock();
- for_each_online_member_rcu(c, ca) {
- struct block_device *bdev = ca->disk_sb.bdev;
+ scoped_guard(rcu) {
+ for_each_online_member_rcu(c, ca) {
+ struct block_device *bdev = ca->disk_sb.bdev;
- /* XXX: create an anonymous device for multi device filesystems */
- sb->s_bdev = bdev;
- sb->s_dev = bdev->bd_dev;
- break;
+ /* XXX: create an anonymous device for multi device filesystems */
+ sb->s_bdev = bdev;
+ sb->s_dev = bdev->bd_dev;
+ break;
+ }
}
- rcu_read_unlock();
c->dev = sb->s_dev;
@@ -2547,9 +2564,10 @@ got_sb:
sb->s_shrink->seeks = 0;
#ifdef CONFIG_UNICODE
- sb->s_encoding = c->cf_encoding;
-#endif
+ if (bch2_fs_casefold_enabled(c))
+ sb->s_encoding = c->cf_encoding;
generic_set_sb_d_ops(sb);
+#endif
vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
ret = PTR_ERR_OR_ZERO(vinode);
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 49f46df8340e..15c1e890d299 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -12,6 +12,7 @@
#include "fs.h"
#include "fsck.h"
#include "inode.h"
+#include "io_misc.h"
#include "keylist.h"
#include "namei.h"
#include "recovery_passes.h"
@@ -23,14 +24,15 @@
#include <linux/bsearch.h>
#include <linux/dcache.h> /* struct qstr */
-static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
+static int dirent_points_to_inode_nowarn(struct bch_fs *c,
+ struct bkey_s_c_dirent d,
struct bch_inode_unpacked *inode)
{
if (d.v->d_type == DT_SUBVOL
? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
: le64_to_cpu(d.v->d_inum) == inode->bi_inum)
return 0;
- return -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+ return bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
}
static void dirent_inode_mismatch_msg(struct printbuf *out,
@@ -49,7 +51,7 @@ static int dirent_points_to_inode(struct bch_fs *c,
struct bkey_s_c_dirent dirent,
struct bch_inode_unpacked *inode)
{
- int ret = dirent_points_to_inode_nowarn(dirent, inode);
+ int ret = dirent_points_to_inode_nowarn(c, dirent, inode);
if (ret) {
struct printbuf buf = PRINTBUF;
dirent_inode_mismatch_msg(&buf, c, dirent, inode);
@@ -152,7 +154,7 @@ static int find_snapshot_tree_subvol(struct btree_trans *trans,
goto found;
}
}
- ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol;
+ ret = bch_err_throw(trans->c, ENOENT_no_snapshot_tree_subvol);
found:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -229,7 +231,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
if (d_type != DT_DIR) {
bch_err(c, "error looking up lost+found: not a directory");
- return -BCH_ERR_ENOENT_not_directory;
+ return bch_err_throw(c, ENOENT_not_directory);
}
/*
@@ -326,7 +328,8 @@ static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
(inode->bi_flags & BCH_INODE_has_child_snapshot))
return false;
- return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);
+ return !bch2_inode_has_backpointer(inode) &&
+ !(inode->bi_flags & BCH_INODE_unlinked);
}
static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
@@ -371,6 +374,18 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *
if (inode->bi_subvol) {
inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
+ struct btree_iter subvol_iter;
+ struct bkey_i_subvolume *subvol =
+ bch2_bkey_get_mut_typed(trans, &subvol_iter,
+ BTREE_ID_subvolumes, POS(0, inode->bi_subvol),
+ 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(subvol);
+ if (ret)
+ return ret;
+
+ subvol->v.fs_path_parent = BCACHEFS_ROOT_SUBVOL;
+ bch2_trans_iter_exit(trans, &subvol_iter);
+
u64 root_inum;
ret = subvol_lookup(trans, inode->bi_parent_subvol,
&dirent_snapshot, &root_inum);
@@ -386,6 +401,8 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *
if (ret)
return ret;
+ bch_verbose(c, "got lostfound inum %llu", lostfound.bi_inum);
+
lostfound.bi_nlink += S_ISDIR(inode->bi_mode);
/* ensure lost+found inode is also present in inode snapshot */
@@ -422,6 +439,16 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *
if (ret)
return ret;
+ {
+ CLASS(printbuf, buf)();
+ ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum,
+ inode->bi_snapshot, NULL, &buf);
+ if (ret)
+ return ret;
+
+ bch_info(c, "reattached at %s", buf.buf);
+ }
+
/*
* Fix up inodes in child snapshots: if they should also be reattached
* update the backpointer field, if they should not be we need to emit
@@ -489,13 +516,21 @@ static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
static int remove_backpointer(struct btree_trans *trans,
struct bch_inode_unpacked *inode)
{
- if (!inode->bi_dir)
+ if (!bch2_inode_has_backpointer(inode))
return 0;
+ u32 snapshot = inode->bi_snapshot;
+
+ if (inode->bi_parent_subvol) {
+ int ret = bch2_subvolume_get_snapshot(trans, inode->bi_parent_subvol, &snapshot);
+ if (ret)
+ return ret;
+ }
+
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter,
- SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot));
+ SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
int ret = bkey_err(d) ?:
dirent_points_to_inode(c, d, inode) ?:
bch2_fsck_remove_dirent(trans, d.k->p);
@@ -531,7 +566,7 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub
if (!bch2_snapshot_is_leaf(c, snapshotid)) {
bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
- return -BCH_ERR_fsck_repair_unimplemented;
+ return bch_err_throw(c, fsck_repair_unimplemented);
}
/*
@@ -643,11 +678,6 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32
return __bch2_fsck_write_inode(trans, &new_inode);
}
-struct snapshots_seen {
- struct bpos pos;
- snapshot_id_list ids;
-};
-
static inline void snapshots_seen_exit(struct snapshots_seen *s)
{
darray_exit(&s->ids);
@@ -699,14 +729,8 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
u32 id, u32 ancestor)
{
- ssize_t i;
-
EBUG_ON(id > ancestor);
- /* @ancestor should be the snapshot most recently added to @seen */
- EBUG_ON(ancestor != seen->pos.snapshot);
- EBUG_ON(ancestor != darray_last(seen->ids));
-
if (id == ancestor)
return true;
@@ -722,11 +746,8 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see
* numerically, since snapshot ID lists are kept sorted, so if we find
* an id that's an ancestor of @id we're done:
*/
-
- for (i = seen->ids.nr - 2;
- i >= 0 && seen->ids.data[i] >= id;
- --i)
- if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
+ darray_for_each_reverse(seen->ids, i)
+ if (*i != ancestor && bch2_snapshot_is_ancestor(c, id, *i))
return false;
return true;
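[editor's note] key_visible_in_snapshot() now uses the darray iterators instead of hand-rolled index arithmetic; since the seen list is kept sorted, walking it in reverse still encounters any intervening ancestor snapshot before reaching IDs older than @id. Roughly (paraphrased from darray.h, not copied from this patch), the reverse iterator yields a pointer per element from last to first:

	/* approximate shape of the macro in darray.h */
	#define darray_for_each_reverse(_d, _i)				\
		for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; \
		     _i >= (_d).data; --_i)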
@@ -810,7 +831,7 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w,
if (!n->whiteout) {
return bch2_inode_unpack(inode, &n->inode);
} else {
- n->inode.bi_inum = inode.k->p.inode;
+ n->inode.bi_inum = inode.k->p.offset;
n->inode.bi_snapshot = inode.k->p.snapshot;
return 0;
}
@@ -890,14 +911,11 @@ lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, str
{
struct bch_fs *c = trans->c;
- struct inode_walker_entry *i;
- __darray_for_each(w->inodes, i)
- if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot))
- goto found;
+ struct inode_walker_entry *i = darray_find_p(w->inodes, i,
+ bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot));
- return NULL;
-found:
- BUG_ON(k.k->p.snapshot > i->inode.bi_snapshot);
+ if (!i)
+ return NULL;
struct printbuf buf = PRINTBUF;
int ret = 0;
@@ -910,17 +928,15 @@ found:
w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot,
(bch2_bkey_val_to_text(&buf, c, k),
buf.buf))) {
- struct bch_inode_unpacked new = i->inode;
- struct bkey_i whiteout;
-
- new.bi_snapshot = k.k->p.snapshot;
-
if (!i->whiteout) {
+ struct bch_inode_unpacked new = i->inode;
+ new.bi_snapshot = k.k->p.snapshot;
ret = __bch2_fsck_write_inode(trans, &new);
} else {
+ struct bkey_i whiteout;
bkey_init(&whiteout.k);
whiteout.k.type = KEY_TYPE_whiteout;
- whiteout.k.p = SPOS(0, i->inode.bi_inum, i->inode.bi_snapshot);
+ whiteout.k.p = SPOS(0, i->inode.bi_inum, k.k->p.snapshot);
ret = bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
&whiteout,
BTREE_UPDATE_internal_snapshot_node);
@@ -947,7 +963,7 @@ found:
if (ret)
goto fsck_err;
- ret = -BCH_ERR_transaction_restart_nested;
+ ret = bch_err_throw(c, transaction_restart_nested);
goto fsck_err;
}
@@ -992,7 +1008,8 @@ int bch2_fsck_update_backpointers(struct btree_trans *trans,
int ret = 0;
if (d->v.d_type == DT_SUBVOL) {
- BUG();
+ bch_err(trans->c, "%s does not support DT_SUBVOL", __func__);
+ ret = -BCH_ERR_fsck_repair_unimplemented;
} else {
ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
if (ret)
@@ -1048,7 +1065,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans,
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
- if ((ret || dirent_points_to_inode_nowarn(d, inode)) &&
+ if ((ret || dirent_points_to_inode_nowarn(c, d, inode)) &&
inode->bi_subvol &&
(inode->bi_flags & BCH_INODE_has_child_snapshot)) {
/* Older version of a renamed subvolume root: we won't have a
@@ -1069,7 +1086,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans,
trans, inode_points_to_missing_dirent,
"inode points to missing dirent\n%s",
(bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
- fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode),
+ fsck_err_on(!ret && dirent_points_to_inode_nowarn(c, d, inode),
trans, inode_points_to_wrong_dirent,
"%s",
(printbuf_reset(&buf),
@@ -1141,13 +1158,14 @@ static int check_inode(struct btree_trans *trans,
if (ret)
goto err;
- if (u.bi_dir || u.bi_dir_offset) {
+ if (bch2_inode_has_backpointer(&u)) {
ret = check_inode_dirent_inode(trans, &u, &do_update);
if (ret)
goto err;
}
- if (fsck_err_on(u.bi_dir && (u.bi_flags & BCH_INODE_unlinked),
+ if (fsck_err_on(bch2_inode_has_backpointer(&u) &&
+ (u.bi_flags & BCH_INODE_unlinked),
trans, inode_unlinked_but_has_dirent,
"inode unlinked but has dirent\n%s",
(printbuf_reset(&buf),
@@ -1174,6 +1192,14 @@ static int check_inode(struct btree_trans *trans,
ret = 0;
}
+ if (fsck_err_on(S_ISDIR(u.bi_mode) && u.bi_size,
+ trans, inode_dir_has_nonzero_i_size,
+ "directory %llu:%u with nonzero i_size %lli",
+ u.bi_inum, u.bi_snapshot, u.bi_size)) {
+ u.bi_size = 0;
+ do_update = true;
+ }
+
ret = bch2_inode_has_child_snapshots(trans, k.k->p);
if (ret < 0)
goto err;
@@ -1436,6 +1462,7 @@ static int check_key_has_inode(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
+ struct btree_iter iter2 = {};
int ret = PTR_ERR_OR_ZERO(i);
if (ret)
return ret;
@@ -1445,40 +1472,105 @@ static int check_key_has_inode(struct btree_trans *trans,
bool have_inode = i && !i->whiteout;
- if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
- ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto err;
+ if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes)))
+ goto reconstruct;
- inode->last_pos.inode--;
- ret = -BCH_ERR_transaction_restart_nested;
- goto err;
+ if (have_inode && btree_matches_i_mode(iter->btree_id, i->inode.bi_mode))
+ goto out;
+
+ prt_printf(&buf, ", ");
+
+ bool have_old_inode = false;
+ darray_for_each(inode->inodes, i2)
+ if (!i2->whiteout &&
+ bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i2->inode.bi_snapshot) &&
+ btree_matches_i_mode(iter->btree_id, i2->inode.bi_mode)) {
+ prt_printf(&buf, "but found good inode in older snapshot\n");
+ bch2_inode_unpacked_to_text(&buf, &i2->inode);
+ prt_newline(&buf);
+ have_old_inode = true;
+ break;
+ }
+
+ struct bkey_s_c k2;
+ unsigned nr_keys = 0;
+
+ prt_printf(&buf, "found keys:\n");
+
+ for_each_btree_key_max_norestart(trans, iter2, iter->btree_id,
+ SPOS(k.k->p.inode, 0, k.k->p.snapshot),
+ POS(k.k->p.inode, U64_MAX),
+ 0, k2, ret) {
+ nr_keys++;
+ if (nr_keys <= 10) {
+ bch2_bkey_val_to_text(&buf, c, k2);
+ prt_newline(&buf);
+ }
+ if (nr_keys >= 100)
+ break;
}
- if (fsck_err_on(!have_inode,
- trans, key_in_missing_inode,
- "key in missing inode:\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- goto delete;
+ if (ret)
+ goto err;
- if (fsck_err_on(have_inode && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
- trans, key_in_wrong_inode_type,
- "key for wrong inode mode %o:\n%s",
- i->inode.bi_mode,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- goto delete;
+ if (nr_keys > 100)
+ prt_printf(&buf, "found > %u keys for this missing inode\n", nr_keys);
+ else if (nr_keys > 10)
+ prt_printf(&buf, "found %u keys for this missing inode\n", nr_keys);
+
+ if (!have_inode) {
+ if (fsck_err_on(!have_inode,
+ trans, key_in_missing_inode,
+ "key in missing inode%s", buf.buf)) {
+ /*
+ * Maybe a deletion that raced with data move, or something
+ * weird like that? But if we know the inode was deleted, or
+ * it's just a few keys, we can safely delete them.
+ *
+ * If it's many keys, we should probably recreate the inode
+ */
+ if (have_old_inode || nr_keys <= 2)
+ goto delete;
+ else
+ goto reconstruct;
+ }
+ } else {
+ /*
+ * not autofix, this one would be a giant wtf - bit error in the
+ * inode corrupting i_mode?
+ *
+ * may want to try repairing inode instead of deleting
+ */
+ if (fsck_err_on(!btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
+ trans, key_in_wrong_inode_type,
+ "key for wrong inode mode %o%s",
+ i->inode.bi_mode, buf.buf))
+ goto delete;
+ }
out:
err:
fsck_err:
+ bch2_trans_iter_exit(trans, &iter2);
printbuf_exit(&buf);
bch_err_fn(c, ret);
return ret;
delete:
+ /*
+ * XXX: print out more info
+ * count up extents for this inode, check if we have different inode in
+ * an older snapshot version, perhaps decide if we want to reconstitute
+ */
ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
goto out;
+reconstruct:
+ ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+
+ inode->last_pos.inode--;
+ ret = bch_err_throw(c, transaction_restart_nested);
+ goto out;
}
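[editor's note] The rewritten check_key_has_inode() now decides between deleting stray keys and reconstructing the missing inode, based on how much data the keys represent. Reduced to the branch structure (a paraphrase of the code above, not new behavior):

	if (!have_inode) {
		if (have_old_inode || nr_keys <= 2)
			goto delete;		/* few keys, or a good inode
						 * exists in an older snapshot */
		else
			goto reconstruct;	/* many keys: recreate the inode */
	}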
static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
@@ -1569,7 +1661,7 @@ static int extent_ends_at(struct bch_fs *c,
sizeof(seen->ids.data[0]) * seen->ids.size,
GFP_KERNEL);
if (!n.seen.ids.data)
- return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
+ return bch_err_throw(c, ENOMEM_fsck_extent_ends_at);
__darray_for_each(extent_ends->e, i) {
if (i->snapshot == k.k->p.snapshot) {
@@ -1619,7 +1711,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
__func__, buf.buf);
- ret = -BCH_ERR_internal_fsck_err;
+ ret = bch_err_throw(c, internal_fsck_err);
goto err;
}
@@ -1644,7 +1736,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
pos2.size != k2.k->size) {
		bch_err(c, "%s: error finding second overlapping extent when repairing%s",
__func__, buf.buf);
- ret = -BCH_ERR_internal_fsck_err;
+ ret = bch_err_throw(c, internal_fsck_err);
goto err;
}
@@ -1692,7 +1784,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
* We overwrote the second extent - restart
* check_extent() from the top:
*/
- ret = -BCH_ERR_transaction_restart_nested;
+ ret = bch_err_throw(c, transaction_restart_nested);
}
}
fsck_err:
@@ -1820,20 +1912,20 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
!key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot))
continue;
- if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
+ u64 last_block = round_up(i->inode.bi_size, block_bytes(c)) >> 9;
+
+ if (fsck_err_on(k.k->p.offset > last_block &&
!bkey_extent_is_reservation(k),
trans, extent_past_end_of_inode,
"extent type past end of inode %llu:%u, i_size %llu\n%s",
i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- struct btree_iter iter2;
-
- bch2_trans_copy_iter(trans, &iter2, iter);
- bch2_btree_iter_set_snapshot(trans, &iter2, i->inode.bi_snapshot);
- ret = bch2_btree_iter_traverse(trans, &iter2) ?:
- bch2_btree_delete_at(trans, &iter2,
- BTREE_UPDATE_internal_snapshot_node);
- bch2_trans_iter_exit(trans, &iter2);
+ ret = snapshots_seen_add_inorder(c, s, i->inode.bi_snapshot) ?:
+ bch2_fpunch_snapshot(trans,
+ SPOS(i->inode.bi_inum,
+ last_block,
+ i->inode.bi_snapshot),
+ POS(i->inode.bi_inum, U64_MAX));
if (ret)
goto err;
@@ -1947,14 +2039,22 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_
continue;
}
- if (fsck_err_on(i->inode.bi_nlink != i->count,
- trans, inode_dir_wrong_nlink,
- "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
- w->last_pos.inode, i->inode.bi_snapshot, i->inode.bi_nlink, i->count)) {
- i->inode.bi_nlink = i->count;
- ret = bch2_fsck_write_inode(trans, &i->inode);
- if (ret)
- break;
+ if (i->inode.bi_nlink != i->count) {
+ CLASS(printbuf, buf)();
+
+ lockrestart_do(trans,
+ bch2_inum_snapshot_to_path(trans, w->last_pos.inode,
+ i->inode.bi_snapshot, NULL, &buf));
+
+ if (fsck_err_on(i->inode.bi_nlink != i->count,
+ trans, inode_dir_wrong_nlink,
+ "directory with wrong i_nlink: got %u, should be %llu\n%s",
+ i->inode.bi_nlink, i->count, buf.buf)) {
+ i->inode.bi_nlink = i->count;
+ ret = bch2_fsck_write_inode(trans, &i->inode);
+ if (ret)
+ break;
+ }
}
}
fsck_err:
@@ -2045,7 +2145,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
if (!new_parent_subvol) {
bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
- return -BCH_ERR_fsck_repair_unimplemented;
+ return bch_err_throw(c, fsck_repair_unimplemented);
}
struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
@@ -2107,7 +2207,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
if (ret) {
bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
- ret = -BCH_ERR_fsck_repair_unimplemented;
+ ret = bch_err_throw(c, fsck_repair_unimplemented);
goto err;
}
@@ -2139,7 +2239,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
struct bch_hash_info *hash_info,
struct inode_walker *dir,
struct inode_walker *target,
- struct snapshots_seen *s)
+ struct snapshots_seen *s,
+ bool *need_second_pass)
{
struct bch_fs *c = trans->c;
struct inode_walker_entry *i;
@@ -2181,7 +2282,10 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
*hash_info = bch2_hash_info_init(c, &i->inode);
dir->first_this_inode = false;
- ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k);
+ hash_info->cf_encoding = bch2_inode_casefold(c, &i->inode) ? c->cf_encoding : NULL;
+
+ ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info,
+ iter, k, need_second_pass);
if (ret < 0)
goto err;
if (ret) {
@@ -2202,31 +2306,34 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k),
buf.buf))) {
- struct qstr name = bch2_dirent_get_name(d);
- u32 subvol = d.v->d_type == DT_SUBVOL
- ? le32_to_cpu(d.v->d_parent_subvol)
- : 0;
+ subvol_inum dir_inum = { .subvol = d.v->d_type == DT_SUBVOL
+ ? le32_to_cpu(d.v->d_parent_subvol)
+ : 0,
+ };
u64 target = d.v->d_type == DT_SUBVOL
? le32_to_cpu(d.v->d_child_subvol)
: le64_to_cpu(d.v->d_inum);
- u64 dir_offset;
+ struct qstr name = bch2_dirent_get_name(d);
+
+ struct bkey_i_dirent *new_d =
+ bch2_dirent_create_key(trans, hash_info, dir_inum,
+ d.v->d_type, &name, NULL, target);
+ ret = PTR_ERR_OR_ZERO(new_d);
+ if (ret)
+ goto out;
+
+ new_d->k.p.inode = d.k->p.inode;
+ new_d->k.p.snapshot = d.k->p.snapshot;
- ret = bch2_hash_delete_at(trans,
+ struct btree_iter dup_iter = {};
+ ret = bch2_hash_delete_at(trans,
bch2_dirent_hash_desc, hash_info, iter,
BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_dirent_create_snapshot(trans, subvol,
- d.k->p.inode, d.k->p.snapshot,
- hash_info,
- d.v->d_type,
- &name,
- target,
- &dir_offset,
- BTREE_ITER_with_updates|
- BTREE_UPDATE_internal_snapshot_node|
- STR_HASH_must_create) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-
- /* might need another check_dirents pass */
+ bch2_str_hash_repair_key(trans, s,
+ &bch2_dirent_hash_desc, hash_info,
+ iter, bkey_i_to_s_c(&new_d->k_i),
+ &dup_iter, bkey_s_c_null,
+ need_second_pass);
goto out;
}
@@ -2294,7 +2401,6 @@ out:
err:
fsck_err:
printbuf_exit(&buf);
- bch_err_fn(c, ret);
return ret;
}
@@ -2308,16 +2414,31 @@ int bch2_check_dirents(struct bch_fs *c)
struct inode_walker target = inode_walker_init();
struct snapshots_seen s;
struct bch_hash_info hash_info;
+ bool need_second_pass = false, did_second_pass = false;
+ int ret;
snapshots_seen_init(&s);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_dirents,
+again:
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
POS(BCACHEFS_ROOT_INO, 0),
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s,
+ &need_second_pass)) ?:
check_subdir_count_notnested(trans, &dir));
+ if (!ret && need_second_pass && !did_second_pass) {
+ bch_info(c, "check_dirents requires second pass");
+ swap(did_second_pass, need_second_pass);
+ goto again;
+ }
+
+ if (!ret && need_second_pass) {
+ bch_err(c, "dirents not repairing");
+ ret = -EINVAL;
+ }
+
snapshots_seen_exit(&s);
inode_walker_exit(&dir);
inode_walker_exit(&target);
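[editor's note] bch2_check_dirents() gains a bounded retry: hash repairs in the first pass can move dirents that earlier iterations already skipped, so the whole pass re-runs at most once and hard-fails rather than loop forever. The control flow, reduced to its shape with a hypothetical pass body:

	bool need_second_pass = false, did_second_pass = false;
	int ret;
again:
	ret = run_check(&need_second_pass);	/* hypothetical */
	if (!ret && need_second_pass && !did_second_pass) {
		swap(did_second_pass, need_second_pass);
		goto again;			/* exactly one retry */
	}
	if (!ret && need_second_pass)
		ret = -EINVAL;			/* repairs not converging */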
@@ -2331,16 +2452,14 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
struct inode_walker *inode)
{
struct bch_fs *c = trans->c;
- struct inode_walker_entry *i;
- int ret;
- ret = bch2_check_key_has_snapshot(trans, iter, k);
+ int ret = bch2_check_key_has_snapshot(trans, iter, k);
if (ret < 0)
return ret;
if (ret)
return 0;
- i = walk_inode(trans, inode, k);
+ struct inode_walker_entry *i = walk_inode(trans, inode, k);
ret = PTR_ERR_OR_ZERO(i);
if (ret)
return ret;
@@ -2356,9 +2475,9 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
*hash_info = bch2_hash_info_init(c, &i->inode);
inode->first_this_inode = false;
- ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k);
- bch_err_fn(c, ret);
- return ret;
+ bool need_second_pass = false;
+ return bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info,
+ iter, k, &need_second_pass);
}
/*
@@ -2470,6 +2589,11 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
if (k.k->type != KEY_TYPE_subvolume)
return 0;
+ subvol_inum start = {
+ .subvol = k.k->p.offset,
+ .inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode),
+ };
+
while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
ret = darray_push(&subvol_path, k.k->p.offset);
if (ret)
@@ -2488,11 +2612,11 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
if (darray_u32_has(&subvol_path, parent)) {
printbuf_reset(&buf);
- prt_printf(&buf, "subvolume loop:\n");
+ prt_printf(&buf, "subvolume loop: ");
- darray_for_each_reverse(subvol_path, i)
- prt_printf(&buf, "%u ", *i);
- prt_printf(&buf, "%u", parent);
+ ret = bch2_inum_to_path(trans, start, &buf);
+ if (ret)
+ goto err;
if (fsck_err(trans, subvol_loop, "%s", buf.buf))
ret = reattach_subvol(trans, s);
@@ -2536,19 +2660,13 @@ int bch2_check_subvolume_structure(struct bch_fs *c)
return ret;
}
-struct pathbuf_entry {
- u64 inum;
- u32 snapshot;
-};
-
-typedef DARRAY(struct pathbuf_entry) pathbuf;
-
-static int bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_entry *p,
+static int bch2_bi_depth_renumber_one(struct btree_trans *trans,
+ u64 inum, u32 snapshot,
u32 new_depth)
{
struct btree_iter iter;
struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, p->inum, p->snapshot), 0);
+ SPOS(0, inum, snapshot), 0);
struct bch_inode_unpacked inode;
int ret = bkey_err(k) ?:
@@ -2567,14 +2685,15 @@ err:
return ret;
}
-static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 new_bi_depth)
+static int bch2_bi_depth_renumber(struct btree_trans *trans, darray_u64 *path,
+ u32 snapshot, u32 new_bi_depth)
{
u32 restart_count = trans->restart_count;
int ret = 0;
darray_for_each_reverse(*path, i) {
ret = nested_lockrestart_do(trans,
- bch2_bi_depth_renumber_one(trans, i, new_bi_depth));
+ bch2_bi_depth_renumber_one(trans, *i, snapshot, new_bi_depth));
bch_err_fn(trans->c, ret);
if (ret)
break;
@@ -2585,37 +2704,36 @@ static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32
return ret ?: trans_was_restarted(trans, restart_count);
}
-static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
-{
- darray_for_each(*p, i)
- if (i->inum == inum &&
- i->snapshot == snapshot)
- return true;
- return false;
-}
-
static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
{
struct bch_fs *c = trans->c;
struct btree_iter inode_iter = {};
- pathbuf path = {};
+ darray_u64 path = {};
struct printbuf buf = PRINTBUF;
u32 snapshot = inode_k.k->p.snapshot;
bool redo_bi_depth = false;
u32 min_bi_depth = U32_MAX;
int ret = 0;
+ struct bpos start = inode_k.k->p;
+
struct bch_inode_unpacked inode;
ret = bch2_inode_unpack(inode_k, &inode);
if (ret)
return ret;
- while (!inode.bi_subvol) {
+ /*
+	 * If we're running full fsck, check_dirents() will have already run,
+ * and we shouldn't see any missing backpointers here - otherwise that's
+ * handled separately, by check_unreachable_inodes
+ */
+ while (!inode.bi_subvol &&
+ bch2_inode_has_backpointer(&inode)) {
struct btree_iter dirent_iter;
struct bkey_s_c_dirent d;
- u32 parent_snapshot = snapshot;
- d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
+ d = dirent_get_by_pos(trans, &dirent_iter,
+ SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot));
ret = bkey_err(d.s_c);
if (ret && !bch2_err_matches(ret, ENOENT))
goto out;
@@ -2633,15 +2751,10 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
bch2_trans_iter_exit(trans, &dirent_iter);
- ret = darray_push(&path, ((struct pathbuf_entry) {
- .inum = inode.bi_inum,
- .snapshot = snapshot,
- }));
+ ret = darray_push(&path, inode.bi_inum);
if (ret)
return ret;
- snapshot = parent_snapshot;
-
bch2_trans_iter_exit(trans, &inode_iter);
inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
SPOS(0, inode.bi_dir, snapshot), 0);
@@ -2663,21 +2776,28 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
break;
inode = parent_inode;
- snapshot = inode_k.k->p.snapshot;
redo_bi_depth = true;
- if (path_is_dup(&path, inode.bi_inum, snapshot)) {
+ if (darray_find(path, inode.bi_inum)) {
printbuf_reset(&buf);
- prt_printf(&buf, "directory structure loop:\n");
- darray_for_each_reverse(path, i)
- prt_printf(&buf, "%llu:%u ", i->inum, i->snapshot);
- prt_printf(&buf, "%llu:%u", inode.bi_inum, snapshot);
+ prt_printf(&buf, "directory structure loop in snapshot %u: ",
+ snapshot);
+
+ ret = bch2_inum_snapshot_to_path(trans, start.offset, start.snapshot, NULL, &buf);
+ if (ret)
+ goto out;
+
+ if (c->opts.verbose) {
+ prt_newline(&buf);
+ darray_for_each(path, i)
+ prt_printf(&buf, "%llu ", *i);
+ }
if (fsck_err(trans, dir_loop, "%s", buf.buf)) {
ret = remove_backpointer(trans, &inode);
bch_err_msg(c, ret, "removing dirent");
if (ret)
- break;
+ goto out;
ret = reattach_inode(trans, &inode);
bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
@@ -2691,7 +2811,7 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
min_bi_depth = 0;
if (redo_bi_depth)
- ret = bch2_bi_depth_renumber(trans, &path, min_bi_depth);
+ ret = bch2_bi_depth_renumber(trans, &path, snapshot, min_bi_depth);
out:
fsck_err:
bch2_trans_iter_exit(trans, &inode_iter);
@@ -2708,7 +2828,7 @@ fsck_err:
int bch2_check_directory_structure(struct bch_fs *c)
{
int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
+ for_each_btree_key_reverse_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
BTREE_ITER_intent|
BTREE_ITER_prefetch|
BTREE_ITER_all_snapshots, k,
@@ -2747,7 +2867,7 @@ static int add_nlink(struct bch_fs *c, struct nlink_table *t,
if (!d) {
bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
new_size);
- return -BCH_ERR_ENOMEM_fsck_add_nlink;
+ return bch_err_throw(c, ENOMEM_fsck_add_nlink);
}
if (t->d)
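The pathbuf type above becomes removable because the rewritten walk stays within a single snapshot, so the path only needs to remember bare inode numbers. A minimal sketch of the resulting loop-detection idiom, using the darray helpers from darray.h (the walk helpers are hypothetical placeholders, not part of this patch):

	darray_u64 path = {};
	int ret = 0;

	while (walk_has_parent(&inode)) {	/* hypothetical */
		ret = darray_push(&path, inode.bi_inum);
		if (ret)
			break;

		advance_to_parent(&inode);	/* hypothetical */

		/* seeing an inode number twice means the tree has a cycle: */
		if (darray_find(path, inode.bi_inum))
			break;
	}
	darray_exit(&path);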
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
index 574948278cd4..e5fe7cf7b251 100644
--- a/fs/bcachefs/fsck.h
+++ b/fs/bcachefs/fsck.h
@@ -4,6 +4,12 @@
#include "str_hash.h"
+/* records snapshot IDs of overwrites at @pos */
+struct snapshots_seen {
+ struct bpos pos;
+ snapshot_id_list ids;
+};
+
int bch2_fsck_update_backpointers(struct btree_trans *,
struct snapshots_seen *,
const struct bch_hash_desc,
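snapshots_seen, now exposed via fsck.h, pairs a btree position with the snapshot IDs of overwrites already visited at that position. Roughly, and assuming the snapshot_list_add() helper from snapshot.h, a caller accumulates IDs like so:

	struct snapshots_seen s = { .pos = k.k->p };

	/* remember that this snapshot already has a key at s.pos: */
	int ret = snapshot_list_add(c, &s.ids, k.k->p.snapshot);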
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 5cf70108ae2f..ef4cc7395b86 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -38,6 +38,7 @@ static const char * const bch2_inode_flag_strs[] = {
#undef x
static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos);
+static int may_delete_deleted_inum(struct btree_trans *, subvol_inum);
static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
@@ -1041,7 +1042,7 @@ again:
goto found_slot;
if (!ret && start == min)
- ret = -BCH_ERR_ENOSPC_inode_create;
+ ret = bch_err_throw(trans->c, ENOSPC_inode_create);
if (ret) {
bch2_trans_iter_exit(trans, iter);
@@ -1130,19 +1131,23 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
u32 snapshot;
int ret;
+ ret = lockrestart_do(trans, may_delete_deleted_inum(trans, inum));
+ if (ret)
+ goto err2;
+
/*
* If this was a directory, there shouldn't be any real dirents left -
* but there could be whiteouts (from hash collisions) that we should
* delete:
*
- * XXX: the dirent could ideally would delete whiteouts when they're no
+ * XXX: the dirent code ideally would delete whiteouts when they're no
* longer needed
*/
ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?:
bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?:
bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents);
if (ret)
- goto err;
+ goto err2;
retry:
bch2_trans_begin(trans);
@@ -1161,7 +1166,7 @@ retry:
bch2_fs_inconsistent(c,
"inode %llu:%u not found when deleting",
inum.inum, snapshot);
- ret = -BCH_ERR_ENOENT_inode;
+ ret = bch_err_throw(c, ENOENT_inode);
goto err;
}
@@ -1260,7 +1265,14 @@ int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum,
{
struct bch_fs *c = trans->c;
-#ifdef CONFIG_UNICODE
+#ifndef CONFIG_UNICODE
+ bch_err(c, "Cannot use casefolding on a kernel without CONFIG_UNICODE");
+ return -EOPNOTSUPP;
+#endif
+
+ if (c->opts.casefold_disabled)
+ return -EOPNOTSUPP;
+
int ret = 0;
/* Not supported on individual files. */
if (!S_ISDIR(bi->bi_mode))
@@ -1284,10 +1296,6 @@ int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum,
bi->bi_fields_set |= BIT(Inode_opt_casefold);
return bch2_maybe_propagate_has_case_insensitive(trans, inum, bi);
-#else
- bch_err(c, "Cannot use casefolding on a kernel without CONFIG_UNICODE");
- return -EOPNOTSUPP;
-#endif
}
static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
@@ -1328,7 +1336,7 @@ retry:
bch2_fs_inconsistent(c,
"inode %llu:%u not found when deleting",
inum, snapshot);
- ret = -BCH_ERR_ENOENT_inode;
+ ret = bch_err_throw(c, ENOENT_inode);
goto err;
}
@@ -1392,10 +1400,8 @@ int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot));
}
-static int may_delete_deleted_inode(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos pos,
- bool *need_another_pass)
+static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos,
+ bool from_deleted_inodes)
{
struct bch_fs *c = trans->c;
struct btree_iter inode_iter;
@@ -1409,12 +1415,14 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
if (ret)
return ret;
- ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
- if (fsck_err_on(!bkey_is_inode(k.k),
+ ret = bkey_is_inode(k.k) ? 0 : bch_err_throw(c, ENOENT_inode);
+ if (fsck_err_on(from_deleted_inodes && ret,
trans, deleted_inode_missing,
"nonexistent inode %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
+ if (ret)
+ goto out;
ret = bch2_inode_unpack(k, &inode);
if (ret)
@@ -1422,7 +1430,8 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
if (S_ISDIR(inode.bi_mode)) {
ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot);
- if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY),
+ if (fsck_err_on(from_deleted_inodes &&
+ bch2_err_matches(ret, ENOTEMPTY),
trans, deleted_inode_is_dir,
"non empty directory %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
@@ -1431,17 +1440,25 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
goto out;
}
- if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked),
+ ret = inode.bi_flags & BCH_INODE_unlinked ? 0 : bch_err_throw(c, inode_not_unlinked);
+ if (fsck_err_on(from_deleted_inodes && ret,
trans, deleted_inode_not_unlinked,
"non-deleted inode %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
+ if (ret)
+ goto out;
+
+ ret = !(inode.bi_flags & BCH_INODE_has_child_snapshot)
+ ? 0 : bch_err_throw(c, inode_has_child_snapshot);
- if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot,
+ if (fsck_err_on(from_deleted_inodes && ret,
trans, deleted_inode_has_child_snapshots,
"inode with child snapshots %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
+ if (ret)
+ goto out;
ret = bch2_inode_has_child_snapshots(trans, k.k->p);
if (ret < 0)
@@ -1458,19 +1475,28 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
if (ret)
goto out;
}
+
+ if (!from_deleted_inodes) {
+ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ bch_err_throw(c, inode_has_child_snapshot);
+ goto out;
+ }
+
goto delete;
}
- if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
- !fsck_err(trans, deleted_inode_but_clean,
- "filesystem marked as clean but have deleted inode %llu:%u",
- pos.offset, pos.snapshot)) {
- ret = 0;
- goto out;
- }
+ if (from_deleted_inodes) {
+ if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
+ !fsck_err(trans, deleted_inode_but_clean,
+ "filesystem marked as clean but have deleted inode %llu:%u",
+ pos.offset, pos.snapshot)) {
+ ret = 0;
+ goto out;
+ }
- ret = 1;
+ ret = 1;
+ }
out:
fsck_err:
bch2_trans_iter_exit(trans, &inode_iter);
@@ -1481,12 +1507,19 @@ delete:
goto out;
}
+static int may_delete_deleted_inum(struct btree_trans *trans, subvol_inum inum)
+{
+ u32 snapshot;
+
+ return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
+ may_delete_deleted_inode(trans, SPOS(0, inum.inum, snapshot), false);
+}
+
int bch2_delete_dead_inodes(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
- bool need_another_pass;
int ret;
-again:
+
/*
* if we ran check_inodes() unlinked inodes will have already been
* cleaned up but the write buffer will be out of sync; therefore we
@@ -1496,8 +1529,6 @@ again:
if (ret)
goto err;
- need_another_pass = false;
-
/*
* Weird transaction restart handling here because on successful delete,
* bch2_inode_rm_snapshot() will return a nested transaction restart,
@@ -1507,7 +1538,7 @@ again:
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass);
+ ret = may_delete_deleted_inode(trans, k.k->p, true);
if (ret > 0) {
bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u",
k.k->p.offset, k.k->p.snapshot);
@@ -1528,10 +1559,8 @@ again:
ret;
}));
-
- if (!ret && need_another_pass)
- goto again;
err:
bch2_trans_put(trans);
+ bch_err_fn(c, ret);
return ret;
}
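may_delete_deleted_inode() now serves two callers, distinguished by from_deleted_inodes: the deleted_inodes scan turns bad state into fsck errors, while bch2_inode_rm() just propagates it. The two entry points, reduced to their core:

	/* from the deleted_inodes btree scan - inconsistencies become fsck_err()s: */
	ret = may_delete_deleted_inode(trans, k.k->p, true);

	/* from bch2_inode_rm(), via may_delete_deleted_inum() - inconsistencies
	 * are returned to the caller: */
	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
	      may_delete_deleted_inode(trans, SPOS(0, inum.inum, snapshot), false);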
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index 77ad2d549541..b8ec3e628d90 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -254,6 +254,11 @@ static inline bool bch2_inode_casefold(struct bch_fs *c, const struct bch_inode_
: c->opts.casefold;
}
+static inline bool bch2_inode_has_backpointer(const struct bch_inode_unpacked *bi)
+{
+ return bi->bi_dir || bi->bi_dir_offset;
+}
+
/* i_nlink: */
static inline unsigned nlink_bias(umode_t mode)
@@ -283,15 +288,6 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
int bch2_inode_nlink_inc(struct bch_inode_unpacked *);
void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
-static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *inode)
-{
- bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset;
-
- return S_ISDIR(inode->bi_mode) ||
- inode->bi_subvol ||
- (!inode->bi_nlink && inode_has_bp);
-}
-
struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
struct bch_inode_unpacked *);
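bch2_inode_has_backpointer() is the guard the new check_path_loop() walk relies on: when it returns true, the parent dirent's position is known without a search. A sketch of the pattern, with names as in the fsck.c hunk:

	if (bch2_inode_has_backpointer(&inode)) {
		struct bpos dirent_pos = SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot);
		/* fetch the dirent at dirent_pos and keep walking towards the root */
	}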
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index cc07729a4b62..07023667a475 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -91,7 +91,7 @@ int bch2_extent_fallocate(struct btree_trans *trans,
opts.data_replicas,
BCH_WATERMARK_normal, 0, &cl, &wp);
if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
- ret = -BCH_ERR_transaction_restart_nested;
+ ret = bch_err_throw(c, transaction_restart_nested);
if (ret)
goto err;
@@ -135,6 +135,33 @@ err_noprint:
return ret;
}
+/* For fsck */
+int bch2_fpunch_snapshot(struct btree_trans *trans, struct bpos start, struct bpos end)
+{
+ u32 restart_count = trans->restart_count;
+ struct bch_fs *c = trans->c;
+ struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0);
+ unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
+ struct bkey_i delete;
+
+ int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
+ start, end, 0, k,
+ &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ bkey_init(&delete.k);
+ delete.k.p = iter.pos;
+
+ /* create the biggest key we can */
+ bch2_key_resize(&delete.k, max_sectors);
+ bch2_cut_back(end, &delete);
+
+ bch2_extent_trim_atomic(trans, &iter, &delete) ?:
+ bch2_trans_update(trans, &iter, &delete, 0);
+ }));
+
+ bch2_disk_reservation_put(c, &disk_res);
+ return ret ?: trans_was_restarted(trans, restart_count);
+}
+
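A hypothetical caller of the new helper, punching every extent of one inode within a single snapshot (inum and snap are illustrative, not from this patch):

	int ret = bch2_fpunch_snapshot(trans,
				       SPOS(inum, 0, snap),
				       SPOS(inum, U64_MAX, snap));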
/*
* Returns -BCH_ERR_transacton_restart if we had to drop locks:
*/
diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h
index 9cb44a7c43c1..b93e4d4b3c0c 100644
--- a/fs/bcachefs/io_misc.h
+++ b/fs/bcachefs/io_misc.h
@@ -5,6 +5,8 @@
int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
u64, struct bch_io_opts, s64 *,
struct write_point_specifier);
+
+int bch2_fpunch_snapshot(struct btree_trans *, struct bpos, struct bpos);
int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
subvol_inum, u64, s64 *);
int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index cc708d46557e..e0874ad9a6cf 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -56,7 +56,7 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target)
if (!target)
return false;
- rcu_read_lock();
+ guard(rcu)();
devs = bch2_target_to_mask(c, target) ?:
&c->rw_devs[BCH_DATA_user];
@@ -73,7 +73,6 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target)
total += max(congested, 0LL);
nr++;
}
- rcu_read_unlock();
return get_random_u32_below(nr * CONGESTED_MAX) < total;
}
@@ -138,21 +137,21 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
BUG_ON(!opts.promote_target);
if (!(flags & BCH_READ_may_promote))
- return -BCH_ERR_nopromote_may_not;
+ return bch_err_throw(c, nopromote_may_not);
if (bch2_bkey_has_target(c, k, opts.promote_target))
- return -BCH_ERR_nopromote_already_promoted;
+ return bch_err_throw(c, nopromote_already_promoted);
if (bkey_extent_is_unwritten(k))
- return -BCH_ERR_nopromote_unwritten;
+ return bch_err_throw(c, nopromote_unwritten);
if (bch2_target_congested(c, opts.promote_target))
- return -BCH_ERR_nopromote_congested;
+ return bch_err_throw(c, nopromote_congested);
}
if (rhashtable_lookup_fast(&c->promote_table, &pos,
bch_promote_params))
- return -BCH_ERR_nopromote_in_flight;
+ return bch_err_throw(c, nopromote_in_flight);
return 0;
}
@@ -167,6 +166,7 @@ static noinline void promote_free(struct bch_read_bio *rbio)
BUG_ON(ret);
async_object_list_del(c, promote, op->list_idx);
+ async_object_list_del(c, rbio, rbio->list_idx);
bch2_data_update_exit(&op->write);
@@ -240,7 +240,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
if (!op) {
- ret = -BCH_ERR_nopromote_enomem;
+ ret = bch_err_throw(c, nopromote_enomem);
goto err_put;
}
@@ -249,7 +249,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
bch_promote_params)) {
- ret = -BCH_ERR_nopromote_in_flight;
+ ret = bch_err_throw(c, nopromote_in_flight);
goto err;
}
@@ -344,6 +344,10 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
*bounce = true;
*read_full = promote_full;
+
+ if (have_io_error(failed))
+ orig->self_healing = true;
+
return promote;
nopromote:
trace_io_read_nopromote(c, ret);
@@ -453,6 +457,10 @@ static void bch2_rbio_done(struct bch_read_bio *rbio)
if (rbio->start_time)
bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
rbio->start_time);
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
+ if (rbio->list_idx)
+ async_object_list_del(rbio->c, rbio, rbio->list_idx);
+#endif
bio_endio(&rbio->bio);
}
@@ -545,7 +553,7 @@ retry:
if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
/* extent we wanted to read no longer exists: */
- rbio->ret = -BCH_ERR_data_read_key_overwritten;
+ rbio->ret = bch_err_throw(trans->c, data_read_key_overwritten);
goto err;
}
@@ -636,12 +644,15 @@ static void bch2_rbio_retry(struct work_struct *work)
prt_str(&buf, "(internal move) ");
prt_str(&buf, "data read error, ");
- if (!ret)
+ if (!ret) {
prt_str(&buf, "successful retry");
- else
+ if (rbio->self_healing)
+ prt_str(&buf, ", self healing");
+ } else
prt_str(&buf, bch2_err_str(ret));
prt_newline(&buf);
+
if (!bkey_deleted(&sk.k->k)) {
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k));
prt_newline(&buf);
@@ -1036,7 +1047,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) &&
!orig->data_update)
- return -BCH_ERR_extent_poisoned;
+ return bch_err_throw(c, extent_poisoned);
retry_pick:
ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);
@@ -1074,7 +1085,7 @@ retry_pick:
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
- ret = -BCH_ERR_data_read_no_encryption_key;
+ ret = bch_err_throw(c, data_read_no_encryption_key);
goto err;
}
@@ -1128,7 +1139,7 @@ retry_pick:
if (ca)
enumerated_ref_put(&ca->io_ref[READ],
BCH_DEV_READ_REF_io_read);
- rbio->ret = -BCH_ERR_data_read_buffer_too_small;
+ rbio->ret = bch_err_throw(c, data_read_buffer_too_small);
goto out_read_done;
}
@@ -1333,7 +1344,7 @@ hole:
* have to signal that:
*/
if (u)
- orig->ret = -BCH_ERR_data_read_key_overwritten;
+ orig->ret = bch_err_throw(c, data_read_key_overwritten);
zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
@@ -1485,7 +1496,12 @@ void bch2_read_bio_to_text(struct printbuf *out, struct bch_read_bio *rbio)
prt_printf(out, "have_ioref:\t%u\n", rbio->have_ioref);
prt_printf(out, "narrow_crcs:\t%u\n", rbio->narrow_crcs);
prt_printf(out, "context:\t%u\n", rbio->context);
- prt_printf(out, "ret:\t%s\n", bch2_err_str(rbio->ret));
+
+ int ret = READ_ONCE(rbio->ret);
+ if (ret < 0)
+ prt_printf(out, "ret:\t%s\n", bch2_err_str(ret));
+ else
+ prt_printf(out, "ret:\t%i\n", ret);
prt_printf(out, "flags:\t");
bch2_prt_bitflags(out, bch2_read_bio_flags, rbio->flags);
@@ -1510,18 +1526,18 @@ int bch2_fs_io_read_init(struct bch_fs *c)
c->opts.btree_node_size,
c->opts.encoded_extent_max) /
PAGE_SIZE, 0))
- return -BCH_ERR_ENOMEM_bio_bounce_pages_init;
+ return bch_err_throw(c, ENOMEM_bio_bounce_pages_init);
if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
BIOSET_NEED_BVECS))
- return -BCH_ERR_ENOMEM_bio_read_init;
+ return bch_err_throw(c, ENOMEM_bio_read_init);
if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
BIOSET_NEED_BVECS))
- return -BCH_ERR_ENOMEM_bio_read_split_init;
+ return bch_err_throw(c, ENOMEM_bio_read_split_init);
if (rhashtable_init(&c->promote_table, &bch_promote_params))
- return -BCH_ERR_ENOMEM_promote_table_init;
+ return bch_err_throw(c, ENOMEM_promote_table_init);
return 0;
}
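The rcu_read_lock()/rcu_read_unlock() pairs in this series are converted to the scope-based guard from <linux/cleanup.h>: the unlock runs automatically at scope exit, so early returns can no longer leak the read-side critical section. The pattern in miniature (the lookup helper is hypothetical):

	static bool example(struct bch_fs *c, unsigned dev)
	{
		guard(rcu)();		/* rcu_read_unlock() at scope exit */

		struct bch_dev *ca = rcu_dereference(c->devs[dev]);
		return ca && do_lookup(ca);	/* hypothetical; returning here is safe */
	}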
diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h
index c08b9c047b3e..9c5ddbf861b3 100644
--- a/fs/bcachefs/io_read.h
+++ b/fs/bcachefs/io_read.h
@@ -44,6 +44,7 @@ struct bch_read_bio {
have_ioref:1,
narrow_crcs:1,
saw_error:1,
+ self_healing:1,
context:2;
};
u16 _state;
@@ -91,6 +92,8 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans,
return 0;
*data_btree = BTREE_ID_reflink;
+
+ struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter,
offset_into_extent,
@@ -102,10 +105,10 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans,
if (bkey_deleted(k.k)) {
bch2_trans_iter_exit(trans, &iter);
- return -BCH_ERR_missing_indirect_extent;
+ return bch_err_throw(c, missing_indirect_extent);
}
- bch2_bkey_buf_reassemble(extent, trans->c, k);
+ bch2_bkey_buf_reassemble(extent, c, k);
bch2_trans_iter_exit(trans, &iter);
return 0;
}
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 52a60982a66b..88b1eec8eff3 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -558,6 +558,7 @@ static void bch2_write_done(struct closure *cl)
static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
{
+ struct bch_fs *c = op->c;
struct keylist *keys = &op->insert_keys;
struct bkey_i *src, *dst = keys->keys, *n;
@@ -569,7 +570,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
test_bit(ptr->dev, op->failed.d));
if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
- return -BCH_ERR_data_write_io;
+ return bch_err_throw(c, data_write_io);
}
if (dst != src)
@@ -976,7 +977,7 @@ csum_err:
op->crc.csum_type < BCH_CSUM_NR
? __bch2_csum_types[op->crc.csum_type]
: "(unknown)");
- return -BCH_ERR_data_write_csum;
+ return bch_err_throw(c, data_write_csum);
}
static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
@@ -1208,16 +1209,13 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op,
e = bkey_s_c_to_extent(k);
- rcu_read_lock();
+ guard(rcu)();
extent_for_each_ptr_decode(e, p, entry) {
- if (crc_is_encoded(p.crc) || p.has_ec) {
- rcu_read_unlock();
+ if (crc_is_encoded(p.crc) || p.has_ec)
return false;
- }
replicas += bch2_extent_ptr_durability(c, &p);
}
- rcu_read_unlock();
return replicas >= op->opts.data_replicas;
}
@@ -1290,7 +1288,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
static void __bch2_nocow_write_done(struct bch_write_op *op)
{
if (unlikely(op->flags & BCH_WRITE_io_error)) {
- op->error = -BCH_ERR_data_write_io;
+ op->error = bch_err_throw(op->c, data_write_io);
} else if (unlikely(op->flags & BCH_WRITE_convert_unwritten))
bch2_nocow_write_convert_unwritten(op);
}
@@ -1483,10 +1481,10 @@ err_bucket_stale:
"pointer to invalid bucket in nocow path on device %llu\n %s",
stale_at->b.inode,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = -BCH_ERR_data_write_invalid_ptr;
+ ret = bch_err_throw(c, data_write_invalid_ptr);
} else {
/* We can retry this: */
- ret = -BCH_ERR_transaction_restart;
+ ret = bch_err_throw(c, transaction_restart);
}
printbuf_exit(&buf);
@@ -1693,18 +1691,18 @@ CLOSURE_CALLBACK(bch2_write)
if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) {
bch2_write_op_error(op, op->pos.offset, "misaligned write");
- op->error = -BCH_ERR_data_write_misaligned;
+ op->error = bch_err_throw(c, data_write_misaligned);
goto err;
}
if (c->opts.nochanges) {
- op->error = -BCH_ERR_erofs_no_writes;
+ op->error = bch_err_throw(c, erofs_no_writes);
goto err;
}
if (!(op->flags & BCH_WRITE_move) &&
!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_write)) {
- op->error = -BCH_ERR_erofs_no_writes;
+ op->error = bch_err_throw(c, erofs_no_writes);
goto err;
}
@@ -1776,7 +1774,7 @@ int bch2_fs_io_write_init(struct bch_fs *c)
{
if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) ||
bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0))
- return -BCH_ERR_ENOMEM_bio_write_init;
+ return bch_err_throw(c, ENOMEM_bio_write_init);
return 0;
}
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 09b70fd140a1..ddfeb0dafc9d 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -397,7 +397,7 @@ static int journal_entry_open(struct journal *j)
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
if (j->blocked)
- return -BCH_ERR_journal_blocked;
+ return bch_err_throw(c, journal_blocked);
if (j->cur_entry_error)
return j->cur_entry_error;
@@ -407,23 +407,23 @@ static int journal_entry_open(struct journal *j)
return ret;
if (!fifo_free(&j->pin))
- return -BCH_ERR_journal_pin_full;
+ return bch_err_throw(c, journal_pin_full);
if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf))
- return -BCH_ERR_journal_max_in_flight;
+ return bch_err_throw(c, journal_max_in_flight);
if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR)
- return -BCH_ERR_journal_max_open;
+ return bch_err_throw(c, journal_max_open);
if (unlikely(journal_cur_seq(j) >= JOURNAL_SEQ_MAX)) {
bch_err(c, "cannot start: journal seq overflow");
if (bch2_fs_emergency_read_only_locked(c))
bch_err(c, "fatal error - emergency read only");
- return -BCH_ERR_journal_shutdown;
+ return bch_err_throw(c, journal_shutdown);
}
if (!j->free_buf && !buf->data)
- return -BCH_ERR_journal_buf_enomem; /* will retry after write completion frees up a buf */
+ return bch_err_throw(c, journal_buf_enomem); /* will retry after write completion frees up a buf */
BUG_ON(!j->cur_entry_sectors);
@@ -447,7 +447,7 @@ static int journal_entry_open(struct journal *j)
u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
if (u64s <= (ssize_t) j->early_journal_entries.nr)
- return -BCH_ERR_journal_full;
+ return bch_err_throw(c, journal_full);
if (fifo_empty(&j->pin) && j->reclaim_thread)
wake_up_process(j->reclaim_thread);
@@ -464,7 +464,7 @@ static int journal_entry_open(struct journal *j)
journal_cur_seq(j));
if (bch2_fs_emergency_read_only_locked(c))
bch_err(c, "fatal error - emergency read only");
- return -BCH_ERR_journal_shutdown;
+ return bch_err_throw(c, journal_shutdown);
}
BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
@@ -597,16 +597,16 @@ retry:
return ret;
if (j->blocked)
- return -BCH_ERR_journal_blocked;
+ return bch_err_throw(c, journal_blocked);
if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
- ret = -BCH_ERR_journal_full;
+ ret = bch_err_throw(c, journal_full);
can_discard = j->can_discard;
goto out;
}
if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) {
- ret = -BCH_ERR_journal_max_in_flight;
+ ret = bch_err_throw(c, journal_max_in_flight);
goto out;
}
@@ -647,7 +647,7 @@ out:
goto retry;
if (journal_error_check_stuck(j, ret, flags))
- ret = -BCH_ERR_journal_stuck;
+ ret = bch_err_throw(c, journal_stuck);
if (ret == -BCH_ERR_journal_max_in_flight &&
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) &&
@@ -708,10 +708,9 @@ static unsigned max_dev_latency(struct bch_fs *c)
{
u64 nsecs = 0;
- rcu_read_lock();
+ guard(rcu)();
for_each_rw_member_rcu(c, ca)
nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration);
- rcu_read_unlock();
return nsecs_to_jiffies(nsecs);
}
@@ -813,6 +812,7 @@ out:
int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
struct closure *parent)
{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf;
int ret = 0;
@@ -828,7 +828,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
/* Recheck under lock: */
if (j->err_seq && seq >= j->err_seq) {
- ret = -BCH_ERR_journal_flush_err;
+ ret = bch_err_throw(c, journal_flush_err);
goto out;
}
@@ -999,7 +999,7 @@ int bch2_journal_meta(struct journal *j)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_journal))
- return -BCH_ERR_erofs_no_writes;
+ return bch_err_throw(c, erofs_no_writes);
int ret = __bch2_journal_meta(j);
enumerated_ref_put(&c->writes, BCH_WRITE_REF_journal);
@@ -1082,6 +1082,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou
if (open && !*blocked) {
__bch2_journal_block(j);
+ s.v = atomic64_read_acquire(&j->reservations.counter);
*blocked = true;
}
@@ -1132,7 +1133,7 @@ static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr,
new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL);
new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL);
if (!bu || !ob || !new_buckets || !new_bucket_seq) {
- ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets;
+ ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets);
goto err_free;
}
@@ -1283,7 +1284,7 @@ static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca
ret = 0; /* wait and retry */
bch2_disk_reservation_put(c, &disk_res);
- closure_sync(&cl);
+ bch2_wait_on_allocator(c, &cl);
}
return ret;
@@ -1304,6 +1305,66 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
return ret;
}
+int bch2_dev_journal_bucket_delete(struct bch_dev *ca, u64 b)
+{
+ struct bch_fs *c = ca->fs;
+ struct journal *j = &c->journal;
+ struct journal_device *ja = &ca->journal;
+
+ guard(mutex)(&c->sb_lock);
+ unsigned pos;
+ for (pos = 0; pos < ja->nr; pos++)
+ if (ja->buckets[pos] == b)
+ break;
+
+ if (pos == ja->nr) {
+ bch_err(ca, "journal bucket %llu not found when deleting", b);
+ return -EINVAL;
+ }
+
+	u64 *new_buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
+ if (!new_buckets)
+ return bch_err_throw(c, ENOMEM_set_nr_journal_buckets);
+
+ memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
+ memmove(&new_buckets[pos],
+ &new_buckets[pos + 1],
+ (ja->nr - 1 - pos) * sizeof(new_buckets[0]));
+
+	int ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, ja->nr - 1) ?:
+ bch2_write_super(c);
+ if (ret) {
+ kfree(new_buckets);
+ return ret;
+ }
+
+ scoped_guard(spinlock, &j->lock) {
+ if (pos < ja->discard_idx)
+ --ja->discard_idx;
+ if (pos < ja->dirty_idx_ondisk)
+ --ja->dirty_idx_ondisk;
+ if (pos < ja->dirty_idx)
+ --ja->dirty_idx;
+ if (pos < ja->cur_idx)
+ --ja->cur_idx;
+
+ ja->nr--;
+
+ memmove(&ja->buckets[pos],
+ &ja->buckets[pos + 1],
+ (ja->nr - pos) * sizeof(ja->buckets[0]));
+
+ memmove(&ja->bucket_seq[pos],
+ &ja->bucket_seq[pos + 1],
+ (ja->nr - pos) * sizeof(ja->bucket_seq[0]));
+
+ bch2_journal_space_available(j);
+ }
+
+ kfree(new_buckets);
+ return 0;
+}
+
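Both the superblock copy and the live ja->buckets/ja->bucket_seq arrays are edited with the standard remove-at-pos idiom, and every ring cursor at or past the deleted slot shifts down by one. The array half of that, reduced to its core:

	/* remove element `pos' from an array of length `nr': */
	memmove(&a[pos], &a[pos + 1], (nr - 1 - pos) * sizeof(a[0]));
	nr--;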
int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
{
struct bch_fs *c = ca->fs;
@@ -1313,14 +1374,14 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
bch_err(c, "cannot allocate journal, filesystem is an unresized image file");
- return -BCH_ERR_erofs_filesystem_full;
+ return bch_err_throw(c, erofs_filesystem_full);
}
unsigned nr;
int ret;
if (dynamic_fault("bcachefs:add:journal_alloc")) {
- ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets;
+ ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets);
goto err;
}
@@ -1414,14 +1475,13 @@ void bch2_fs_journal_stop(struct journal *j)
clear_bit(JOURNAL_running, &j->flags);
}
-int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
+int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_entry_pin_list *p;
struct journal_replay *i, **_i;
struct genradix_iter iter;
bool had_entries = false;
- u64 last_seq = cur_seq, nr, seq;
/*
*
@@ -1435,17 +1495,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
return -EINVAL;
}
- genradix_for_each_reverse(&c->journal_entries, iter, _i) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- last_seq = le64_to_cpu(i->j.last_seq);
- break;
- }
+ /* Clean filesystem? */
+ if (!last_seq)
+ last_seq = cur_seq;
- nr = cur_seq - last_seq;
+ u64 nr = cur_seq - last_seq;
/*
* Extra fudge factor, in case we crashed when the journal pin fifo was
@@ -1459,7 +1513,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
if (!j->pin.data) {
bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
- return -BCH_ERR_ENOMEM_journal_pin_fifo;
+ return bch_err_throw(c, ENOMEM_journal_pin_fifo);
}
j->replay_journal_seq = last_seq;
@@ -1472,6 +1526,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
j->pin.back = cur_seq;
atomic64_set(&j->seq, cur_seq - 1);
+ u64 seq;
fifo_for_each_entry_ptr(p, &j->pin, seq)
journal_pin_list_init(p, 1);
@@ -1547,6 +1602,7 @@ void bch2_dev_journal_exit(struct bch_dev *ca)
int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
{
+ struct bch_fs *c = ca->fs;
struct journal_device *ja = &ca->journal;
struct bch_sb_field_journal *journal_buckets =
bch2_sb_field_get(sb, journal);
@@ -1566,7 +1622,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
if (!ja->bucket_seq)
- return -BCH_ERR_ENOMEM_dev_journal_init;
+ return bch_err_throw(c, ENOMEM_dev_journal_init);
unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
@@ -1574,7 +1630,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
nr_bvecs), GFP_KERNEL);
if (!ja->bio[i])
- return -BCH_ERR_ENOMEM_dev_journal_init;
+ return bch_err_throw(c, ENOMEM_dev_journal_init);
ja->bio[i]->ca = ca;
ja->bio[i]->buf_idx = i;
@@ -1583,7 +1639,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
if (!ja->buckets)
- return -BCH_ERR_ENOMEM_dev_journal_init;
+ return bch_err_throw(c, ENOMEM_dev_journal_init);
if (journal_buckets_v2) {
unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
@@ -1637,10 +1693,12 @@ void bch2_fs_journal_init_early(struct journal *j)
int bch2_fs_journal_init(struct journal *j)
{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN;
j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL);
if (!j->free_buf)
- return -BCH_ERR_ENOMEM_journal_buf;
+ return bch_err_throw(c, ENOMEM_journal_buf);
for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
j->buf[i].idx = i;
@@ -1648,7 +1706,7 @@ int bch2_fs_journal_init(struct journal *j)
j->wq = alloc_workqueue("bcachefs_journal",
WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512);
if (!j->wq)
- return -BCH_ERR_ENOMEM_fs_other_alloc;
+ return bch_err_throw(c, ENOMEM_fs_other_alloc);
return 0;
}
@@ -1672,7 +1730,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
printbuf_tabstop_push(out, 28);
out->atomic++;
- rcu_read_lock();
+ guard(rcu)();
s = READ_ONCE(j->reservations);
prt_printf(out, "flags:\t");
@@ -1763,8 +1821,6 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required);
- rcu_read_unlock();
-
--out->atomic;
}
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 8ff00a0ec778..977907038d98 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -444,15 +444,16 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u
void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
-int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
- unsigned nr);
+int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned);
+int bch2_dev_journal_bucket_delete(struct bch_dev *, u64);
+
int bch2_dev_journal_alloc(struct bch_dev *, bool);
int bch2_fs_journal_alloc(struct bch_fs *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
void bch2_fs_journal_stop(struct journal *);
-int bch2_fs_journal_start(struct journal *, u64);
+int bch2_fs_journal_start(struct journal *, u64, u64);
void bch2_journal_set_replay_done(struct journal *);
void bch2_dev_journal_exit(struct bch_dev *);
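With the extra parameter, bch2_fs_journal_start() no longer rescans c->journal_entries to find last_seq; the caller supplies it, and passing 0 (a clean filesystem) is mapped to cur_seq internally. A hypothetical recovery-side call under the new signature:

	u64 last_seq = 0, blacklist_seq, start_seq;

	ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &start_seq) ?:
	      bch2_fs_journal_start(&c->journal, last_seq, start_seq);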
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 63bb207208b2..9e028dbcc3d0 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -49,25 +49,27 @@ void bch2_journal_pos_from_member_info_resume(struct bch_fs *c)
mutex_unlock(&c->sb_lock);
}
-void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
- struct journal_replay *j)
+static void bch2_journal_ptr_to_text(struct printbuf *out, struct bch_fs *c, struct journal_ptr *p)
+{
+ struct bch_dev *ca = bch2_dev_tryget_noerror(c, p->dev);
+ prt_printf(out, "%s %u:%u:%u (sector %llu)",
+ ca ? ca->name : "(invalid dev)",
+ p->dev, p->bucket, p->bucket_offset, p->sector);
+ bch2_dev_put(ca);
+}
+
+void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j)
{
darray_for_each(j->ptrs, i) {
if (i != j->ptrs.data)
prt_printf(out, " ");
- prt_printf(out, "%u:%u:%u (sector %llu)",
- i->dev, i->bucket, i->bucket_offset, i->sector);
+ bch2_journal_ptr_to_text(out, c, i);
}
}
-static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
- struct journal_replay *j)
+static void bch2_journal_datetime_to_text(struct printbuf *out, struct jset *j)
{
- prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
-
- bch2_journal_ptrs_to_text(out, c, j);
-
- for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) {
+ for_each_jset_entry_type(entry, j, BCH_JSET_ENTRY_datetime) {
struct jset_entry_datetime *datetime =
container_of(entry, struct jset_entry_datetime, entry);
bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
@@ -75,6 +77,15 @@ static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
}
}
+static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
+ struct journal_replay *j)
+{
+ prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
+ bch2_journal_datetime_to_text(out, &j->j);
+ prt_char(out, ' ');
+ bch2_journal_ptrs_to_text(out, c, j);
+}
+
static struct nonce journal_nonce(const struct jset *jset)
{
return (struct nonce) {{
@@ -149,6 +160,9 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
struct printbuf buf = PRINTBUF;
int ret = JOURNAL_ENTRY_ADD_OK;
+ if (last_seq && c->opts.journal_rewind)
+ last_seq = min(last_seq, c->opts.journal_rewind);
+
if (!c->journal.oldest_seq_found_ondisk ||
le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk)
c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq);
@@ -188,7 +202,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
GFP_KERNEL);
if (!_i)
- return -BCH_ERR_ENOMEM_journal_entry_add;
+ return bch_err_throw(c, ENOMEM_journal_entry_add);
/*
* Duplicate journal entries? If so we want the one that didn't have a
@@ -231,7 +245,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
replace:
i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
if (!i)
- return -BCH_ERR_ENOMEM_journal_entry_add;
+ return bch_err_throw(c, ENOMEM_journal_entry_add);
darray_init(&i->ptrs);
i->csum_good = entry_ptr.csum_good;
@@ -311,7 +325,7 @@ static void journal_entry_err_msg(struct printbuf *out,
bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \
if (bch2_fs_inconsistent(c, \
"corrupt metadata before write: %s\n", _buf.buf)) {\
- ret = -BCH_ERR_fsck_errors_not_fixed; \
+ ret = bch_err_throw(c, fsck_errors_not_fixed); \
goto fsck_err; \
} \
break; \
@@ -418,6 +432,10 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs
bool first = true;
jset_entry_for_each_key(entry, k) {
+ /* We may be called on entries that haven't been validated: */
+ if (!k->k.u64s)
+ break;
+
if (!first) {
prt_newline(out);
bch2_prt_jset_entry_type(out, entry->type);
@@ -1005,19 +1023,19 @@ struct journal_read_buf {
size_t size;
};
-static int journal_read_buf_realloc(struct journal_read_buf *b,
+static int journal_read_buf_realloc(struct bch_fs *c, struct journal_read_buf *b,
size_t new_size)
{
void *n;
/* the bios are sized for this many pages, max: */
if (new_size > JOURNAL_ENTRY_SIZE_MAX)
- return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
+ return bch_err_throw(c, ENOMEM_journal_read_buf_realloc);
new_size = roundup_pow_of_two(new_size);
n = kvmalloc(new_size, GFP_KERNEL);
if (!n)
- return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
+ return bch_err_throw(c, ENOMEM_journal_read_buf_realloc);
kvfree(b->data);
b->data = n;
@@ -1037,7 +1055,6 @@ static int journal_read_bucket(struct bch_dev *ca,
u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
end = offset + ca->mi.bucket_size;
bool saw_bad = false, csum_good;
- struct printbuf err = PRINTBUF;
int ret = 0;
pr_debug("reading %u", bucket);
@@ -1053,7 +1070,7 @@ reread:
bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
if (!bio)
- return -BCH_ERR_ENOMEM_journal_read_bucket;
+ return bch_err_throw(c, ENOMEM_journal_read_bucket);
bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);
bio->bi_iter.bi_sector = offset;
@@ -1064,7 +1081,7 @@ reread:
kfree(bio);
if (!ret && bch2_meta_read_fault("journal"))
- ret = -BCH_ERR_EIO_fault_injected;
+ ret = bch_err_throw(c, EIO_fault_injected);
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
submit_time, !ret);
@@ -1078,7 +1095,7 @@ reread:
* found on a different device, and missing or
* no journal entries will be handled later
*/
- goto out;
+ return 0;
}
j = buf->data;
@@ -1092,15 +1109,15 @@ reread:
break;
case JOURNAL_ENTRY_REREAD:
if (vstruct_bytes(j) > buf->size) {
- ret = journal_read_buf_realloc(buf,
+ ret = journal_read_buf_realloc(c, buf,
vstruct_bytes(j));
if (ret)
- goto err;
+ return ret;
}
goto reread;
case JOURNAL_ENTRY_NONE:
if (!saw_bad)
- goto out;
+ return 0;
/*
* On checksum error we don't really trust the size
* field of the journal entry we read, so try reading
@@ -1109,7 +1126,7 @@ reread:
sectors = block_sectors(c);
goto next_block;
default:
- goto err;
+ return ret;
}
if (le64_to_cpu(j->seq) > ja->highest_seq_found) {
@@ -1126,22 +1143,20 @@ reread:
* bucket:
*/
if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
- goto out;
+ return 0;
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
- enum bch_csum_type csum_type = JSET_CSUM_TYPE(j);
struct bch_csum csum;
csum_good = jset_csum_good(c, j, &csum);
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
if (!csum_good) {
- bch_err_dev_ratelimited(ca, "%s",
- (printbuf_reset(&err),
- prt_str(&err, "journal "),
- bch2_csum_err_msg(&err, csum_type, j->csum, csum),
- err.buf));
+ /*
+ * Don't print an error here, we'll print the error
+ * later if we need this journal entry
+ */
saw_bad = true;
}
@@ -1153,6 +1168,7 @@ reread:
mutex_lock(&jlist->lock);
ret = journal_entry_add(c, ca, (struct journal_ptr) {
.csum_good = csum_good,
+ .csum = csum,
.dev = ca->dev_idx,
.bucket = bucket,
.bucket_offset = offset -
@@ -1167,7 +1183,7 @@ reread:
case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
break;
default:
- goto err;
+ return ret;
}
next_block:
pr_debug("next");
@@ -1176,11 +1192,7 @@ next_block:
j = ((void *) j) + (sectors << 9);
}
-out:
- ret = 0;
-err:
- printbuf_exit(&err);
- return ret;
+ return 0;
}
static CLOSURE_CALLBACK(bch2_journal_read_device)
@@ -1197,7 +1209,7 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
if (!ja->nr)
goto out;
- ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
+ ret = journal_read_buf_realloc(c, &buf, PAGE_SIZE);
if (ret)
goto err;
@@ -1229,13 +1241,105 @@ err:
goto out;
}
+noinline_for_stack
+static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_replay *j)
+{
+ struct printbuf buf = PRINTBUF;
+ enum bch_csum_type csum_type = JSET_CSUM_TYPE(&j->j);
+ bool have_good = false;
+
+ prt_printf(&buf, "invalid journal checksum(s) at seq %llu ", le64_to_cpu(j->j.seq));
+ bch2_journal_datetime_to_text(&buf, &j->j);
+ prt_newline(&buf);
+
+ darray_for_each(j->ptrs, ptr)
+ if (!ptr->csum_good) {
+ bch2_journal_ptr_to_text(&buf, c, ptr);
+ prt_char(&buf, ' ');
+ bch2_csum_to_text(&buf, csum_type, ptr->csum);
+ prt_newline(&buf);
+ } else {
+ have_good = true;
+ }
+
+ prt_printf(&buf, "should be ");
+ bch2_csum_to_text(&buf, csum_type, j->j.csum);
+
+ if (have_good)
+ prt_printf(&buf, "\n(had good copy on another device)");
+
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+}
+
+noinline_for_stack
+static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq)
+{
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ struct genradix_iter radix_iter;
+ struct journal_replay *i, **_i, *prev = NULL;
+ u64 seq = start_seq;
+
+ genradix_for_each(&c->journal_entries, radix_iter, _i) {
+ i = *_i;
+
+ if (journal_replay_ignore(i))
+ continue;
+
+ BUG_ON(seq > le64_to_cpu(i->j.seq));
+
+ while (seq < le64_to_cpu(i->j.seq)) {
+ while (seq < le64_to_cpu(i->j.seq) &&
+ bch2_journal_seq_is_blacklisted(c, seq, false))
+ seq++;
+
+ if (seq == le64_to_cpu(i->j.seq))
+ break;
+
+ u64 missing_start = seq;
+
+ while (seq < le64_to_cpu(i->j.seq) &&
+ !bch2_journal_seq_is_blacklisted(c, seq, false))
+ seq++;
+
+ u64 missing_end = seq - 1;
+
+ printbuf_reset(&buf);
+ prt_printf(&buf, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
+ missing_start, missing_end,
+ start_seq, end_seq);
+
+ prt_printf(&buf, "\nprev at ");
+ if (prev) {
+ bch2_journal_ptrs_to_text(&buf, c, prev);
+ prt_printf(&buf, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
+ } else
+ prt_printf(&buf, "(none)");
+
+ prt_printf(&buf, "\nnext at ");
+ bch2_journal_ptrs_to_text(&buf, c, i);
+ prt_printf(&buf, ", continue?");
+
+ fsck_err(c, journal_entries_missing, "%s", buf.buf);
+ }
+
+ prev = i;
+ seq++;
+ }
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
int bch2_journal_read(struct bch_fs *c,
u64 *last_seq,
u64 *blacklist_seq,
u64 *start_seq)
{
struct journal_list jlist;
- struct journal_replay *i, **_i, *prev = NULL;
+ struct journal_replay *i, **_i;
struct genradix_iter radix_iter;
struct printbuf buf = PRINTBUF;
bool degraded = false, last_write_torn = false;
@@ -1326,14 +1430,24 @@ int bch2_journal_read(struct bch_fs *c,
return 0;
}
- bch_info(c, "journal read done, replaying entries %llu-%llu",
- *last_seq, *blacklist_seq - 1);
+ printbuf_reset(&buf);
+ prt_printf(&buf, "journal read done, replaying entries %llu-%llu",
+ *last_seq, *blacklist_seq - 1);
- if (*start_seq != *blacklist_seq)
- bch_info(c, "dropped unflushed entries %llu-%llu",
- *blacklist_seq, *start_seq - 1);
+ /*
+	 * Drop blacklisted entries and entries older than last_seq (or the
+	 * start of the journal rewind):
+ */
+ u64 drop_before = *last_seq;
+ if (c->opts.journal_rewind) {
+ drop_before = min(drop_before, c->opts.journal_rewind);
+ prt_printf(&buf, " (rewinding from %llu)", c->opts.journal_rewind);
+ }
- /* Drop blacklisted entries and entries older than last_seq: */
+ *last_seq = drop_before;
+ if (*start_seq != *blacklist_seq)
+ prt_printf(&buf, " (unflushed %llu-%llu)", *blacklist_seq, *start_seq - 1);
+ bch_info(c, "%s", buf.buf);
genradix_for_each(&c->journal_entries, radix_iter, _i) {
i = *_i;
@@ -1341,7 +1455,7 @@ int bch2_journal_read(struct bch_fs *c,
continue;
seq = le64_to_cpu(i->j.seq);
- if (seq < *last_seq) {
+ if (seq < drop_before) {
journal_replay_free(c, i, false);
continue;
}
@@ -1354,56 +1468,9 @@ int bch2_journal_read(struct bch_fs *c,
}
}
- /* Check for missing entries: */
- seq = *last_seq;
- genradix_for_each(&c->journal_entries, radix_iter, _i) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- BUG_ON(seq > le64_to_cpu(i->j.seq));
-
- while (seq < le64_to_cpu(i->j.seq)) {
- u64 missing_start, missing_end;
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
-
- while (seq < le64_to_cpu(i->j.seq) &&
- bch2_journal_seq_is_blacklisted(c, seq, false))
- seq++;
-
- if (seq == le64_to_cpu(i->j.seq))
- break;
-
- missing_start = seq;
-
- while (seq < le64_to_cpu(i->j.seq) &&
- !bch2_journal_seq_is_blacklisted(c, seq, false))
- seq++;
-
- if (prev) {
- bch2_journal_ptrs_to_text(&buf1, c, prev);
- prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
- } else
- prt_printf(&buf1, "(none)");
- bch2_journal_ptrs_to_text(&buf2, c, i);
-
- missing_end = seq - 1;
- fsck_err(c, journal_entries_missing,
- "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
- "prev at %s\n"
- "next at %s, continue?",
- missing_start, missing_end,
- *last_seq, *blacklist_seq - 1,
- buf1.buf, buf2.buf);
-
- printbuf_exit(&buf1);
- printbuf_exit(&buf2);
- }
-
- prev = i;
- seq++;
- }
+ ret = bch2_journal_check_for_missing(c, drop_before, *blacklist_seq - 1);
+ if (ret)
+ goto err;
genradix_for_each(&c->journal_entries, radix_iter, _i) {
union bch_replicas_padded replicas = {
@@ -1416,15 +1483,15 @@ int bch2_journal_read(struct bch_fs *c,
if (journal_replay_ignore(i))
continue;
- darray_for_each(i->ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
-
- if (!ptr->csum_good)
- bch_err_dev_offset(ca, ptr->sector,
- "invalid journal checksum, seq %llu%s",
- le64_to_cpu(i->j.seq),
- i->csum_good ? " (had good copy on another device)" : "");
- }
+ /*
+ * Don't print checksum errors until we know we're going to use
+ * a given journal entry:
+ */
+ darray_for_each(i->ptrs, ptr)
+ if (!ptr->csum_good) {
+ bch2_journal_print_checksum_error(c, i);
+ break;
+ }
ret = jset_validate(c,
bch2_dev_have_ref(c, i->ptrs.data[0].dev),
@@ -1467,7 +1534,7 @@ static void journal_advance_devs_to_next_bucket(struct journal *j,
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- rcu_read_lock();
+ guard(rcu)();
darray_for_each(*devs, i) {
struct bch_dev *ca = rcu_dereference(c->devs[*i]);
if (!ca)
@@ -1489,7 +1556,6 @@ static void journal_advance_devs_to_next_bucket(struct journal *j,
ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq);
}
}
- rcu_read_unlock();
}
static void __journal_write_alloc(struct journal *j,
@@ -1559,7 +1625,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
retry_target:
devs = target_rw_devs(c, BCH_DATA_journal, target);
- devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
+ bch2_dev_alloc_list(c, &j->wp.stripe, &devs, &devs_sorted);
retry_alloc:
__journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want);
@@ -1581,6 +1647,16 @@ retry_alloc:
done:
BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
+#if 0
+ /*
+ * XXX: we need a way to alert the user when we go degraded for any
+ * reason
+ */
+ if (*replicas < min(replicas_want,
+ dev_mask_nr(&c->rw_devs[BCH_DATA_free]))) {
+ }
+#endif
+
return *replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices;
}
@@ -1628,7 +1704,7 @@ static CLOSURE_CALLBACK(journal_write_done)
: j->noflush_write_time, j->write_start_time);
if (!w->devs_written.nr) {
- err = -BCH_ERR_journal_write_err;
+ err = bch_err_throw(c, journal_write_err);
} else {
bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
w->devs_written);
@@ -1640,9 +1716,10 @@ static CLOSURE_CALLBACK(journal_write_done)
bch2_log_msg_start(c, &buf);
if (err == -BCH_ERR_journal_write_err)
- prt_printf(&buf, "unable to write journal to sufficient devices");
+ prt_printf(&buf, "unable to write journal to sufficient devices\n");
else
- prt_printf(&buf, "journal write error marking replicas: %s", bch2_err_str(err));
+ prt_printf(&buf, "journal write error marking replicas: %s\n",
+ bch2_err_str(err));
bch2_fs_emergency_read_only2(c, &buf);
@@ -1690,6 +1767,7 @@ static CLOSURE_CALLBACK(journal_write_done)
closure_wake_up(&c->freelist_wait);
bch2_reset_alloc_cursors(c);
+ do_discards = true;
}
j->seq_ondisk = seq;
@@ -2058,7 +2136,7 @@ CLOSURE_CALLBACK(bch2_journal_write)
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
union bch_replicas_padded replicas;
- unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_journal]);
+ unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]);
int ret;
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
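The journal_rewind handling pulls the drop point back before last_seq so that older entries survive for replay; condensed from the hunk above:

	u64 drop_before = *last_seq;
	if (c->opts.journal_rewind)
		drop_before = min(drop_before, c->opts.journal_rewind);
	*last_seq = drop_before;	/* entries with seq < drop_before are freed */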
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index 12b39fcb4424..6fa82c4050fe 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -9,6 +9,7 @@ void bch2_journal_pos_from_member_info_resume(struct bch_fs *);
struct journal_ptr {
bool csum_good;
+ struct bch_csum csum;
u8 dev;
u32 bucket;
u32 bucket_offset;
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 70f36f6bc482..0042d43b8e57 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -83,18 +83,20 @@ static struct journal_space
journal_dev_space_available(struct journal *j, struct bch_dev *ca,
enum journal_space_from from)
{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_device *ja = &ca->journal;
unsigned sectors, buckets, unwritten;
+ unsigned bucket_size_aligned = round_down(ca->mi.bucket_size, block_sectors(c));
u64 seq;
if (from == journal_space_total)
return (struct journal_space) {
- .next_entry = ca->mi.bucket_size,
- .total = ca->mi.bucket_size * ja->nr,
+ .next_entry = bucket_size_aligned,
+ .total = bucket_size_aligned * ja->nr,
};
buckets = bch2_journal_dev_buckets_available(j, ja, from);
- sectors = ja->sectors_free;
+ sectors = round_down(ja->sectors_free, block_sectors(c));
/*
	 * Note that we don't allocate the space for a journal entry
@@ -109,7 +111,7 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca,
continue;
/* entry won't fit on this device, skip: */
- if (unwritten > ca->mi.bucket_size)
+ if (unwritten > bucket_size_aligned)
continue;
if (unwritten >= sectors) {
@@ -119,7 +121,7 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca,
}
buckets--;
- sectors = ca->mi.bucket_size;
+ sectors = bucket_size_aligned;
}
sectors -= unwritten;
@@ -127,12 +129,12 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca,
if (sectors < ca->mi.bucket_size && buckets) {
buckets--;
- sectors = ca->mi.bucket_size;
+ sectors = bucket_size_aligned;
}
return (struct journal_space) {
.next_entry = sectors,
- .total = sectors + buckets * ca->mi.bucket_size,
+ .total = sectors + buckets * bucket_size_aligned,
};
}
@@ -146,7 +148,6 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
- rcu_read_lock();
for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
if (!ca->journal.nr ||
!ca->mi.durability)
@@ -164,12 +165,17 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
array_insert_item(dev_space, nr_devs, pos, space);
}
- rcu_read_unlock();
if (nr_devs < nr_devs_want)
return (struct journal_space) { 0, 0 };
/*
+ * It's possible for bucket size to be misaligned w.r.t. the filesystem
+ * block size:
+ */
+ min_bucket_size = round_down(min_bucket_size, block_sectors(c));
+
+ /*
* We sorted largest to smallest, and we want the smallest out of the
* @nr_devs_want largest devices:
*/
@@ -189,8 +195,8 @@ void bch2_journal_space_available(struct journal *j)
int ret = 0;
lockdep_assert_held(&j->lock);
+ guard(rcu)();
- rcu_read_lock();
for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
struct journal_device *ja = &ca->journal;
@@ -210,7 +216,6 @@ void bch2_journal_space_available(struct journal *j)
max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
nr_online++;
}
- rcu_read_unlock();
j->can_discard = can_discard;
@@ -221,15 +226,13 @@ void bch2_journal_space_available(struct journal *j)
prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n"
"rw journal devs:", nr_online, metadata_replicas_required(c));
- rcu_read_lock();
for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal])
prt_printf(&buf, " %s", ca->name);
- rcu_read_unlock();
bch_err(c, "%s", buf.buf);
printbuf_exit(&buf);
}
- ret = -BCH_ERR_insufficient_journal_devices;
+ ret = bch_err_throw(c, insufficient_journal_devices);
goto out;
}
@@ -243,7 +246,7 @@ void bch2_journal_space_available(struct journal *j)
total = j->space[journal_space_total].total;
if (!j->space[journal_space_discarded].next_entry)
- ret = -BCH_ERR_journal_full;
+ ret = bch_err_throw(c, journal_full);
if ((j->space[journal_space_clean_ondisk].next_entry <
j->space[journal_space_clean_ondisk].total) &&
@@ -256,8 +259,7 @@ void bch2_journal_space_available(struct journal *j)
bch2_journal_set_watermark(j);
out:
j->cur_entry_sectors = !ret
- ? round_down(j->space[journal_space_discarded].next_entry,
- block_sectors(c))
+ ? j->space[journal_space_discarded].next_entry
: 0;
j->cur_entry_error = ret;
@@ -625,9 +627,9 @@ static u64 journal_seq_to_flush(struct journal *j)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
u64 seq_to_flush = 0;
- spin_lock(&j->lock);
+ guard(spinlock)(&j->lock);
+ guard(rcu)();
- rcu_read_lock();
for_each_rw_member_rcu(c, ca) {
struct journal_device *ja = &ca->journal;
unsigned nr_buckets, bucket_to_flush;
@@ -642,15 +644,11 @@ static u64 journal_seq_to_flush(struct journal *j)
seq_to_flush = max(seq_to_flush,
ja->bucket_seq[bucket_to_flush]);
}
- rcu_read_unlock();
/* Also flush if the pin fifo is more than half full */
- seq_to_flush = max_t(s64, seq_to_flush,
- (s64) journal_cur_seq(j) -
- (j->pin.size >> 1));
- spin_unlock(&j->lock);
-
- return seq_to_flush;
+ return max_t(s64, seq_to_flush,
+ (s64) journal_cur_seq(j) -
+ (j->pin.size >> 1));
}
/**
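journal_seq_to_flush() now stacks two scope-based guards; they release in reverse declaration order on return (rcu first, then the spinlock), which is what makes the direct return at the end safe. The shape of it, with the body elided:

	static u64 example_seq_to_flush(struct journal *j)
	{
		guard(spinlock)(&j->lock);	/* spin_unlock() on return */
		guard(rcu)();
		return compute_seq(j);		/* hypothetical body */
	}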
diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c
index 62b910f2fb27..0cb9b93f13e7 100644
--- a/fs/bcachefs/journal_sb.c
+++ b/fs/bcachefs/journal_sb.c
@@ -210,7 +210,7 @@ int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca,
j = bch2_sb_field_resize(&ca->disk_sb, journal_v2,
(sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64));
if (!j)
- return -BCH_ERR_ENOSPC_sb_journal;
+ return bch_err_throw(c, ENOSPC_sb_journal);
bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index c5a7d800a0f5..af4fe416d9ec 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -78,7 +78,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
sb_blacklist_u64s(nr + 1));
if (!bl) {
- ret = -BCH_ERR_ENOSPC_sb_journal_seq_blacklist;
+ ret = bch_err_throw(c, ENOSPC_sb_journal_seq_blacklist);
goto out;
}
@@ -152,7 +152,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL);
if (!t)
- return -BCH_ERR_ENOMEM_blacklist_table_init;
+ return bch_err_throw(c, ENOMEM_blacklist_table_init);
t->nr = nr;
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index 2f63fc6d456f..57b5b3263b08 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -145,13 +145,11 @@ static u64 bkey_lru_type_idx(struct bch_fs *c,
case BCH_LRU_fragmentation: {
a = bch2_alloc_to_v4(k, &a_convert);
- rcu_read_lock();
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode);
- u64 idx = ca
+ return ca
? alloc_lru_idx_fragmentation(*a, ca)
: 0;
- rcu_read_unlock();
- return idx;
}
case BCH_LRU_stripes:
return k.k->type == KEY_TYPE_stripe
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index bb7a92270c09..f296cce95338 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -35,7 +35,7 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
nr_good = bch2_bkey_durability(c, k.s_c);
if ((!nr_good && !(flags & lost)) ||
(nr_good < replicas && !(flags & degraded)))
- return -BCH_ERR_remove_would_lose_data;
+ return bch_err_throw(c, remove_would_lose_data);
return 0;
}
@@ -156,7 +156,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c,
/* don't handle this yet: */
if (flags & BCH_FORCE_IF_METADATA_LOST)
- return -BCH_ERR_remove_with_metadata_missing_unimplemented;
+ return bch_err_throw(c, remove_with_metadata_missing_unimplemented);
trans = bch2_trans_get(c);
bch2_bkey_buf_init(&k);
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 79f4722621d5..eec591e947bd 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -38,30 +38,74 @@ const char * const bch2_data_ops_strs[] = {
NULL
};
-static void trace_io_move2(struct bch_fs *c, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
+struct evacuate_bucket_arg {
+ struct bpos bucket;
+ int gen;
+ struct data_update_opts data_opts;
+};
+
+static bool evacuate_bucket_pred(struct bch_fs *, void *,
+ enum btree_id, struct bkey_s_c,
+ struct bch_io_opts *,
+ struct data_update_opts *);
+
+static noinline void
+trace_io_move2(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
- if (trace_io_move_enabled()) {
- struct printbuf buf = PRINTBUF;
+ struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
- bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
- trace_io_move(c, buf.buf);
- printbuf_exit(&buf);
- }
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
+ bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
+ trace_io_move(c, buf.buf);
+ printbuf_exit(&buf);
}
-static void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k)
+static noinline void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k)
{
- if (trace_io_move_read_enabled()) {
- struct printbuf buf = PRINTBUF;
+ struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, k);
- trace_io_move_read(c, buf.buf);
- printbuf_exit(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ trace_io_move_read(c, buf.buf);
+ printbuf_exit(&buf);
+}
+
+static noinline void
+trace_io_move_pred2(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts,
+ move_pred_fn pred, void *_arg, bool p)
+{
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "%ps: %u", pred, p);
+
+ if (pred == evacuate_bucket_pred) {
+ struct evacuate_bucket_arg *arg = _arg;
+ prt_printf(&buf, " gen=%u", arg->gen);
}
+
+ prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
+ bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
+ trace_io_move_pred(c, buf.buf);
+ printbuf_exit(&buf);
+}
+
+static noinline void
+trace_io_move_evacuate_bucket2(struct bch_fs *c, struct bpos bucket, int gen)
+{
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "bucket: ");
+ bch2_bpos_to_text(&buf, bucket);
+ prt_printf(&buf, " gen: %i\n", gen);
+
+ trace_io_move_evacuate_bucket(c, buf.buf);
+ printbuf_exit(&buf);
}
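
These helpers follow a common tracepoint fast-path pattern: the printbuf formatting is kept out of line (noinline), and callers check the tracepoint's static-key gate first, so the formatting cost is only paid when tracing is actually enabled. A distilled sketch, with hypothetical tracepoint names:

	static noinline void trace_my_event2(struct bch_fs *c, struct bkey_s_c k)
	{
		struct printbuf buf = PRINTBUF;	/* only built when tracing is on */

		bch2_bkey_val_to_text(&buf, c, k);
		trace_my_event(c, buf.buf);	/* hypothetical tracepoint */
		printbuf_exit(&buf);
	}

	/* at the call site: */
	if (trace_my_event_enabled())		/* static-key check, ~free when off */
		trace_my_event2(c, k);
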
struct moving_io {
@@ -298,7 +342,8 @@ int bch2_move_extent(struct moving_context *ctxt,
struct bch_fs *c = trans->c;
int ret = -ENOMEM;
- trace_io_move2(c, k, &io_opts, &data_opts);
+ if (trace_io_move_enabled())
+ trace_io_move2(c, k, &io_opts, &data_opts);
this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
if (ctxt->stats)
@@ -314,16 +359,14 @@ int bch2_move_extent(struct moving_context *ctxt,
return 0;
}
- /*
- * Before memory allocations & taking nocow locks in
- * bch2_data_update_init():
- */
- bch2_trans_unlock(trans);
-
- struct moving_io *io = kzalloc(sizeof(struct moving_io), GFP_KERNEL);
+ struct moving_io *io = allocate_dropping_locks(trans, ret,
+ kzalloc(sizeof(struct moving_io), _gfp));
if (!io)
goto err;
+ if (ret)
+ goto err_free;
+
INIT_LIST_HEAD(&io->io_list);
io->write.ctxt = ctxt;
io->read_sectors = k.k->size;
@@ -343,6 +386,8 @@ int bch2_move_extent(struct moving_context *ctxt,
io->write.op.c = c;
io->write.data_opts = data_opts;
+ bch2_trans_unlock(trans);
+
ret = bch2_data_update_bios_init(&io->write, c, &io_opts);
if (ret)
goto err_free;
@@ -364,7 +409,8 @@ int bch2_move_extent(struct moving_context *ctxt,
atomic_inc(&io->b->count);
}
- trace_io_move_read2(c, k);
+ if (trace_io_move_read_enabled())
+ trace_io_move_read2(c, k);
mutex_lock(&ctxt->lock);
atomic_add(io->read_sectors, &ctxt->read_sectors);
@@ -390,9 +436,6 @@ int bch2_move_extent(struct moving_context *ctxt,
err_free:
kfree(io);
err:
- if (bch2_err_matches(ret, BCH_ERR_data_update_done))
- return 0;
-
if (bch2_err_matches(ret, EROFS) ||
bch2_err_matches(ret, BCH_ERR_transaction_restart))
return ret;
@@ -408,6 +451,9 @@ err:
trace_io_move_start_fail(c, buf.buf);
printbuf_exit(&buf);
}
+
+ if (bch2_err_matches(ret, BCH_ERR_data_update_done))
+ return 0;
return ret;
}
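
allocate_dropping_locks(), used above in place of the explicit unlock-then-kzalloc sequence, is bcachefs's helper for allocating while holding btree locks: it first attempts the allocation non-blocking, and only on failure drops the transaction's locks, retries with GFP_KERNEL, and relocks. Roughly, its expansion here has this shape (the _gfp name is supplied by the macro):

	gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN;	/* first try without blocking */
	struct moving_io *io = kzalloc(sizeof(*io), _gfp);
	if (!io) {
		_gfp = GFP_KERNEL;		/* may block: drop btree locks first */
		ret = drop_locks_do(trans, ((io = kzalloc(sizeof(*io), _gfp)), 0));
	}
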
@@ -496,6 +542,7 @@ int bch2_move_get_io_opts_one(struct btree_trans *trans,
bch2_inode_opts_get(io_opts, c, &inode);
}
bch2_trans_iter_exit(trans, &inode_iter);
out:
return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k);
}
@@ -910,7 +957,13 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
}
struct data_update_opts data_opts = {};
- if (!pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts)) {
+ bool p = pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts);
+
+ if (trace_io_move_pred_enabled())
+ trace_io_move_pred2(c, k, &io_opts, &data_opts,
+ pred, arg, p);
+
+ if (!p) {
bch2_trans_iter_exit(trans, &iter);
goto next;
}
@@ -918,7 +971,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
if (data_opts.scrub &&
!bch2_dev_idx_is_online(c, data_opts.read_dev)) {
bch2_trans_iter_exit(trans, &iter);
- ret = -BCH_ERR_device_offline;
+ ret = bch_err_throw(c, device_offline);
break;
}
@@ -993,12 +1046,6 @@ int bch2_move_data_phys(struct bch_fs *c,
return ret;
}
-struct evacuate_bucket_arg {
- struct bpos bucket;
- int gen;
- struct data_update_opts data_opts;
-};
-
static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg,
enum btree_id btree, struct bkey_s_c k,
struct bch_io_opts *io_opts,
@@ -1025,8 +1072,13 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
struct bpos bucket, int gen,
struct data_update_opts data_opts)
{
+ struct bch_fs *c = ctxt->trans->c;
struct evacuate_bucket_arg arg = { bucket, gen, data_opts, };
+ count_event(c, io_move_evacuate_bucket);
+ if (trace_io_move_evacuate_bucket_enabled())
+ trace_io_move_evacuate_bucket2(c, bucket, gen);
+
return __bch2_move_data_phys(ctxt, bucket_in_flight,
bucket.inode,
bucket.offset,
@@ -1124,7 +1176,7 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg,
? c->opts.metadata_replicas
: io_opts->data_replicas;
- rcu_read_lock();
+ guard(rcu)();
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
unsigned i = 0;
bkey_for_each_ptr(ptrs, ptr) {
@@ -1134,7 +1186,6 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg,
data_opts->kill_ptrs |= BIT(i);
i++;
}
- rcu_read_unlock();
if (!data_opts->kill_ptrs &&
(!nr_good || nr_good >= replicas))
@@ -1242,7 +1293,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
struct extent_ptr_decoded p;
unsigned i = 0;
- rcu_read_lock();
+ guard(rcu)();
bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
unsigned d = bch2_extent_ptr_durability(c, &p);
@@ -1253,7 +1304,6 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
i++;
}
- rcu_read_unlock();
return data_opts->kill_ptrs != 0;
}
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index e7a2a13554d7..5e6de91a8763 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -28,7 +28,7 @@
#include <linux/wait.h>
struct buckets_in_flight {
- struct rhashtable table;
+ struct rhashtable *table;
struct move_bucket *first;
struct move_bucket *last;
size_t nr;
@@ -71,7 +71,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
if (ret)
return ret;
- struct bch_dev *ca = bch2_dev_tryget(c, k.k->p.inode);
+ struct bch_dev *ca = bch2_dev_bucket_tryget(c, k.k->p);
if (!ca)
goto out;
@@ -98,7 +98,7 @@ out:
static void move_bucket_free(struct buckets_in_flight *list,
struct move_bucket *b)
{
- int ret = rhashtable_remove_fast(&list->table, &b->hash,
+ int ret = rhashtable_remove_fast(list->table, &b->hash,
bch_move_bucket_params);
BUG_ON(ret);
kfree(b);
@@ -133,7 +133,7 @@ static void move_buckets_wait(struct moving_context *ctxt,
static bool bucket_in_flight(struct buckets_in_flight *list,
struct move_bucket_key k)
{
- return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params);
+ return rhashtable_lookup_fast(list->table, &k, bch_move_bucket_params);
}
static int bch2_copygc_get_buckets(struct moving_context *ctxt,
@@ -185,7 +185,7 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
goto err;
}
- ret2 = rhashtable_lookup_insert_fast(&buckets_in_flight->table, &b_i->hash,
+ ret2 = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash,
bch_move_bucket_params);
BUG_ON(ret2);
@@ -293,11 +293,9 @@ u64 bch2_copygc_wait_amount(struct bch_fs *c)
{
u64 wait = U64_MAX;
- rcu_read_lock();
+ guard(rcu)();
for_each_rw_member_rcu(c, ca)
wait = min(wait, bch2_copygc_dev_wait_amount(ca));
- rcu_read_unlock();
-
return wait;
}
@@ -321,21 +319,21 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
bch2_printbuf_make_room(out, 4096);
- rcu_read_lock();
+ struct task_struct *t;
out->atomic++;
+ scoped_guard(rcu) {
+ prt_printf(out, "Currently calculated wait:\n");
+ for_each_rw_member_rcu(c, ca) {
+ prt_printf(out, " %s:\t", ca->name);
+ prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca));
+ prt_newline(out);
+ }
- prt_printf(out, "Currently calculated wait:\n");
- for_each_rw_member_rcu(c, ca) {
- prt_printf(out, " %s:\t", ca->name);
- prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca));
- prt_newline(out);
+ t = rcu_dereference(c->copygc_thread);
+ if (t)
+ get_task_struct(t);
}
-
- struct task_struct *t = rcu_dereference(c->copygc_thread);
- if (t)
- get_task_struct(t);
--out->atomic;
- rcu_read_unlock();
if (t) {
bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
@@ -352,10 +350,13 @@ static int bch2_copygc_thread(void *arg)
struct buckets_in_flight buckets = {};
u64 last, wait;
- int ret = rhashtable_init(&buckets.table, &bch_move_bucket_params);
+ buckets.table = kzalloc(sizeof(*buckets.table), GFP_KERNEL);
+ int ret = !buckets.table
+ ? -ENOMEM
+ : rhashtable_init(buckets.table, &bch_move_bucket_params);
bch_err_msg(c, ret, "allocating copygc buckets in flight");
if (ret)
- return ret;
+ goto err;
set_freezable();
@@ -423,11 +424,12 @@ static int bch2_copygc_thread(void *arg)
}
move_buckets_wait(&ctxt, &buckets, true);
- rhashtable_destroy(&buckets.table);
+ rhashtable_destroy(buckets.table);
bch2_moving_ctxt_exit(&ctxt);
bch2_move_stats_exit(&move_stats, c);
-
- return 0;
+err:
+ kfree(buckets.table);
+ return ret;
}
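
With the rhashtable now heap-allocated rather than embedded in the on-stack buckets_in_flight struct, init and teardown pair up as sketched below; kfree(NULL) being a no-op keeps the early-error path simple:

	struct rhashtable *table = kzalloc(sizeof(*table), GFP_KERNEL);
	int ret = !table ? -ENOMEM
			 : rhashtable_init(table, &bch_move_bucket_params);
	if (ret)
		goto err;
	/* ... use table ... */
	rhashtable_destroy(table);	/* only after successful init */
err:
	kfree(table);			/* safe even if allocation failed */
	return ret;
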
void bch2_copygc_stop(struct bch_fs *c)
diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h
index b9683d22bab0..f615910d6f98 100644
--- a/fs/bcachefs/movinggc.h
+++ b/fs/bcachefs/movinggc.h
@@ -7,11 +7,10 @@ void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *);
static inline void bch2_copygc_wakeup(struct bch_fs *c)
{
- rcu_read_lock();
+ guard(rcu)();
struct task_struct *p = rcu_dereference(c->copygc_thread);
if (p)
wake_up_process(p);
- rcu_read_unlock();
}
void bch2_copygc_stop(struct bch_fs *);
diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c
index a84b69d6caef..c3f87c59922d 100644
--- a/fs/bcachefs/namei.c
+++ b/fs/bcachefs/namei.c
@@ -175,6 +175,16 @@ int bch2_create_trans(struct btree_trans *trans,
new_inode->bi_dir_offset = dir_offset;
}
+ if (S_ISDIR(mode)) {
+ ret = bch2_maybe_propagate_has_case_insensitive(trans,
+ (subvol_inum) {
+ new_inode->bi_subvol ?: dir.subvol,
+ new_inode->bi_inum },
+ new_inode);
+ if (ret)
+ goto err;
+ }
+
if (S_ISDIR(mode) &&
!new_inode->bi_subvol)
new_inode->bi_depth = dir_u->bi_depth + 1;
@@ -287,7 +297,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
}
if (deleting_subvol && !inode_u->bi_subvol) {
- ret = -BCH_ERR_ENOENT_not_subvol;
+ ret = bch_err_throw(c, ENOENT_not_subvol);
goto err;
}
@@ -425,8 +435,8 @@ int bch2_rename_trans(struct btree_trans *trans,
}
ret = bch2_dirent_rename(trans,
- src_dir, &src_hash, &src_dir_u->bi_size,
- dst_dir, &dst_hash, &dst_dir_u->bi_size,
+ src_dir, &src_hash,
+ dst_dir, &dst_hash,
src_name, &src_inum, &src_offset,
dst_name, &dst_inum, &dst_offset,
mode);
@@ -615,14 +625,26 @@ static int __bch2_inum_to_path(struct btree_trans *trans,
{
unsigned orig_pos = path->pos;
int ret = 0;
+ DARRAY(subvol_inum) inums = {};
+
+ if (!snapshot) {
+ ret = bch2_subvolume_get_snapshot(trans, subvol, &snapshot);
+ if (ret)
+ goto disconnected;
+ }
while (true) {
- if (!snapshot) {
- ret = bch2_subvolume_get_snapshot(trans, subvol, &snapshot);
- if (ret)
- goto disconnected;
+ subvol_inum n = (subvol_inum) { subvol ?: snapshot, inum };
+
+ if (darray_find_p(inums, i, i->subvol == n.subvol && i->inum == n.inum)) {
+ prt_str_reversed(path, "(loop)");
+ break;
}
+ ret = darray_push(&inums, n);
+ if (ret)
+ goto err;
+
struct bch_inode_unpacked inode;
ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, &inode, 0);
if (ret)
@@ -633,14 +655,16 @@ static int __bch2_inum_to_path(struct btree_trans *trans,
break;
if (!inode.bi_dir && !inode.bi_dir_offset) {
- ret = -BCH_ERR_ENOENT_inode_no_backpointer;
+ ret = bch_err_throw(trans->c, ENOENT_inode_no_backpointer);
goto disconnected;
}
inum = inode.bi_dir;
if (inode.bi_parent_subvol) {
subvol = inode.bi_parent_subvol;
- snapshot = 0;
+ ret = bch2_subvolume_get_snapshot(trans, inode.bi_parent_subvol, &snapshot);
+ if (ret)
+ goto disconnected;
}
struct btree_iter d_iter;
@@ -652,6 +676,7 @@ static int __bch2_inum_to_path(struct btree_trans *trans,
goto disconnected;
struct qstr dirent_name = bch2_dirent_get_name(d);
+
prt_bytes_reversed(path, dirent_name.name, dirent_name.len);
prt_char(path, '/');
@@ -667,8 +692,10 @@ out:
goto err;
reverse_bytes(path->buf + orig_pos, path->pos - orig_pos);
+ darray_exit(&inums);
return 0;
err:
+ darray_exit(&inums);
return ret;
disconnected:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
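
The reworked walk guards against directory loops in corrupted metadata: every (subvol, inum) visited on the way up is pushed onto a darray, and revisiting one terminates the walk with a "(loop)" marker instead of spinning forever. In outline:

	DARRAY(subvol_inum) inums = {};

	while (true) {
		subvol_inum n = { subvol ?: snapshot, inum };

		if (darray_find_p(inums, i,
				  i->subvol == n.subvol && i->inum == n.inum))
			break;			/* already visited: loop */

		ret = darray_push(&inums, n);	/* remember this node */
		if (ret)
			break;
		/* ... step to the parent directory ... */
	}
	darray_exit(&inums);
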
@@ -707,8 +734,7 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans,
if (inode_points_to_dirent(target, d))
return 0;
- if (!target->bi_dir &&
- !target->bi_dir_offset) {
+ if (!bch2_inode_has_backpointer(target)) {
fsck_err_on(S_ISDIR(target->bi_mode),
trans, inode_dir_missing_backpointer,
"directory with missing backpointer\n%s",
@@ -733,15 +759,6 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans,
return __bch2_fsck_write_inode(trans, target);
}
- if (bch2_inode_should_have_single_bp(target) &&
- !fsck_err(trans, inode_wrong_backpointer,
- "dirent points to inode that does not point back:\n%s",
- (bch2_bkey_val_to_text(&buf, c, d.s_c),
- prt_newline(&buf),
- bch2_inode_unpacked_to_text(&buf, target),
- buf.buf)))
- goto err;
-
struct bkey_s_c_dirent bp_dirent =
bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents,
SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot),
@@ -768,6 +785,7 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans,
ret = __bch2_fsck_write_inode(trans, target);
}
} else {
+ printbuf_reset(&buf);
bch2_bkey_val_to_text(&buf, c, d.s_c);
prt_newline(&buf);
bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
@@ -857,7 +875,8 @@ int __bch2_check_dirent_target(struct btree_trans *trans,
n->v.d_inum = cpu_to_le64(target->bi_inum);
}
- ret = bch2_trans_update(trans, dirent_iter, &n->k_i, 0);
+ ret = bch2_trans_update(trans, dirent_iter, &n->k_i,
+ BTREE_UPDATE_internal_snapshot_node);
if (ret)
goto err;
}
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 2a02606254b3..63f8e254495c 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -234,6 +234,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH_SB_CASEFOLD, false, \
NULL, "Dirent lookups are casefolded") \
+ x(casefold_disabled, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Disable casefolding filesystem wide") \
x(inodes_32bit, u8, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
@@ -379,6 +384,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Exit recovery immediately prior to journal replay")\
+ x(journal_rewind, u64, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_UINT(0, U64_MAX), \
+ BCH2_NO_SB_OPT, 0, \
+ NULL, "Rewind journal") \
x(recovery_passes, u64, \
OPT_FS|OPT_MOUNT, \
OPT_BITFIELD(bch2_recovery_passes), \
diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h
index 1ca476adbf6f..8f4e28d440ac 100644
--- a/fs/bcachefs/printbuf.h
+++ b/fs/bcachefs/printbuf.h
@@ -140,6 +140,14 @@ void bch2_prt_bitflags_vector(struct printbuf *, const char * const[],
.size = _size, \
})
+static inline struct printbuf bch2_printbuf_init(void)
+{
+ return PRINTBUF;
+}
+
+DEFINE_CLASS(printbuf, struct printbuf,
+ bch2_printbuf_exit(&_T), bch2_printbuf_init(), void)
+
/*
* Returns size remaining of output buffer:
*/
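
DEFINE_CLASS() wires printbuf into the kernel's cleanup framework, so a buffer declared with CLASS(printbuf, name)() is torn down by bch2_printbuf_exit() when it goes out of scope (the recovery code below uses exactly this for its error message). A usage sketch:

	static void log_key(struct bch_fs *c, struct bkey_s_c k)
	{
		CLASS(printbuf, buf)();		/* freed via bch2_printbuf_exit() */

		bch2_bkey_val_to_text(&buf, c, k);
		bch_info(c, "%s", buf.buf);	/* no explicit printbuf_exit() */
	}
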
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index 3d4755d73af7..f241efb1fb50 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -527,7 +527,7 @@ int bch2_fs_quota_read(struct bch_fs *c)
struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
if (!sb_quota) {
mutex_unlock(&c->sb_lock);
- return -BCH_ERR_ENOSPC_sb_quota;
+ return bch_err_throw(c, ENOSPC_sb_quota);
}
bch2_sb_quota_read(c);
@@ -572,7 +572,7 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
mutex_lock(&c->sb_lock);
sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
if (!sb_quota) {
- ret = -BCH_ERR_ENOSPC_sb_quota;
+ ret = bch_err_throw(c, ENOSPC_sb_quota);
goto unlock;
}
@@ -726,7 +726,7 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
mutex_lock(&c->sb_lock);
sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
if (!sb_quota) {
- ret = -BCH_ERR_ENOSPC_sb_quota;
+ ret = bch_err_throw(c, ENOSPC_sb_quota);
goto unlock;
}
diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c
index bef2aa1b8bcd..b1438be9d690 100644
--- a/fs/bcachefs/rcu_pending.c
+++ b/fs/bcachefs/rcu_pending.c
@@ -182,11 +182,6 @@ static inline void kfree_bulk(size_t nr, void ** p)
while (nr--)
kfree(*p);
}
-
-#define local_irq_save(flags) \
-do { \
- flags = 0; \
-} while (0)
#endif
static noinline void __process_finished_items(struct rcu_pending *pending,
@@ -429,9 +424,15 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head,
BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN));
- local_irq_save(flags);
- p = this_cpu_ptr(pending->p);
- spin_lock(&p->lock);
+ /*
+ * We could technically be scheduled before taking the lock and end up
+ * using a different CPU's rcu_pending_pcpu: that's OK, it needs the
+ * lock anyway.
+ *
+ * And we have to do it this way to avoid breaking PREEMPT_RT, which
+ * redefines how spinlocks work:
+ */
+ p = raw_cpu_ptr(pending->p);
+ spin_lock_irqsave(&p->lock, flags);
rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu);
restart:
if (may_sleep &&
@@ -520,9 +521,8 @@ check_expired:
goto free_node;
}
- local_irq_save(flags);
- p = this_cpu_ptr(pending->p);
- spin_lock(&p->lock);
+ p = raw_cpu_ptr(pending->p);
+ spin_lock_irqsave(&p->lock, flags);
goto restart;
}
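
Both call sites above replace local_irq_save() + this_cpu_ptr() + spin_lock() with raw_cpu_ptr() + spin_lock_irqsave(). On PREEMPT_RT spinlocks become sleeping locks, so interrupts must not be hard-disabled before taking one; and since the lock itself serializes access, it is harmless if the task migrates to another CPU between picking the per-cpu structure and locking it. The pattern:

	struct rcu_pending_pcpu *p;
	unsigned long flags;

	p = raw_cpu_ptr(pending->p);		/* may race with migration: ok */
	spin_lock_irqsave(&p->lock, flags);	/* RT-safe: lock manages irq state */
	/* ... */
	spin_unlock_irqrestore(&p->lock, flags);
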
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index de1ec9e0caa0..1c345b86b1c0 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -80,13 +80,12 @@ static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c,
unsigned ptr_bit = 1;
unsigned rewrite_ptrs = 0;
- rcu_read_lock();
+ guard(rcu)();
bkey_for_each_ptr(ptrs, ptr) {
if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target))
rewrite_ptrs |= ptr_bit;
ptr_bit <<= 1;
}
- rcu_read_unlock();
return rewrite_ptrs;
}
@@ -135,12 +134,11 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
}
incompressible:
if (opts->background_target) {
- rcu_read_lock();
+ guard(rcu)();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (!p.ptr.cached &&
!bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
sectors += p.crc.compressed_size;
- rcu_read_unlock();
}
return sectors;
@@ -445,7 +443,7 @@ static int do_rebalance_extent(struct moving_context *ctxt,
if (bch2_err_matches(ret, ENOMEM)) {
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(ctxt);
- ret = -BCH_ERR_transaction_restart_nested;
+ ret = bch_err_throw(c, transaction_restart_nested);
}
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -527,7 +525,7 @@ static void rebalance_wait(struct bch_fs *c)
r->state = BCH_REBALANCE_waiting;
}
- bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
+ bch2_kthread_io_clock_wait_once(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
}
static bool bch2_rebalance_enabled(struct bch_fs *c)
@@ -544,6 +542,7 @@ static int do_rebalance(struct moving_context *ctxt)
struct bch_fs_rebalance *r = &c->rebalance;
struct btree_iter rebalance_work_iter, extent_iter = {};
struct bkey_s_c k;
+ u32 kick = r->kick;
int ret = 0;
bch2_trans_begin(trans);
@@ -593,7 +592,8 @@ static int do_rebalance(struct moving_context *ctxt)
if (!ret &&
!kthread_should_stop() &&
!atomic64_read(&r->work_stats.sectors_seen) &&
- !atomic64_read(&r->scan_stats.sectors_seen)) {
+ !atomic64_read(&r->scan_stats.sectors_seen) &&
+ kick == r->kick) {
bch2_moving_ctxt_flush_all(ctxt);
bch2_trans_unlock_long(trans);
rebalance_wait(c);
@@ -677,11 +677,12 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
}
prt_newline(out);
- rcu_read_lock();
- struct task_struct *t = rcu_dereference(c->rebalance.thread);
- if (t)
- get_task_struct(t);
- rcu_read_unlock();
+ struct task_struct *t;
+ scoped_guard(rcu) {
+ t = rcu_dereference(c->rebalance.thread);
+ if (t)
+ get_task_struct(t);
+ }
if (t) {
bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
@@ -794,7 +795,7 @@ static int check_rebalance_work_one(struct btree_trans *trans,
BTREE_ID_extents, POS_MIN,
BTREE_ITER_prefetch|
BTREE_ITER_all_snapshots);
- return -BCH_ERR_transaction_restart_nested;
+ return bch_err_throw(c, transaction_restart_nested);
}
if (!extent_k.k && !rebalance_k.k)
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
index 5d9214fe1a22..7a565ea7dbfc 100644
--- a/fs/bcachefs/rebalance.h
+++ b/fs/bcachefs/rebalance.h
@@ -39,13 +39,11 @@ int bch2_set_fs_needs_rebalance(struct bch_fs *);
static inline void bch2_rebalance_wakeup(struct bch_fs *c)
{
- struct task_struct *p;
-
- rcu_read_lock();
- p = rcu_dereference(c->rebalance.thread);
+ c->rebalance.kick++;
+ guard(rcu)();
+ struct task_struct *p = rcu_dereference(c->rebalance.thread);
if (p)
wake_up_process(p);
- rcu_read_unlock();
}
void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *);
diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h
index 33d77286f1d5..c659da149fa3 100644
--- a/fs/bcachefs/rebalance_types.h
+++ b/fs/bcachefs/rebalance_types.h
@@ -18,6 +18,7 @@ enum bch_rebalance_states {
struct bch_fs_rebalance {
struct task_struct __rcu *thread;
+ u32 kick;
struct bch_pd_controller pd;
enum bch_rebalance_states state;
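
The new kick counter closes a lost-wakeup race: bch2_rebalance_wakeup() bumps it before waking the thread, and do_rebalance() samples it at the top of each pass, refusing to sleep if the value changed while it was scanning. In outline (no_work_found stands in for the real idle checks):

	/* waker: */
	c->rebalance.kick++;		/* before wake_up_process() */

	/* worker loop: */
	u32 kick = r->kick;		/* sample before scanning */
	/* ... scan for work ... */
	if (no_work_found && kick == r->kick)
		rebalance_wait(c);	/* safe: no kick raced with the scan */
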
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 4fca57575565..c94debb12d2f 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -99,9 +99,11 @@ int bch2_btree_lost_data(struct bch_fs *c,
goto out;
case BTREE_ID_snapshots:
ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret;
goto out;
default:
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret;
goto out;
}
@@ -272,6 +274,28 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
struct btree_path *path = btree_iter_path(trans, &iter);
if (unlikely(!btree_path_node(path, k->level))) {
+ struct bch_fs *c = trans->c;
+
+ CLASS(printbuf, buf)();
+ prt_str(&buf, "btree=");
+ bch2_btree_id_to_text(&buf, k->btree_id);
+ prt_printf(&buf, " level=%u ", k->level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k->k));
+
+ if (!(c->recovery.passes_complete & (BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes)|
+ BIT_ULL(BCH_RECOVERY_PASS_check_topology)))) {
+ bch_err(c, "have key in journal replay for btree depth that does not exist, confused\n%s",
+ buf.buf);
+ ret = -EINVAL;
+ }
+
+ if (!k->allocated) {
+ bch_notice(c, "dropping key in journal replay for depth that does not exist because we're recovering from scan\n%s",
+ buf.buf);
+ k->overwritten = true;
+ goto out;
+ }
+
bch2_trans_iter_exit(trans, &iter);
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
BTREE_MAX_DEPTH, 0, iter_flags);
@@ -594,6 +618,7 @@ static int read_btree_roots(struct bch_fs *c)
buf.buf, bch2_err_str(ret))) {
if (btree_id_is_alloc(i))
r->error = 0;
+ ret = 0;
}
}
@@ -679,7 +704,7 @@ static bool check_version_upgrade(struct bch_fs *c)
ret = true;
}
- if (new_version > c->sb.version_incompat &&
+ if (new_version > c->sb.version_incompat_allowed &&
c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) {
struct printbuf buf = PRINTBUF;
@@ -739,7 +764,24 @@ int bch2_fs_recovery(struct bch_fs *c)
? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read)
: BCH_RECOVERY_PASS_snapshots_read;
c->opts.nochanges = true;
+ }
+
+ if (c->opts.nochanges)
c->opts.read_only = true;
+
+ if (c->opts.journal_rewind) {
+ bch_info(c, "rewinding journal, fsck required");
+ c->opts.fsck = true;
+ }
+
+ if (go_rw_in_recovery(c)) {
+ /*
+ * start workqueues/kworkers early - kthread creation checks for
+ * pending signals, which is _very_ annoying
+ */
+ ret = bch2_fs_init_rw(c);
+ if (ret)
+ goto err;
}
mutex_lock(&c->sb_lock);
@@ -879,7 +921,7 @@ int bch2_fs_recovery(struct bch_fs *c)
use_clean:
if (!clean) {
bch_err(c, "no superblock clean section found");
- ret = -BCH_ERR_fsck_repair_impossible;
+ ret = bch_err_throw(c, fsck_repair_impossible);
goto err;
}
@@ -950,7 +992,7 @@ use_clean:
ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu",
journal_seq, last_seq, blacklist_seq - 1) ?:
- bch2_fs_journal_start(&c->journal, journal_seq);
+ bch2_fs_journal_start(&c->journal, last_seq, journal_seq);
if (ret)
goto err;
@@ -1093,13 +1135,6 @@ use_clean:
out:
bch2_flush_fsck_errs(c);
- if (!c->opts.retain_recovery_info) {
- bch2_journal_keys_put_initial(c);
- bch2_find_btree_nodes_exit(&c->found_btree_nodes);
- }
- if (!IS_ERR(clean))
- kfree(clean);
-
if (!ret &&
test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) &&
!c->opts.nochanges) {
@@ -1108,6 +1143,9 @@ out:
}
bch_err_fn(c, ret);
+final_out:
+ if (!IS_ERR(clean))
+ kfree(clean);
return ret;
err:
fsck_err:
@@ -1115,13 +1153,13 @@ fsck_err:
struct printbuf buf = PRINTBUF;
bch2_log_msg_start(c, &buf);
- prt_printf(&buf, "error in recovery: %s", bch2_err_str(ret));
+ prt_printf(&buf, "error in recovery: %s\n", bch2_err_str(ret));
bch2_fs_emergency_read_only2(c, &buf);
bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
}
- return ret;
+ goto final_out;
}
int bch2_fs_initialize(struct bch_fs *c)
@@ -1170,7 +1208,7 @@ int bch2_fs_initialize(struct bch_fs *c)
* journal_res_get() will crash if called before this has
* set up the journal.pin FIFO and journal.cur pointer:
*/
- ret = bch2_fs_journal_start(&c->journal, 1);
+ ret = bch2_fs_journal_start(&c->journal, 1, 1);
if (ret)
goto err;
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
index dabb29b08ad0..6a039e011064 100644
--- a/fs/bcachefs/recovery_passes.c
+++ b/fs/bcachefs/recovery_passes.c
@@ -103,20 +103,20 @@ static void bch2_sb_recovery_passes_to_text(struct printbuf *out,
prt_tab(out);
bch2_pr_time_units(out, le32_to_cpu(i->last_runtime) * NSEC_PER_SEC);
+
+ if (BCH_RECOVERY_PASS_NO_RATELIMIT(i))
+ prt_str(out, " (no ratelimit)");
+
prt_newline(out);
}
}
-static void bch2_sb_recovery_pass_complete(struct bch_fs *c,
- enum bch_recovery_pass pass,
- s64 start_time)
+static struct recovery_pass_entry *bch2_sb_recovery_pass_entry(struct bch_fs *c,
+ enum bch_recovery_pass pass)
{
enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass);
- s64 end_time = ktime_get_real_seconds();
- mutex_lock(&c->sb_lock);
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- __clear_bit_le64(stable, ext->recovery_passes_required);
+ lockdep_assert_held(&c->sb_lock);
struct bch_sb_field_recovery_passes *r =
bch2_sb_field_get(c->disk_sb.sb, recovery_passes);
@@ -127,15 +127,43 @@ static void bch2_sb_recovery_pass_complete(struct bch_fs *c,
r = bch2_sb_field_resize(&c->disk_sb, recovery_passes, u64s);
if (!r) {
bch_err(c, "error creating recovery_passes sb section");
- goto out;
+ return NULL;
}
}
- r->start[stable].last_run = cpu_to_le64(end_time);
- r->start[stable].last_runtime = cpu_to_le32(max(0, end_time - start_time));
-out:
+ return r->start + stable;
+}
+
+static void bch2_sb_recovery_pass_complete(struct bch_fs *c,
+ enum bch_recovery_pass pass,
+ s64 start_time)
+{
+ guard(mutex)(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ __clear_bit_le64(bch2_recovery_pass_to_stable(pass),
+ ext->recovery_passes_required);
+
+ struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass);
+ if (e) {
+ s64 end_time = ktime_get_real_seconds();
+ e->last_run = cpu_to_le64(end_time);
+ e->last_runtime = cpu_to_le32(max(0, end_time - start_time));
+ SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false);
+ }
+
bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
+}
+
+void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ guard(mutex)(&c->sb_lock);
+
+ struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass);
+ if (e && !BCH_RECOVERY_PASS_NO_RATELIMIT(e)) {
+ SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, true);
+ bch2_write_super(c);
+ }
}
static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass)
@@ -157,6 +185,9 @@ static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recover
*/
ret = (u64) le32_to_cpu(i->last_runtime) * 100 >
ktime_get_real_seconds() - le64_to_cpu(i->last_run);
+
+ if (BCH_RECOVERY_PASS_NO_RATELIMIT(i))
+ ret = false;
}
return ret;
@@ -186,11 +217,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c)
set_bit(BCH_FS_may_go_rw, &c->flags);
- if (keys->nr ||
- !c->opts.read_only ||
- !c->sb.clean ||
- c->opts.recovery_passes ||
- (c->opts.fsck && !(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))) {
+ if (go_rw_in_recovery(c)) {
if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) {
bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate");
bch2_reconstruct_alloc(c);
@@ -263,8 +290,13 @@ static bool recovery_pass_needs_set(struct bch_fs *c,
enum bch_run_recovery_pass_flags *flags)
{
struct bch_fs_recovery *r = &c->recovery;
- bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags);
- bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent);
+
+ /*
+ * Never run scan_for_btree_nodes persistently: check_topology will run
+ * it if required
+ */
+ if (pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
+ *flags |= RUN_RECOVERY_PASS_nopersistent;
if ((*flags & RUN_RECOVERY_PASS_ratelimit) &&
!bch2_recovery_pass_want_ratelimit(c, pass))
@@ -279,6 +311,11 @@ static bool recovery_pass_needs_set(struct bch_fs *c,
* Otherwise, we run run_explicit_recovery_pass when we find damage, so
* it should run again even if it's already run:
*/
+ bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags);
+ bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent);
+ bool rewind = in_recovery &&
+ r->curr_pass > pass &&
+ !(r->passes_complete & BIT_ULL(pass));
if (persistent
? !(c->sb.recovery_passes_required & BIT_ULL(pass))
@@ -289,6 +326,9 @@ static bool recovery_pass_needs_set(struct bch_fs *c,
(r->passes_ratelimiting & BIT_ULL(pass)))
return true;
+ if (rewind)
+ return true;
+
return false;
}
@@ -315,10 +355,12 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
goto out;
bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags);
- bool rewind = in_recovery && r->curr_pass > pass;
+ bool rewind = in_recovery &&
+ r->curr_pass > pass &&
+ !(r->passes_complete & BIT_ULL(pass));
bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit;
- if (!(in_recovery && (flags & RUN_RECOVERY_PASS_nopersistent))) {
+ if (!(flags & RUN_RECOVERY_PASS_nopersistent)) {
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
__set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required);
}
@@ -327,7 +369,7 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
(!in_recovery || r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw)) {
prt_printf(out, "need recovery pass %s (%u), but already rw\n",
bch2_recovery_passes[pass], pass);
- ret = -BCH_ERR_cannot_rewind_recovery;
+ ret = bch_err_throw(c, cannot_rewind_recovery);
goto out;
}
@@ -347,7 +389,7 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
if (rewind) {
r->next_pass = pass;
r->passes_complete &= (1ULL << pass) >> 1;
- ret = -BCH_ERR_restart_recovery;
+ ret = bch_err_throw(c, restart_recovery);
}
} else {
prt_printf(out, "scheduling recovery pass %s (%u)%s\n",
@@ -371,10 +413,8 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c,
{
int ret = 0;
- scoped_guard(mutex, &c->sb_lock) {
- if (!recovery_pass_needs_set(c, pass, &flags))
- return 0;
-
+ if (recovery_pass_needs_set(c, pass, &flags)) {
+ guard(mutex)(&c->sb_lock);
ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags);
bch2_write_super(c);
}
@@ -382,9 +422,38 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c,
return ret;
}
+/*
+ * Returns 0 if @pass has run recently, otherwise one of
+ * -BCH_ERR_restart_recovery
+ * -BCH_ERR_recovery_pass_will_run
+ */
+int bch2_require_recovery_pass(struct bch_fs *c,
+ struct printbuf *out,
+ enum bch_recovery_pass pass)
+{
+ if (test_bit(BCH_FS_in_recovery, &c->flags) &&
+ c->recovery.passes_complete & BIT_ULL(pass))
+ return 0;
+
+ guard(mutex)(&c->sb_lock);
+
+ if (bch2_recovery_pass_want_ratelimit(c, pass))
+ return 0;
+
+ enum bch_run_recovery_pass_flags flags = 0;
+ int ret = 0;
+
+ if (recovery_pass_needs_set(c, pass, &flags)) {
+ ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags);
+ bch2_write_super(c);
+ }
+
+ return ret ?: bch_err_throw(c, recovery_pass_will_run);
+}
+
int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
{
- enum bch_run_recovery_pass_flags flags = RUN_RECOVERY_PASS_nopersistent;
+ enum bch_run_recovery_pass_flags flags = 0;
if (!recovery_pass_needs_set(c, pass, &flags))
return 0;
diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h
index dc0d2014ff9b..2117f0ce1922 100644
--- a/fs/bcachefs/recovery_passes.h
+++ b/fs/bcachefs/recovery_passes.h
@@ -10,11 +10,22 @@ u64 bch2_recovery_passes_from_stable(u64 v);
u64 bch2_fsck_recovery_passes(void);
+void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *, enum bch_recovery_pass);
+
enum bch_run_recovery_pass_flags {
RUN_RECOVERY_PASS_nopersistent = BIT(0),
RUN_RECOVERY_PASS_ratelimit = BIT(1),
};
+static inline bool go_rw_in_recovery(struct bch_fs *c)
+{
+ return (c->journal_keys.nr ||
+ !c->opts.read_only ||
+ !c->sb.clean ||
+ c->opts.recovery_passes ||
+ (c->opts.fsck && !(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))));
+}
+
int bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass);
int __bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *,
@@ -24,6 +35,9 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *,
enum bch_recovery_pass,
enum bch_run_recovery_pass_flags);
+int bch2_require_recovery_pass(struct bch_fs *, struct printbuf *,
+ enum bch_recovery_pass);
+
int bch2_run_online_recovery_passes(struct bch_fs *, u64);
int bch2_run_recovery_passes(struct bch_fs *, enum bch_recovery_pass);
diff --git a/fs/bcachefs/recovery_passes_format.h b/fs/bcachefs/recovery_passes_format.h
index c434eafbca19..b63c20558d3d 100644
--- a/fs/bcachefs/recovery_passes_format.h
+++ b/fs/bcachefs/recovery_passes_format.h
@@ -87,6 +87,8 @@ struct recovery_pass_entry {
__le32 flags;
};
+LE32_BITMASK(BCH_RECOVERY_PASS_NO_RATELIMIT, struct recovery_pass_entry, flags, 0, 1)
+
struct bch_sb_field_recovery_passes {
struct bch_sb_field field;
struct recovery_pass_entry start[];
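
LE32_BITMASK() generates an endian-safe getter and SET_ helper over the named little-endian field, so the flag added above is read and written as in this sketch (e being a struct recovery_pass_entry *):

	if (!BCH_RECOVERY_PASS_NO_RATELIMIT(e))			/* getter */
		SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, true);	/* setter */
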
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index 3a13dbcab6ba..92b90cfe622b 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -64,6 +64,9 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
REFLINK_P_IDX(p.v),
le32_to_cpu(p.v->front_pad),
le32_to_cpu(p.v->back_pad));
+
+ if (REFLINK_P_ERROR(p.v))
+ prt_str(out, " error");
}
bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
@@ -269,13 +272,12 @@ struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans,
return k;
if (unlikely(!bkey_extent_is_reflink_data(k.k))) {
- unsigned size = min((u64) k.k->size,
- REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) -
- reflink_offset);
- bch2_key_resize(&iter->k, size);
+ u64 missing_end = min(k.k->p.offset,
+ REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad));
+ BUG_ON(reflink_offset == missing_end);
int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset,
- k.k->p.offset, should_commit);
+ missing_end, should_commit);
if (ret) {
bch2_trans_iter_exit(trans, iter);
return bkey_s_c_err(ret);
@@ -312,7 +314,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
if (!bkey_refcount_c(k)) {
if (!(flags & BTREE_TRIGGER_overwrite))
- ret = -BCH_ERR_missing_indirect_extent;
+ ret = bch_err_throw(c, missing_indirect_extent);
goto next;
}
@@ -612,7 +614,7 @@ s64 bch2_remap_range(struct bch_fs *c,
int ret = 0, ret2 = 0;
if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_reflink))
- return -BCH_ERR_erofs_no_writes;
+ return bch_err_throw(c, erofs_no_writes);
bch2_check_set_feature(c, BCH_FEATURE_reflink);
@@ -711,7 +713,8 @@ s64 bch2_remap_range(struct bch_fs *c,
SET_REFLINK_P_IDX(&dst_p->v, offset);
if (reflink_p_may_update_opts_field &&
- may_change_src_io_path_opts)
+ may_change_src_io_path_opts &&
+ REFLINK_P_MAY_UPDATE_OPTIONS(src_p.v))
SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true);
} else {
BUG();
@@ -847,7 +850,7 @@ int bch2_gc_reflink_start(struct bch_fs *c)
struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table,
c->reflink_gc_nr++, GFP_KERNEL);
if (!r) {
- ret = -BCH_ERR_ENOMEM_gc_reflink_start;
+ ret = bch_err_throw(c, ENOMEM_gc_reflink_start);
break;
}
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 477ef0997949..8383bd7fdb3f 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -119,7 +119,7 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
return 0;
bad:
bch2_replicas_entry_to_text(err, r);
- return -BCH_ERR_invalid_replicas_entry;
+ return bch_err_throw(c, invalid_replicas_entry);
}
void bch2_cpu_replicas_to_text(struct printbuf *out,
@@ -311,7 +311,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
!__replicas_has_entry(&c->replicas_gc, new_entry)) {
new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry);
if (!new_gc.entries) {
- ret = -BCH_ERR_ENOMEM_cpu_replicas;
+ ret = bch_err_throw(c, ENOMEM_cpu_replicas);
goto err;
}
}
@@ -319,7 +319,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
if (!__replicas_has_entry(&c->replicas, new_entry)) {
new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
if (!new_r.entries) {
- ret = -BCH_ERR_ENOMEM_cpu_replicas;
+ ret = bch_err_throw(c, ENOMEM_cpu_replicas);
goto err;
}
@@ -422,7 +422,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
if (!c->replicas_gc.entries) {
mutex_unlock(&c->sb_lock);
bch_err(c, "error allocating c->replicas_gc");
- return -BCH_ERR_ENOMEM_replicas_gc;
+ return bch_err_throw(c, ENOMEM_replicas_gc);
}
for_each_cpu_replicas_entry(&c->replicas, e)
@@ -458,7 +458,7 @@ retry:
new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL);
if (!new.entries) {
bch_err(c, "error allocating c->replicas_gc");
- return -BCH_ERR_ENOMEM_replicas_gc;
+ return bch_err_throw(c, ENOMEM_replicas_gc);
}
mutex_lock(&c->sb_lock);
@@ -622,7 +622,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
DIV_ROUND_UP(bytes, sizeof(u64)));
if (!sb_r)
- return -BCH_ERR_ENOSPC_sb_replicas;
+ return bch_err_throw(c, ENOSPC_sb_replicas);
bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0);
@@ -667,7 +667,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
sb_r = bch2_sb_field_resize(&c->disk_sb, replicas,
DIV_ROUND_UP(bytes, sizeof(u64)));
if (!sb_r)
- return -BCH_ERR_ENOSPC_sb_replicas;
+ return bch_err_throw(c, ENOSPC_sb_replicas);
bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas);
@@ -819,19 +819,18 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
if (e->data_type == BCH_DATA_cached)
continue;
- rcu_read_lock();
- for (unsigned i = 0; i < e->nr_devs; i++) {
- if (e->devs[i] == BCH_SB_MEMBER_INVALID) {
- nr_failed++;
- continue;
- }
+ scoped_guard(rcu)
+ for (unsigned i = 0; i < e->nr_devs; i++) {
+ if (e->devs[i] == BCH_SB_MEMBER_INVALID) {
+ nr_failed++;
+ continue;
+ }
- nr_online += test_bit(e->devs[i], devs.d);
+ nr_online += test_bit(e->devs[i], devs.d);
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]);
- nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
- }
- rcu_read_unlock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]);
+ nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
+ }
if (nr_online + nr_failed == e->nr_devs)
continue;
diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h
index 7c0c9c842b4e..b868702a431a 100644
--- a/fs/bcachefs/sb-counters_format.h
+++ b/fs/bcachefs/sb-counters_format.h
@@ -26,6 +26,7 @@ enum counters_flags {
x(io_move_write_fail, 82, TYPE_COUNTER) \
x(io_move_start_fail, 39, TYPE_COUNTER) \
x(io_move_created_rebalance, 83, TYPE_COUNTER) \
+ x(io_move_evacuate_bucket, 84, TYPE_COUNTER) \
x(bucket_invalidate, 3, TYPE_COUNTER) \
x(bucket_discard, 4, TYPE_COUNTER) \
x(bucket_discard_fast, 79, TYPE_COUNTER) \
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index 861fce1630f0..1506d05e0665 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -253,6 +253,7 @@ DOWNGRADE_TABLE()
static int downgrade_table_extra(struct bch_fs *c, darray_char *table)
{
+ unsigned dst_offset = table->nr;
struct bch_sb_field_downgrade_entry *dst = (void *) &darray_top(*table);
unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors);
int ret = 0;
@@ -268,6 +269,9 @@ static int downgrade_table_extra(struct bch_fs *c, darray_char *table)
if (ret)
return ret;
+ dst = (void *) &table->data[dst_offset];
+ dst->nr_errors = cpu_to_le16(nr_errors + 1);
+
/* open coded __set_bit_le64, as dst is packed and
* dst->recovery_passes is misaligned */
unsigned b = BCH_RECOVERY_PASS_STABLE_check_allocations;
@@ -278,7 +282,6 @@ static int downgrade_table_extra(struct bch_fs *c, darray_char *table)
break;
}
- dst->nr_errors = cpu_to_le16(nr_errors);
return ret;
}
@@ -417,7 +420,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c)
d = bch2_sb_field_resize(&c->disk_sb, downgrade, sb_u64s);
if (!d) {
- ret = -BCH_ERR_ENOSPC_sb_downgrade;
+ ret = bch_err_throw(c, ENOSPC_sb_downgrade);
goto out;
}
diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c
index 013a96883b4e..48853efdc105 100644
--- a/fs/bcachefs/sb-errors.c
+++ b/fs/bcachefs/sb-errors.c
@@ -78,6 +78,28 @@ const struct bch_sb_field_ops bch_sb_field_ops_errors = {
.to_text = bch2_sb_errors_to_text,
};
+void bch2_fs_errors_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ if (out->nr_tabstops < 1)
+ printbuf_tabstop_push(out, 48);
+ if (out->nr_tabstops < 2)
+ printbuf_tabstop_push(out, 8);
+ if (out->nr_tabstops < 3)
+ printbuf_tabstop_push(out, 16);
+
+ guard(mutex)(&c->fsck_error_counts_lock);
+
+ bch_sb_errors_cpu *e = &c->fsck_error_counts;
+ darray_for_each(*e, i) {
+ bch2_sb_error_id_to_text(out, i->id);
+ prt_tab(out);
+ prt_u64(out, i->nr);
+ prt_tab(out);
+ bch2_prt_datetime(out, i->last_error_time);
+ prt_newline(out);
+ }
+}
+
void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err)
{
bch_sb_errors_cpu *e = &c->fsck_error_counts;
diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h
index b2357b8e6107..e86267264692 100644
--- a/fs/bcachefs/sb-errors.h
+++ b/fs/bcachefs/sb-errors.h
@@ -7,6 +7,7 @@
extern const char * const bch2_sb_error_strs[];
void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id);
+void bch2_fs_errors_to_text(struct printbuf *, struct bch_fs *);
extern const struct bch_sb_field_ops bch_sb_field_ops_errors;
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index 0bfb151da9cf..d154b7651d28 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -3,9 +3,10 @@
#define _BCACHEFS_SB_ERRORS_FORMAT_H
enum bch_fsck_flags {
- FSCK_CAN_FIX = 1 << 0,
- FSCK_CAN_IGNORE = 1 << 1,
- FSCK_AUTOFIX = 1 << 2,
+ FSCK_CAN_FIX = BIT(0),
+ FSCK_CAN_IGNORE = BIT(1),
+ FSCK_AUTOFIX = BIT(2),
+ FSCK_ERR_NO_LOG = BIT(3),
};
#define BCH_SB_ERRS() \
@@ -134,7 +135,7 @@ enum bch_fsck_flags {
x(bucket_gens_to_invalid_buckets, 121, FSCK_AUTOFIX) \
x(bucket_gens_nonzero_for_invalid_buckets, 122, FSCK_AUTOFIX) \
x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \
- x(need_discard_freespace_key_bad, 124, 0) \
+ x(need_discard_freespace_key_bad, 124, FSCK_AUTOFIX) \
x(discarding_bucket_not_in_need_discard_btree, 291, 0) \
x(backpointer_bucket_offset_wrong, 125, 0) \
x(backpointer_level_bad, 294, 0) \
@@ -165,7 +166,7 @@ enum bch_fsck_flags {
x(ptr_to_missing_replicas_entry, 149, FSCK_AUTOFIX) \
x(ptr_to_missing_stripe, 150, 0) \
x(ptr_to_incorrect_stripe, 151, 0) \
- x(ptr_gen_newer_than_bucket_gen, 152, 0) \
+ x(ptr_gen_newer_than_bucket_gen, 152, FSCK_AUTOFIX) \
x(ptr_too_stale, 153, 0) \
x(stale_dirty_ptr, 154, FSCK_AUTOFIX) \
x(ptr_bucket_data_type_mismatch, 155, 0) \
@@ -217,7 +218,7 @@ enum bch_fsck_flags {
x(inode_str_hash_invalid, 194, 0) \
x(inode_v3_fields_start_bad, 195, 0) \
x(inode_snapshot_mismatch, 196, 0) \
- x(snapshot_key_missing_inode_snapshot, 314, 0) \
+ x(snapshot_key_missing_inode_snapshot, 314, FSCK_AUTOFIX) \
x(inode_unlinked_but_clean, 197, 0) \
x(inode_unlinked_but_nlink_nonzero, 198, 0) \
x(inode_unlinked_and_not_open, 281, 0) \
@@ -232,10 +233,11 @@ enum bch_fsck_flags {
x(inode_dir_multiple_links, 206, FSCK_AUTOFIX) \
x(inode_dir_missing_backpointer, 284, FSCK_AUTOFIX) \
x(inode_dir_unlinked_but_not_empty, 286, FSCK_AUTOFIX) \
+ x(inode_dir_has_nonzero_i_size, 319, FSCK_AUTOFIX) \
x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \
x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \
x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \
- x(inode_has_child_snapshots_wrong, 287, 0) \
+ x(inode_has_child_snapshots_wrong, 287, FSCK_AUTOFIX) \
x(inode_unreachable, 210, FSCK_AUTOFIX) \
x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \
x(inode_i_sectors_underflow, 312, FSCK_AUTOFIX) \
@@ -243,26 +245,27 @@ enum bch_fsck_flags {
x(inode_parent_has_case_insensitive_not_set, 317, FSCK_AUTOFIX) \
x(vfs_inode_i_blocks_underflow, 311, FSCK_AUTOFIX) \
x(vfs_inode_i_blocks_not_zero_at_truncate, 313, FSCK_AUTOFIX) \
+ x(vfs_bad_inode_rm, 320, 0) \
x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \
x(deleted_inode_missing, 212, FSCK_AUTOFIX) \
x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \
x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \
x(deleted_inode_has_child_snapshots, 288, FSCK_AUTOFIX) \
x(extent_overlapping, 215, 0) \
- x(key_in_missing_inode, 216, 0) \
+ x(key_in_missing_inode, 216, FSCK_AUTOFIX) \
x(key_in_wrong_inode_type, 217, 0) \
- x(extent_past_end_of_inode, 218, 0) \
+ x(extent_past_end_of_inode, 218, FSCK_AUTOFIX) \
x(dirent_empty_name, 219, 0) \
x(dirent_val_too_big, 220, 0) \
x(dirent_name_too_long, 221, 0) \
x(dirent_name_embedded_nul, 222, 0) \
x(dirent_name_dot_or_dotdot, 223, 0) \
x(dirent_name_has_slash, 224, 0) \
- x(dirent_d_type_wrong, 225, 0) \
+ x(dirent_d_type_wrong, 225, FSCK_AUTOFIX) \
x(inode_bi_parent_wrong, 226, 0) \
x(dirent_in_missing_dir_inode, 227, 0) \
x(dirent_in_non_dir_inode, 228, 0) \
- x(dirent_to_missing_inode, 229, 0) \
+ x(dirent_to_missing_inode, 229, FSCK_AUTOFIX) \
x(dirent_to_overwritten_inode, 302, 0) \
x(dirent_to_missing_subvol, 230, 0) \
x(dirent_to_itself, 231, 0) \
@@ -277,8 +280,8 @@ enum bch_fsck_flags {
x(root_dir_missing, 239, 0) \
x(root_inode_not_dir, 240, 0) \
x(dir_loop, 241, 0) \
- x(hash_table_key_duplicate, 242, 0) \
- x(hash_table_key_wrong_offset, 243, 0) \
+ x(hash_table_key_duplicate, 242, FSCK_AUTOFIX) \
+ x(hash_table_key_wrong_offset, 243, FSCK_AUTOFIX) \
x(unlinked_inode_not_on_deleted_list, 244, FSCK_AUTOFIX) \
x(reflink_p_front_pad_bad, 245, 0) \
x(journal_entry_dup_same_device, 246, 0) \
@@ -298,7 +301,7 @@ enum bch_fsck_flags {
x(btree_node_bkey_bad_u64s, 260, 0) \
x(btree_node_topology_empty_interior_node, 261, 0) \
x(btree_ptr_v2_min_key_bad, 262, 0) \
- x(btree_root_unreadable_and_scan_found_nothing, 263, FSCK_AUTOFIX) \
+ x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \
x(snapshot_node_missing, 264, FSCK_AUTOFIX) \
x(dup_backpointer_to_bad_csum_extent, 265, 0) \
x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \
@@ -311,7 +314,7 @@ enum bch_fsck_flags {
x(accounting_mismatch, 272, FSCK_AUTOFIX) \
x(accounting_replicas_not_marked, 273, 0) \
x(accounting_to_invalid_device, 289, 0) \
- x(invalid_btree_id, 274, 0) \
+ x(invalid_btree_id, 274, FSCK_AUTOFIX) \
x(alloc_key_io_time_bad, 275, 0) \
x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \
x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \
@@ -328,7 +331,7 @@ enum bch_fsck_flags {
x(dirent_stray_data_after_cf_name, 305, 0) \
x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \
x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \
- x(MAX, 319, 0)
+ x(MAX, 321, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index 3398906660a5..6245e342a8a8 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -101,7 +101,7 @@ static int sb_members_v2_resize_entries(struct bch_fs *c)
mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
if (!mi)
- return -BCH_ERR_ENOSPC_sb_members_v2;
+ return bch_err_throw(c, ENOSPC_sb_members_v2);
for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) {
void *dst = (void *) mi->_members + (i * sizeof(struct bch_member));
@@ -325,9 +325,17 @@ static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb,
{
struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
- unsigned i;
- for (i = 0; i < sb->nr_devices; i++)
+ if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) {
+ prt_printf(out, "field ends before start of entries");
+ return;
+ }
+
+ unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / sizeof(mi->_members[0]);
+ if (nr != sb->nr_devices)
+ prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices);
+
+ for (unsigned i = 0; i < min(sb->nr_devices, nr); i++)
member_to_text(out, members_v1_get(mi, i), gi, sb, i);
}
@@ -341,9 +349,27 @@ static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb,
{
struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
- unsigned i;
- for (i = 0; i < sb->nr_devices; i++)
+ if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) {
+ prt_printf(out, "field ends before start of entries");
+ return;
+ }
+
+ if (!le16_to_cpu(mi->member_bytes)) {
+ prt_printf(out, "member_bytes 0");
+ return;
+ }
+
+ unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / le16_to_cpu(mi->member_bytes);
+ if (nr != sb->nr_devices)
+ prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices);
+
+ /*
+ * We call to_text() on superblock sections that haven't passed
+ * validate, so we can't trust sb->nr_devices.
+ */
+
+ for (unsigned i = 0; i < min(sb->nr_devices, nr); i++)
member_to_text(out, members_v2_get(mi, i), gi, sb, i);
}
@@ -378,14 +404,13 @@ void bch2_sb_members_from_cpu(struct bch_fs *c)
{
struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- rcu_read_lock();
+ guard(rcu)();
for_each_member_device_rcu(c, ca, NULL) {
struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx);
for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++)
m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e]));
}
- rcu_read_unlock();
}
void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca)
@@ -443,20 +468,14 @@ void bch2_dev_errors_reset(struct bch_dev *ca)
bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k)
{
- bool ret = true;
- rcu_read_lock();
+ guard(rcu)();
bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- if (!ca)
- continue;
-
- if (!bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) {
- ret = false;
- break;
- }
+ if (ca &&
+ !bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c)))
+ return false;
}
- rcu_read_unlock();
- return ret;
+ return true;
}
static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev,
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index 6bd9b86aee5b..8d8a8a857648 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -28,12 +28,9 @@ static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned);
static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev)
{
- rcu_read_lock();
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu(c, dev);
- bool ret = ca && bch2_dev_is_online(ca);
- rcu_read_unlock();
-
- return ret;
+ return ca && bch2_dev_is_online(ca);
}
static inline bool bch2_dev_is_healthy(struct bch_dev *ca)
@@ -142,12 +139,10 @@ static inline void bch2_dev_put(struct bch_dev *ca)
static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca)
{
- rcu_read_lock();
+ guard(rcu)();
bch2_dev_put(ca);
if ((ca = __bch2_next_dev(c, ca, NULL)))
bch2_dev_get(ca);
- rcu_read_unlock();
-
return ca;
}
@@ -166,7 +161,7 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
unsigned state_mask,
int rw, unsigned ref_idx)
{
- rcu_read_lock();
+ guard(rcu)();
if (ca)
enumerated_ref_put(&ca->io_ref[rw], ref_idx);
@@ -174,7 +169,6 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
(!((1 << ca->mi.state) & state_mask) ||
!enumerated_ref_tryget(&ca->io_ref[rw], ref_idx)))
;
- rcu_read_unlock();
return ca;
}
@@ -239,11 +233,10 @@ static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev)
static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev)
{
- rcu_read_lock();
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
if (ca)
bch2_dev_get(ca);
- rcu_read_unlock();
return ca;
}
@@ -299,19 +292,16 @@ static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev,
{
might_sleep();
- rcu_read_lock();
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu(c, dev);
- if (ca && !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx))
- ca = NULL;
- rcu_read_unlock();
+ if (!ca || !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx))
+ return NULL;
- if (ca &&
- (ca->mi.state == BCH_MEMBER_STATE_rw ||
- (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)))
+ if (ca->mi.state == BCH_MEMBER_STATE_rw ||
+ (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))
return ca;
- if (ca)
- enumerated_ref_put(&ca->io_ref[rw], ref_idx);
+ enumerated_ref_put(&ca->io_ref[rw], ref_idx);
return NULL;
}
diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c
index 7c403427fbdb..538c324f4765 100644
--- a/fs/bcachefs/six.c
+++ b/fs/bcachefs/six.c
@@ -339,12 +339,9 @@ static inline bool six_owner_running(struct six_lock *lock)
* acquiring the lock and setting the owner field. If we're an RT task
* that will live-lock because we won't let the owner complete.
*/
- rcu_read_lock();
+ guard(rcu)();
struct task_struct *owner = READ_ONCE(lock->owner);
- bool ret = owner ? owner_on_cpu(owner) : !rt_or_dl_task(current);
- rcu_read_unlock();
-
- return ret;
+ return owner ? owner_on_cpu(owner) : !rt_or_dl_task(current);
}
static inline bool six_optimistic_spin(struct six_lock *lock,
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 00d62d1190ef..4c43d2a2c1f5 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -54,7 +54,7 @@ int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
BTREE_ITER_with_updates, snapshot_tree, s);
if (bch2_err_matches(ret, ENOENT))
- ret = -BCH_ERR_ENOENT_snapshot_tree;
+ ret = bch_err_throw(trans->c, ENOENT_snapshot_tree);
return ret;
}
@@ -67,7 +67,7 @@ __bch2_snapshot_tree_create(struct btree_trans *trans)
struct bkey_i_snapshot_tree *s_t;
if (ret == -BCH_ERR_ENOSPC_btree_slot)
- ret = -BCH_ERR_ENOSPC_snapshot_tree;
+ ret = bch_err_throw(trans->c, ENOSPC_snapshot_tree);
if (ret)
return ERR_PTR(ret);
@@ -105,11 +105,8 @@ static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id,
static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
{
- rcu_read_lock();
- bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
- rcu_read_unlock();
-
- return ret;
+ guard(rcu)();
+ return __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
}
static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
@@ -138,28 +135,25 @@ static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
- bool ret;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ u32 orig_id = id;
+#endif
- rcu_read_lock();
+ guard(rcu)();
struct snapshot_table *t = rcu_dereference(c->snapshots);
- if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
- ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
- goto out;
- }
+ if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots))
+ return __bch2_snapshot_is_ancestor_early(t, id, ancestor);
if (likely(ancestor >= IS_ANCESTOR_BITMAP))
while (id && id < ancestor - IS_ANCESTOR_BITMAP)
id = get_ancestor_below(t, id, ancestor);
- ret = id && id < ancestor
+ bool ret = id && id < ancestor
? test_ancestor_bitmap(t, id, ancestor)
: id == ancestor;
- EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
-out:
- rcu_read_unlock();
-
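+	/* id was advanced above: check against the id we were called with */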
+ EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, orig_id, ancestor));
return ret;
}
@@ -293,7 +287,7 @@ static int bch2_snapshot_table_make_room(struct bch_fs *c, u32 id)
mutex_lock(&c->snapshot_table_lock);
int ret = snapshot_t_mut(c, id)
? 0
- : -BCH_ERR_ENOMEM_mark_snapshot;
+ : bch_err_throw(c, ENOMEM_mark_snapshot);
mutex_unlock(&c->snapshot_table_lock);
return ret;
}
@@ -312,7 +306,7 @@ static int __bch2_mark_snapshot(struct btree_trans *trans,
t = snapshot_t_mut(c, id);
if (!t) {
- ret = -BCH_ERR_ENOMEM_mark_snapshot;
+ ret = bch_err_throw(c, ENOMEM_mark_snapshot);
goto err;
}
@@ -412,10 +406,10 @@ static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
u32 bch2_snapshot_oldest_subvol(struct bch_fs *c, u32 snapshot_root,
snapshot_id_list *skip)
{
+ guard(rcu)();
u32 id, subvol = 0, s;
retry:
id = snapshot_root;
- rcu_read_lock();
while (id && bch2_snapshot_exists(c, id)) {
if (!(skip && snapshot_list_has_id(skip, id))) {
s = snapshot_t(c, id)->subvol;
@@ -427,7 +421,6 @@ retry:
if (id == snapshot_root)
break;
}
- rcu_read_unlock();
if (!subvol && skip) {
skip = NULL;
@@ -617,18 +610,14 @@ static int snapshot_tree_ptr_good(struct btree_trans *trans,
u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
{
- const struct snapshot_t *s;
-
if (!id)
return 0;
- rcu_read_lock();
- s = snapshot_t(c, id);
- if (s->parent)
- id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
- rcu_read_unlock();
-
- return id;
+ guard(rcu)();
+ const struct snapshot_t *s = snapshot_t(c, id);
+ return s->parent
+ ? bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth))
+ : id;
}
static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
@@ -882,7 +871,8 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id)
for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN,
0, k, ret) {
- if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
+ if (k.k->type == KEY_TYPE_snapshot_tree &&
+ le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
tree_id = k.k->p.offset;
break;
}
@@ -910,7 +900,8 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id)
for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
0, k, ret) {
- if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
+ if (k.k->type == KEY_TYPE_subvolume &&
+ le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
snapshot->v.subvol = cpu_to_le32(k.k->p.offset);
SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true);
break;
@@ -947,10 +938,7 @@ static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpo
static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
{
- darray_for_each(*l, i)
- if (snapshot_list_has_id(r, *i))
- return true;
- return false;
+ return darray_find_p(*l, i, snapshot_list_has_id(r, *i)) != NULL;
}
static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
@@ -1022,7 +1010,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c)
"snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
if (t->nr > 1) {
bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
- ret = -BCH_ERR_fsck_repair_unimplemented;
+ ret = bch_err_throw(c, fsck_repair_unimplemented);
goto err;
}
@@ -1061,24 +1049,73 @@ int __bch2_check_key_has_snapshot(struct btree_trans *trans,
ret = bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_internal_snapshot_node) ?: 1;
- /*
- * Snapshot missing: we should have caught this with btree_lost_data and
- * kicked off reconstruct_snapshots, so if we end up here we have no
- * idea what happened:
- */
- if (fsck_err_on(state == SNAPSHOT_ID_empty,
- trans, bkey_in_missing_snapshot,
- "key in missing snapshot %s, delete?",
- (bch2_btree_id_to_text(&buf, iter->btree_id),
- prt_char(&buf, ' '),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- ret = bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_internal_snapshot_node) ?: 1;
+ if (state == SNAPSHOT_ID_empty) {
+ /*
+ * Snapshot missing: we should have caught this with btree_lost_data and
+ * kicked off reconstruct_snapshots, so if we end up here we have no
+ * idea what happened.
+ *
+ * Do not delete unless we know that subvolumes and snapshots
+ * are consistent:
+ *
+ * XXX:
+ *
+ * We could be smarter here, and instead of using the generic
+ * recovery pass ratelimiting, track if there have been any
+ * changes to the snapshots or inodes btrees since those passes
+ * last ran.
+ */
+ ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_snapshots) ?: ret;
+ ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_subvols) ?: ret;
+
+ if (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots))
+ ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret;
+
+ unsigned repair_flags = FSCK_CAN_IGNORE | (!ret ? FSCK_CAN_FIX : 0);
+
+ if (__fsck_err(trans, repair_flags, bkey_in_missing_snapshot,
+ "key in missing snapshot %s, delete?",
+ (bch2_btree_id_to_text(&buf, iter->btree_id),
+ prt_char(&buf, ' '),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_internal_snapshot_node) ?: 1;
+ }
+ }
fsck_err:
printbuf_exit(&buf);
return ret;
}
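+
+/*
+ * Collect the IDs of snapshots in which the key at @pos has been overwritten:
+ * keys at the same inode:offset in snapshots descended from pos.snapshot.
+ * On error, @s is freed before returning.
+ */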
+int __bch2_get_snapshot_overwrites(struct btree_trans *trans,
+ enum btree_id btree, struct bpos pos,
+ snapshot_id_list *s)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ for_each_btree_key_reverse_norestart(trans, iter, btree, bpos_predecessor(pos),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (!bkey_eq(k.k->p, pos))
+ break;
+
+ if (!bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) ||
+ snapshot_list_has_ancestor(c, s, k.k->p.snapshot))
+ continue;
+
+ ret = snapshot_list_add(c, s, k.k->p.snapshot);
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ if (ret)
+ darray_exit(s);
+
+ return ret;
+}
+
/*
* Mark a snapshot as deleted, for future cleanup:
*/
@@ -1263,7 +1300,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
goto err;
if (!k.k || !k.k->p.offset) {
- ret = -BCH_ERR_ENOSPC_snapshot_create;
+ ret = bch_err_throw(c, ENOSPC_snapshot_create);
goto err;
}
@@ -1399,10 +1436,8 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id)
{
- darray_for_each(*l, i)
- if (i->id == id)
- return i->live_child;
- return 0;
+ struct snapshot_interior_delete *i = darray_find_p(*l, i, i->id == id);
+ return i ? i->live_child : 0;
}
static unsigned __live_child(struct snapshot_table *t, u32 id,
@@ -1434,11 +1469,9 @@ static unsigned live_child(struct bch_fs *c, u32 id)
{
struct snapshot_delete *d = &c->snapshot_delete;
- rcu_read_lock();
- u32 ret = __live_child(rcu_dereference(c->snapshots), id,
- &d->delete_leaves, &d->delete_interior);
- rcu_read_unlock();
- return ret;
+ guard(rcu)();
+ return __live_child(rcu_dereference(c->snapshots), id,
+ &d->delete_leaves, &d->delete_interior);
}
static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id)
@@ -1695,7 +1728,7 @@ static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s
static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
interior_delete_list *skip)
{
- rcu_read_lock();
+ guard(rcu)();
while (interior_delete_has_id(skip, id))
id = __bch2_snapshot_parent(c, id);
@@ -1704,7 +1737,6 @@ static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
id = __bch2_snapshot_parent(c, id);
} while (interior_delete_has_id(skip, id));
}
- rcu_read_unlock();
return id;
}
@@ -1870,6 +1902,8 @@ err:
d->running = false;
mutex_unlock(&d->progress_lock);
bch2_trans_put(trans);
+
+ bch2_recovery_pass_set_no_ratelimit(c, BCH_RECOVERY_PASS_check_snapshots);
out_unlock:
mutex_unlock(&d->lock);
if (!bch2_err_matches(ret, EROFS))
@@ -1905,7 +1939,7 @@ void bch2_delete_dead_snapshots_async(struct bch_fs *c)
BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
- if (!queue_work(c->write_ref_wq, &c->snapshot_delete.work))
+ if (!queue_work(system_long_wq, &c->snapshot_delete.work))
enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
}
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index 382a171f5413..6766bf673ed9 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -46,12 +46,9 @@ static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
{
- rcu_read_lock();
+ guard(rcu)();
const struct snapshot_t *s = snapshot_t(c, id);
- id = s ? s->tree : 0;
- rcu_read_unlock();
-
- return id;
+ return s ? s->tree : 0;
}
static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
@@ -62,11 +59,8 @@ static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
{
- rcu_read_lock();
- id = __bch2_snapshot_parent_early(c, id);
- rcu_read_unlock();
-
- return id;
+ guard(rcu)();
+ return __bch2_snapshot_parent_early(c, id);
}
static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
@@ -88,20 +82,15 @@ static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
{
- rcu_read_lock();
- id = __bch2_snapshot_parent(c, id);
- rcu_read_unlock();
-
- return id;
+ guard(rcu)();
+ return __bch2_snapshot_parent(c, id);
}
static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n)
{
- rcu_read_lock();
+ guard(rcu)();
while (n--)
id = __bch2_snapshot_parent(c, id);
- rcu_read_unlock();
-
return id;
}
@@ -110,13 +99,11 @@ u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32);
static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
{
- u32 parent;
+ guard(rcu)();
- rcu_read_lock();
+ u32 parent;
while ((parent = __bch2_snapshot_parent(c, id)))
id = parent;
- rcu_read_unlock();
-
return id;
}
@@ -128,11 +115,8 @@ static inline enum snapshot_id_state __bch2_snapshot_id_state(struct bch_fs *c,
static inline enum snapshot_id_state bch2_snapshot_id_state(struct bch_fs *c, u32 id)
{
- rcu_read_lock();
- enum snapshot_id_state ret = __bch2_snapshot_id_state(c, id);
- rcu_read_unlock();
-
- return ret;
+ guard(rcu)();
+ return __bch2_snapshot_id_state(c, id);
}
static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id)
@@ -142,12 +126,9 @@ static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id)
static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
{
- rcu_read_lock();
+ guard(rcu)();
const struct snapshot_t *s = snapshot_t(c, id);
- int ret = s ? s->children[0] : -BCH_ERR_invalid_snapshot_node;
- rcu_read_unlock();
-
- return ret;
+ return s ? s->children[0] : -BCH_ERR_invalid_snapshot_node;
}
static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
@@ -160,13 +141,8 @@ static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent)
{
- u32 depth;
-
- rcu_read_lock();
- depth = parent ? snapshot_t(c, parent)->depth + 1 : 0;
- rcu_read_unlock();
-
- return depth;
+ guard(rcu)();
+ return parent ? snapshot_t(c, parent)->depth + 1 : 0;
}
bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32);
@@ -180,20 +156,14 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances
static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
{
- rcu_read_lock();
+ guard(rcu)();
const struct snapshot_t *t = snapshot_t(c, id);
- bool ret = t && (t->children[0]|t->children[1]) != 0;
- rcu_read_unlock();
-
- return ret;
+ return t && (t->children[0]|t->children[1]) != 0;
}
static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
{
- darray_for_each(*s, i)
- if (*i == id)
- return true;
- return false;
+ return darray_find(*s, id) != NULL;
}
static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id)
@@ -258,6 +228,25 @@ static inline int bch2_check_key_has_snapshot(struct btree_trans *trans,
: __bch2_check_key_has_snapshot(trans, iter, k);
}
+int __bch2_get_snapshot_overwrites(struct btree_trans *,
+ enum btree_id, struct bpos,
+ snapshot_id_list *);
+
+/*
+ * Get a list of snapshot IDs that have overwritten a given key:
+ */
+static inline int bch2_get_snapshot_overwrites(struct btree_trans *trans,
+ enum btree_id btree, struct bpos pos,
+ snapshot_id_list *s)
+{
+ darray_init(s);
+
+ return bch2_snapshot_has_children(trans->c, pos.snapshot)
+ ? __bch2_get_snapshot_overwrites(trans, btree, pos, s)
+ : 0;
+}
+
int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos);
diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c
index 0cbf5508a32c..3e9f59226bdf 100644
--- a/fs/bcachefs/str_hash.c
+++ b/fs/bcachefs/str_hash.c
@@ -31,14 +31,16 @@ static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dir
}
}
-static noinline int fsck_rename_dirent(struct btree_trans *trans,
- struct snapshots_seen *s,
- const struct bch_hash_desc desc,
- struct bch_hash_info *hash_info,
- struct bkey_s_c_dirent old)
+static int bch2_fsck_rename_dirent(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct bkey_s_c_dirent old,
+ bool *updated_before_k_pos)
{
+ struct bch_fs *c = trans->c;
struct qstr old_name = bch2_dirent_get_name(old);
- struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32);
+ struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64));
int ret = PTR_ERR_OR_ZERO(new);
if (ret)
return ret;
@@ -47,28 +49,39 @@ static noinline int fsck_rename_dirent(struct btree_trans *trans,
dirent_copy_target(new, old);
new->k.p = old.k->p;
+ char *renamed_buf = bch2_trans_kmalloc(trans, old_name.len + 20);
+ ret = PTR_ERR_OR_ZERO(renamed_buf);
+ if (ret)
+ return ret;
+
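+	/* Find an unused ".fsck_renamed-<n>" name; give up after 1000 tries */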
for (unsigned i = 0; i < 1000; i++) {
- unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u",
- old_name.len, old_name.name, i);
- unsigned u64s = BKEY_U64s + dirent_val_u64s(len, 0);
+ new->k.u64s = BKEY_U64s_MAX;
- if (u64s > U8_MAX)
- return -EINVAL;
+ struct qstr renamed_name = (struct qstr) QSTR_INIT(renamed_buf,
+ sprintf(renamed_buf, "%.*s.fsck_renamed-%u",
+ old_name.len, old_name.name, i));
- new->k.u64s = u64s;
+ ret = bch2_dirent_init_name(c, new, hash_info, &renamed_name, NULL);
+ if (ret)
+ return ret;
ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
(subvol_inum) { 0, old.k->p.inode },
old.k->p.snapshot, &new->k_i,
- BTREE_UPDATE_internal_snapshot_node);
- if (!bch2_err_matches(ret, EEXIST))
+ BTREE_UPDATE_internal_snapshot_node|
+ STR_HASH_must_create);
+ if (ret && !bch2_err_matches(ret, EEXIST))
break;
+ if (!ret) {
+ if (bpos_lt(new->k.p, old.k->p))
+ *updated_before_k_pos = true;
+ break;
+ }
}
- if (ret)
- return ret;
-
- return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i);
+ ret = ret ?: bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i);
+ bch_err_fn(c, ret);
+ return ret;
}
static noinline int hash_pick_winner(struct btree_trans *trans,
@@ -186,7 +199,7 @@ int bch2_repair_inode_hash_info(struct btree_trans *trans,
#endif
bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
- ret = -BCH_ERR_fsck_repair_unimplemented;
+ ret = bch_err_throw(c, fsck_repair_unimplemented);
goto err;
}
@@ -221,11 +234,115 @@ static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans
return ret;
}
+/* Put a str_hash key in its proper location, checking for duplicates */
+int bch2_str_hash_repair_key(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc *desc,
+ struct bch_hash_info *hash_info,
+ struct btree_iter *k_iter, struct bkey_s_c k,
+ struct btree_iter *dup_iter, struct bkey_s_c dup_k,
+ bool *updated_before_k_pos)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ bool free_snapshots_seen = false;
+ int ret = 0;
+
+ if (!s) {
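+		/*
+		 * Caller didn't pass a snapshots_seen: build the overwrite
+		 * list ourselves, and free it on the way out.
+		 */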
+ s = bch2_trans_kmalloc(trans, sizeof(*s));
+ ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ goto out;
+
+ s->pos = k_iter->pos;
+ darray_init(&s->ids);
+
+ ret = bch2_get_snapshot_overwrites(trans, desc->btree_id, k_iter->pos, &s->ids);
+ if (ret)
+ goto out;
+
+ free_snapshots_seen = true;
+ }
+
+ if (!dup_k.k) {
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto out;
+
+ dup_k = bch2_hash_set_or_get_in_snapshot(trans, dup_iter, *desc, hash_info,
+ (subvol_inum) { 0, new->k.p.inode },
+ new->k.p.snapshot, new,
+ STR_HASH_must_create|
+ BTREE_ITER_with_updates|
+ BTREE_UPDATE_internal_snapshot_node);
+ ret = bkey_err(dup_k);
+ if (ret)
+ goto out;
+ if (dup_k.k)
+ goto duplicate_entries;
+
+ if (bpos_lt(new->k.p, k.k->p))
+ *updated_before_k_pos = true;
+
+ ret = bch2_insert_snapshot_whiteouts(trans, desc->btree_id,
+ k_iter->pos, new->k.p) ?:
+ bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
+ BTREE_ITER_with_updates|
+ BTREE_UPDATE_internal_snapshot_node) ?:
+ bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ -BCH_ERR_transaction_restart_commit;
+ } else {
+duplicate_entries:
+ ret = hash_pick_winner(trans, *desc, hash_info, k, dup_k);
+ if (ret < 0)
+ goto out;
+
+ if (!fsck_err(trans, hash_table_key_duplicate,
+ "duplicate hash table keys%s:\n%s",
+ ret != 2 ? "" : ", both point to valid inodes",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k),
+ prt_newline(&buf),
+ bch2_bkey_val_to_text(&buf, c, dup_k),
+ buf.buf)))
+ goto out;
+
+ switch (ret) {
+ case 0:
+ ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
+ break;
+ case 1:
+ ret = bch2_hash_delete_at(trans, *desc, hash_info, dup_iter, 0);
+ break;
+ case 2:
+ ret = bch2_fsck_rename_dirent(trans, s, *desc, hash_info,
+ bkey_s_c_to_dirent(k),
+ updated_before_k_pos) ?:
+ bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
+ BTREE_ITER_with_updates);
+ goto out;
+ }
+
+ ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
+ -BCH_ERR_transaction_restart_commit;
+ }
+out:
+fsck_err:
+ bch2_trans_iter_exit(trans, dup_iter);
+ printbuf_exit(&buf);
+ if (free_snapshots_seen)
+ darray_exit(&s->ids);
+ return ret;
+}
+
int __bch2_str_hash_check_key(struct btree_trans *trans,
struct snapshots_seen *s,
const struct bch_hash_desc *desc,
struct bch_hash_info *hash_info,
- struct btree_iter *k_iter, struct bkey_s_c hash_k)
+ struct btree_iter *k_iter, struct bkey_s_c hash_k,
+ bool *updated_before_k_pos)
{
struct bch_fs *c = trans->c;
struct btree_iter iter = {};
@@ -239,24 +356,31 @@ int __bch2_str_hash_check_key(struct btree_trans *trans,
for_each_btree_key_norestart(trans, iter, desc->btree_id,
SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
- BTREE_ITER_slots, k, ret) {
+ BTREE_ITER_slots|
+ BTREE_ITER_with_updates, k, ret) {
if (bkey_eq(k.k->p, hash_k.k->p))
break;
if (k.k->type == desc->key_type &&
- !desc->cmp_bkey(k, hash_k))
- goto duplicate_entries;
+ !desc->cmp_bkey(k, hash_k)) {
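+			/* Duplicate: recheck hash_info against the root inode, then repair */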
+ ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode,
+ hash_info) ?:
+ bch2_str_hash_repair_key(trans, s, desc, hash_info,
+ k_iter, hash_k,
+ &iter, k, updated_before_k_pos);
+ break;
+ }
- if (bkey_deleted(k.k)) {
- bch2_trans_iter_exit(trans, &iter);
+ if (bkey_deleted(k.k))
goto bad_hash;
- }
}
-out:
bch2_trans_iter_exit(trans, &iter);
+out:
+fsck_err:
printbuf_exit(&buf);
return ret;
bad_hash:
+ bch2_trans_iter_exit(trans, &iter);
/*
* Before doing any repair, check hash_info itself:
*/
@@ -265,64 +389,12 @@ bad_hash:
goto out;
if (fsck_err(trans, hash_table_key_wrong_offset,
- "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
- bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k);
- if (IS_ERR(new))
- return PTR_ERR(new);
-
- k = bch2_hash_set_or_get_in_snapshot(trans, &iter, *desc, hash_info,
- (subvol_inum) { 0, hash_k.k->p.inode },
- hash_k.k->p.snapshot, new,
- STR_HASH_must_create|
- BTREE_ITER_with_updates|
- BTREE_UPDATE_internal_snapshot_node);
- ret = bkey_err(k);
- if (ret)
- goto out;
- if (k.k)
- goto duplicate_entries;
-
- ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
- BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- -BCH_ERR_transaction_restart_nested;
- goto out;
- }
-fsck_err:
- goto out;
-duplicate_entries:
- ret = hash_pick_winner(trans, *desc, hash_info, hash_k, k);
- if (ret < 0)
- goto out;
-
- if (!fsck_err(trans, hash_table_key_duplicate,
- "duplicate hash table keys%s:\n%s",
- ret != 2 ? "" : ", both point to valid inodes",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, hash_k),
- prt_newline(&buf),
- bch2_bkey_val_to_text(&buf, c, k),
- buf.buf)))
- goto out;
-
- switch (ret) {
- case 0:
- ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
- break;
- case 1:
- ret = bch2_hash_delete_at(trans, *desc, hash_info, &iter, 0);
- break;
- case 2:
- ret = fsck_rename_dirent(trans, s, *desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?:
- bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
- goto out;
- }
-
- ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
- -BCH_ERR_transaction_restart_nested;
+ "hash table key at wrong offset: should be at %llu\n%s",
+ hash,
+ (bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf)))
+ ret = bch2_str_hash_repair_key(trans, s, desc, hash_info,
+ k_iter, hash_k,
+ &iter, bkey_s_c_null,
+ updated_before_k_pos);
goto out;
}
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 6762b3627e1b..8979ac2d7a3b 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -48,9 +48,7 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
struct bch_hash_info info = {
.inum_snapshot = bi->bi_snapshot,
.type = INODE_STR_HASH(bi),
-#ifdef CONFIG_UNICODE
.cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL,
-#endif
.siphash_key = { .k0 = bi->bi_hash_seed }
};
@@ -261,6 +259,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans,
struct bkey_i *insert,
enum btree_iter_update_trigger_flags flags)
{
+ struct bch_fs *c = trans->c;
struct btree_iter slot = {};
struct bkey_s_c k;
bool found = false;
@@ -288,7 +287,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans,
}
if (!ret)
- ret = -BCH_ERR_ENOSPC_str_hash_create;
+ ret = bch_err_throw(c, ENOSPC_str_hash_create);
out:
bch2_trans_iter_exit(trans, &slot);
bch2_trans_iter_exit(trans, iter);
@@ -300,7 +299,7 @@ not_found:
bch2_trans_iter_exit(trans, &slot);
return k;
} else if (!found && (flags & STR_HASH_must_replace)) {
- ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
+ ret = bch_err_throw(c, ENOENT_str_hash_set_must_replace);
} else {
if (!found && slot.path)
swap(*iter, slot);
@@ -328,7 +327,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans,
return ret;
if (k.k) {
bch2_trans_iter_exit(trans, &iter);
- return -BCH_ERR_EEXIST_str_hash_set;
+ return bch_err_throw(trans->c, EEXIST_str_hash_set);
}
return 0;
@@ -397,17 +396,27 @@ int bch2_hash_delete(struct btree_trans *trans,
int bch2_repair_inode_hash_info(struct btree_trans *, struct bch_inode_unpacked *);
struct snapshots_seen;
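+/*
+ * The updated_before_k_pos out-parameter is set when repair creates or moves
+ * a key to a position before the key currently being checked.
+ */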
+int bch2_str_hash_repair_key(struct btree_trans *,
+ struct snapshots_seen *,
+ const struct bch_hash_desc *,
+ struct bch_hash_info *,
+ struct btree_iter *, struct bkey_s_c,
+ struct btree_iter *, struct bkey_s_c,
+ bool *);
+
int __bch2_str_hash_check_key(struct btree_trans *,
struct snapshots_seen *,
const struct bch_hash_desc *,
struct bch_hash_info *,
- struct btree_iter *, struct bkey_s_c);
+ struct btree_iter *, struct bkey_s_c,
+ bool *);
static inline int bch2_str_hash_check_key(struct btree_trans *trans,
struct snapshots_seen *s,
const struct bch_hash_desc *desc,
struct bch_hash_info *hash_info,
- struct btree_iter *k_iter, struct bkey_s_c hash_k)
+ struct btree_iter *k_iter, struct bkey_s_c hash_k,
+ bool *updated_before_k_pos)
{
if (hash_k.k->type != desc->key_type)
return 0;
@@ -415,7 +424,8 @@ static inline int bch2_str_hash_check_key(struct btree_trans *trans,
if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset))
return 0;
- return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k);
+ return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k,
+ updated_before_k_pos);
}
#endif /* _BCACHEFS_STR_HASH_H */
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index 35c9f86a73c1..020587449123 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -130,10 +130,20 @@ static int check_subvol(struct btree_trans *trans,
"subvolume %llu points to missing subvolume root %llu:%u",
k.k->p.offset, le64_to_cpu(subvol.v->inode),
le32_to_cpu(subvol.v->snapshot))) {
- ret = bch2_subvolume_delete(trans, iter->pos.offset);
- bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
- ret = ret ?: -BCH_ERR_transaction_restart_nested;
- goto err;
+ /*
+ * Recreate - any contents that are still disconnected
+ * will then get reattached under lost+found
+ */
+ bch2_inode_init_early(c, &inode);
+ bch2_inode_init_late(c, &inode, bch2_current_time(c),
+ 0, 0, S_IFDIR|0700, 0, NULL);
+ inode.bi_inum = le64_to_cpu(subvol.v->inode);
+ inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot);
+ inode.bi_subvol = k.k->p.offset;
+ inode.bi_parent_subvol = le32_to_cpu(subvol.v->fs_path_parent);
+ ret = __bch2_fsck_write_inode(trans, &inode);
+ if (ret)
+ goto err;
}
} else {
goto err;
@@ -141,13 +151,9 @@ static int check_subvol(struct btree_trans *trans,
if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
- u32 snapshot_tree;
- struct bch_snapshot_tree st;
-
- rcu_read_lock();
- snapshot_tree = snapshot_t(c, snapshot_root)->tree;
- rcu_read_unlock();
+ u32 snapshot_tree = bch2_snapshot_tree(c, snapshot_root);
+ struct bch_snapshot_tree st;
ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
@@ -259,6 +265,13 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent));
prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent));
}
+
+ if (BCH_SUBVOLUME_RO(s.v))
+ prt_printf(out, " ro");
+ if (BCH_SUBVOLUME_SNAP(s.v))
+ prt_printf(out, " snapshot");
+ if (BCH_SUBVOLUME_UNLINKED(s.v))
+ prt_printf(out, " unlinked");
}
static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set)
@@ -486,9 +499,12 @@ err:
static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
{
- return bch2_subvolumes_reparent(trans, subvolid) ?:
+ int ret = bch2_subvolumes_reparent(trans, subvolid) ?:
commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_subvolume_delete(trans, subvolid));
+
+ bch2_recovery_pass_set_no_ratelimit(trans->c, BCH_RECOVERY_PASS_check_subvols);
+ return ret;
}
static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
@@ -597,7 +613,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
ret = bch2_bkey_get_empty_slot(trans, &dst_iter,
BTREE_ID_subvolumes, POS(0, U32_MAX));
if (ret == -BCH_ERR_ENOSPC_btree_slot)
- ret = -BCH_ERR_ENOSPC_subvolume_create;
+ ret = bch_err_throw(c, ENOSPC_subvolume_create);
if (ret)
return ret;
@@ -703,8 +719,9 @@ static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
return ret;
if (!bkey_is_inode(k.k)) {
- bch_err(trans->c, "root inode not found");
- ret = -BCH_ERR_ENOENT_inode;
+ struct bch_fs *c = trans->c;
+ bch_err(c, "root inode not found");
+ ret = bch_err_throw(c, ENOENT_inode);
goto err;
}
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 6687b9235d3c..6c2e1d647403 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -1112,7 +1112,7 @@ int bch2_write_super(struct bch_fs *c)
prt_str(&buf, ")");
bch2_fs_fatal_error(c, ": %s", buf.buf);
printbuf_exit(&buf);
- ret = -BCH_ERR_sb_not_downgraded;
+ ret = bch_err_throw(c, sb_not_downgraded);
goto out;
}
@@ -1142,7 +1142,7 @@ int bch2_write_super(struct bch_fs *c)
if (c->opts.errors != BCH_ON_ERROR_continue &&
c->opts.errors != BCH_ON_ERROR_fix_safe) {
- ret = -BCH_ERR_erofs_sb_err;
+ ret = bch_err_throw(c, erofs_sb_err);
bch2_fs_fatal_error(c, "%s", buf.buf);
} else {
bch_err(c, "%s", buf.buf);
@@ -1161,7 +1161,7 @@ int bch2_write_super(struct bch_fs *c)
ca->disk_sb.seq);
bch2_fs_fatal_error(c, "%s", buf.buf);
printbuf_exit(&buf);
- ret = -BCH_ERR_erofs_sb_err;
+ ret = bch_err_throw(c, erofs_sb_err);
}
}
@@ -1215,7 +1215,7 @@ int bch2_write_super(struct bch_fs *c)
!can_mount_with_written), c,
": Unable to write superblock to sufficient devices (from %ps)",
(void *) _RET_IP_))
- ret = -BCH_ERR_erofs_sb_err;
+ ret = bch_err_throw(c, erofs_sb_err);
out:
/* Make new options visible after they're persistent: */
bch2_sb_update(c);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 11579b74c640..c46b1053a02c 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -104,7 +104,7 @@ const char * const bch2_dev_write_refs[] = {
#undef x
static void __bch2_print_str(struct bch_fs *c, const char *prefix,
- const char *str, bool nonblocking)
+ const char *str)
{
#ifdef __KERNEL__
struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
@@ -114,17 +114,12 @@ static void __bch2_print_str(struct bch_fs *c, const char *prefix,
return;
}
#endif
- bch2_print_string_as_lines(KERN_ERR, str, nonblocking);
+ bch2_print_string_as_lines(KERN_ERR, str);
}
void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str)
{
- __bch2_print_str(c, prefix, str, false);
-}
-
-void bch2_print_str_nonblocking(struct bch_fs *c, const char *prefix, const char *str)
-{
- __bch2_print_str(c, prefix, str, true);
+ __bch2_print_str(c, prefix, str);
}
__printf(2, 0)
@@ -215,27 +210,20 @@ static int bch2_dev_alloc(struct bch_fs *, unsigned);
static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
static void bch2_dev_io_ref_stop(struct bch_dev *, int);
static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
-static int bch2_fs_init_rw(struct bch_fs *);
struct bch_fs *bch2_dev_to_fs(dev_t dev)
{
- struct bch_fs *c;
-
- mutex_lock(&bch_fs_list_lock);
- rcu_read_lock();
+ guard(mutex)(&bch_fs_list_lock);
+ guard(rcu)();
+ struct bch_fs *c;
list_for_each_entry(c, &bch_fs_list, list)
for_each_member_device_rcu(c, ca, NULL)
if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) {
closure_get(&c->cl);
- goto found;
+ return c;
}
- c = NULL;
-found:
- rcu_read_unlock();
- mutex_unlock(&bch_fs_list_lock);
-
- return c;
+ return NULL;
}
static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid)
@@ -480,16 +468,16 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
if (WARN_ON(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))
- return -BCH_ERR_erofs_no_alloc_info;
+ return bch_err_throw(c, erofs_no_alloc_info);
if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) {
bch_err(c, "cannot go rw, unfixed btree errors");
- return -BCH_ERR_erofs_unfixed_errors;
+ return bch_err_throw(c, erofs_unfixed_errors);
}
if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
bch_err(c, "cannot go rw, filesystem is an unresized image file");
- return -BCH_ERR_erofs_filesystem_full;
+ return bch_err_throw(c, erofs_filesystem_full);
}
if (test_bit(BCH_FS_rw, &c->flags))
@@ -507,13 +495,12 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
clear_bit(BCH_FS_clean_shutdown, &c->flags);
- rcu_read_lock();
- for_each_online_member_rcu(c, ca)
- if (ca->mi.state == BCH_MEMBER_STATE_rw) {
- bch2_dev_allocator_add(c, ca);
- enumerated_ref_start(&ca->io_ref[WRITE]);
- }
- rcu_read_unlock();
+ scoped_guard(rcu)
+ for_each_online_member_rcu(c, ca)
+ if (ca->mi.state == BCH_MEMBER_STATE_rw) {
+ bch2_dev_allocator_add(c, ca);
+ enumerated_ref_start(&ca->io_ref[WRITE]);
+ }
bch2_recalc_capacity(c);
@@ -571,13 +558,13 @@ int bch2_fs_read_write(struct bch_fs *c)
{
if (c->opts.recovery_pass_last &&
c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay)
- return -BCH_ERR_erofs_norecovery;
+ return bch_err_throw(c, erofs_norecovery);
if (c->opts.nochanges)
- return -BCH_ERR_erofs_nochanges;
+ return bch_err_throw(c, erofs_nochanges);
if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))
- return -BCH_ERR_erofs_no_alloc_info;
+ return bch_err_throw(c, erofs_no_alloc_info);
return __bch2_fs_read_write(c, false);
}
@@ -762,7 +749,7 @@ static int bch2_fs_online(struct bch_fs *c)
if (c->sb.multi_device &&
__bch2_uuid_to_fs(c->sb.uuid)) {
bch_err(c, "filesystem UUID already open");
- return -BCH_ERR_filesystem_uuid_already_open;
+ return bch_err_throw(c, filesystem_uuid_already_open);
}
ret = bch2_fs_chardev_init(c);
@@ -806,7 +793,7 @@ err:
return ret;
}
-static int bch2_fs_init_rw(struct bch_fs *c)
+int bch2_fs_init_rw(struct bch_fs *c)
{
if (test_bit(BCH_FS_rw_init_done, &c->flags))
return 0;
@@ -821,7 +808,7 @@ static int bch2_fs_init_rw(struct bch_fs *c)
WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
!(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
WQ_FREEZABLE, 0)))
- return -BCH_ERR_ENOMEM_fs_other_alloc;
+ return bch_err_throw(c, ENOMEM_fs_other_alloc);
int ret = bch2_fs_btree_interior_update_init(c) ?:
bch2_fs_btree_write_buffer_init(c) ?:
@@ -1002,7 +989,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
c->opts.btree_node_size) ||
mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) {
- ret = -BCH_ERR_ENOMEM_fs_other_alloc;
+ ret = bch_err_throw(c, ENOMEM_fs_other_alloc);
goto err;
}
@@ -1027,21 +1014,29 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
if (ret)
goto err;
+ if (go_rw_in_recovery(c)) {
+ /*
+ * start workqueues/kworkers early - kthread creation checks for
+ * pending signals, which is _very_ annoying
+ */
+ ret = bch2_fs_init_rw(c);
+ if (ret)
+ goto err;
+ }
+
#ifdef CONFIG_UNICODE
- /* Default encoding until we can potentially have more as an option. */
- c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING);
- if (IS_ERR(c->cf_encoding)) {
- printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u",
- unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
- unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
- unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
- ret = -EINVAL;
- goto err;
+ if (bch2_fs_casefold_enabled(c)) {
+ /* Default encoding until we can potentially have more as an option. */
+ c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING);
+ if (IS_ERR(c->cf_encoding)) {
+ printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u",
+ unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
+ unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
+ unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
+ ret = -EINVAL;
+ goto err;
+ }
}
- bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u",
- unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
- unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
- unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
#else
if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) {
printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n");
@@ -1083,12 +1078,13 @@ noinline_for_stack
static void print_mount_opts(struct bch_fs *c)
{
enum bch_opt_id i;
- struct printbuf p = PRINTBUF;
- bool first = true;
+ CLASS(printbuf, p)();
+ bch2_log_msg_start(c, &p);
prt_str(&p, "starting version ");
bch2_version_to_text(&p, c->sb.version);
+ bool first = true;
for (i = 0; i < bch2_opts_nr; i++) {
const struct bch_option *opt = &bch2_opt_table[i];
u64 v = bch2_opt_get_by_id(&c->opts, i);
@@ -1105,17 +1101,24 @@ static void print_mount_opts(struct bch_fs *c)
}
if (c->sb.version_incompat_allowed != c->sb.version) {
- prt_printf(&p, "\n allowing incompatible features above ");
+ prt_printf(&p, "\nallowing incompatible features above ");
bch2_version_to_text(&p, c->sb.version_incompat_allowed);
}
if (c->opts.verbose) {
- prt_printf(&p, "\n features: ");
+ prt_printf(&p, "\nfeatures: ");
prt_bitflags(&p, bch2_sb_features, c->sb.features);
}
- bch_info(c, "%s", p.buf);
- printbuf_exit(&p);
+ if (c->sb.multi_device) {
+ prt_printf(&p, "\nwith devices");
+ for_each_online_member(c, ca, BCH_DEV_READ_REF_bch2_online_devs) {
+ prt_char(&p, ' ');
+ prt_str(&p, ca->name);
+ }
+ }
+
+ bch2_print_str(c, KERN_INFO, p.buf);
}
static bool bch2_fs_may_start(struct bch_fs *c)
@@ -1159,8 +1162,14 @@ int bch2_fs_start(struct bch_fs *c)
print_mount_opts(c);
+ if (c->cf_encoding)
+ bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u",
+ unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
+ unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
+ unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
+
if (!bch2_fs_may_start(c))
- return -BCH_ERR_insufficient_devices_to_start;
+ return bch_err_throw(c, insufficient_devices_to_start);
down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
@@ -1171,7 +1180,7 @@ int bch2_fs_start(struct bch_fs *c)
sizeof(struct bch_sb_field_ext) / sizeof(u64))) {
mutex_unlock(&c->sb_lock);
up_write(&c->state_lock);
- ret = -BCH_ERR_ENOSPC_sb;
+ ret = bch_err_throw(c, ENOSPC_sb);
goto err;
}
@@ -1182,22 +1191,20 @@ int bch2_fs_start(struct bch_fs *c)
goto err;
}
- rcu_read_lock();
- for_each_online_member_rcu(c, ca)
- bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
- cpu_to_le64(now);
- rcu_read_unlock();
+ scoped_guard(rcu)
+ for_each_online_member_rcu(c, ca)
+ bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
+ cpu_to_le64(now);
/*
	 * Don't write superblock yet: recovery might have to downgrade
*/
mutex_unlock(&c->sb_lock);
- rcu_read_lock();
- for_each_online_member_rcu(c, ca)
- if (ca->mi.state == BCH_MEMBER_STATE_rw)
- bch2_dev_allocator_add(c, ca);
- rcu_read_unlock();
+ scoped_guard(rcu)
+ for_each_online_member_rcu(c, ca)
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
+ bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
up_write(&c->state_lock);
@@ -1215,7 +1222,7 @@ int bch2_fs_start(struct bch_fs *c)
goto err;
if (bch2_fs_init_fault("fs_start")) {
- ret = -BCH_ERR_injected_fs_start;
+ ret = bch_err_throw(c, injected_fs_start);
goto err;
}
@@ -1242,11 +1249,11 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
if (le16_to_cpu(sb->block_size) != block_sectors(c))
- return -BCH_ERR_mismatched_block_size;
+ return bch_err_throw(c, mismatched_block_size);
if (le16_to_cpu(m.bucket_size) <
BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
- return -BCH_ERR_bucket_size_too_small;
+ return bch_err_throw(c, bucket_size_too_small);
return 0;
}
@@ -1557,7 +1564,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
bch2_dev_attach(c, ca, dev_idx);
return 0;
err:
- return -BCH_ERR_ENOMEM_dev_alloc;
+ return bch_err_throw(c, ENOMEM_dev_alloc);
}
static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
@@ -1567,13 +1574,13 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
if (bch2_dev_is_online(ca)) {
bch_err(ca, "already have device online in slot %u",
sb->sb->dev_idx);
- return -BCH_ERR_device_already_online;
+ return bch_err_throw(ca->fs, device_already_online);
}
if (get_capacity(sb->bdev->bd_disk) <
ca->mi.bucket_size * ca->mi.nbuckets) {
bch_err(ca, "cannot online: device too small");
- return -BCH_ERR_device_size_too_small;
+ return bch_err_throw(ca->fs, device_size_too_small);
}
BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[READ]));
@@ -1725,7 +1732,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
return 0;
if (!bch2_dev_state_allowed(c, ca, new_state, flags))
- return -BCH_ERR_device_state_not_allowed;
+ return bch_err_throw(c, device_state_not_allowed);
if (new_state != BCH_MEMBER_STATE_rw)
__bch2_dev_read_only(c, ca);
@@ -1778,7 +1785,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
bch_err(ca, "Cannot remove without losing data");
- ret = -BCH_ERR_device_state_not_allowed;
+ ret = bch_err_throw(c, device_state_not_allowed);
goto err;
}
@@ -1914,7 +1921,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (list_empty(&c->list)) {
mutex_lock(&bch_fs_list_lock);
if (__bch2_uuid_to_fs(c->sb.uuid))
- ret = -BCH_ERR_filesystem_uuid_already_open;
+ ret = bch_err_throw(c, filesystem_uuid_already_open);
else
list_add(&c->list, &bch_fs_list);
mutex_unlock(&bch_fs_list_lock);
@@ -2001,6 +2008,22 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto err_late;
}
+ /*
+ * We just changed the superblock UUID, invalidate cache and send a
+ * uevent to update /dev/disk/by-uuid
+ */
+ invalidate_bdev(ca->disk_sb.bdev);
+
+	char uuid_str[42];	/* "UUID=" + 36-char UUID + NUL */
+ snprintf(uuid_str, sizeof(uuid_str), "UUID=%pUb", &c->sb.uuid);
+
+ char *envp[] = {
+ "CHANGE=uuid",
+ uuid_str,
+ NULL,
+ };
+ kobject_uevent_env(&ca->disk_sb.bdev->bd_device.kobj, KOBJ_CHANGE, envp);
+
up_write(&c->state_lock);
out:
printbuf_exit(&label);
@@ -2101,7 +2124,7 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
bch_err(ca, "Cannot offline required disk");
up_write(&c->state_lock);
- return -BCH_ERR_device_state_not_allowed;
+ return bch_err_throw(c, device_state_not_allowed);
}
__bch2_dev_offline(c, ca);
@@ -2140,7 +2163,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) {
bch_err(ca, "New device size too big (%llu greater than max %u)",
nbuckets, BCH_MEMBER_NBUCKETS_MAX);
- ret = -BCH_ERR_device_size_too_big;
+ ret = bch_err_throw(c, device_size_too_big);
goto err;
}
@@ -2148,7 +2171,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
get_capacity(ca->disk_sb.bdev->bd_disk) <
ca->mi.bucket_size * nbuckets) {
bch_err(ca, "New size larger than device");
- ret = -BCH_ERR_device_size_too_small;
+ ret = bch_err_throw(c, device_size_too_small);
goto err;
}
@@ -2383,7 +2406,7 @@ struct bch_fs *bch2_fs_open(darray_const_str *devices,
}
if (opts->nochanges && !opts->read_only) {
- ret = -BCH_ERR_erofs_nochanges;
+ ret = bch_err_throw(c, erofs_nochanges);
goto err_print;
}
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index dc52f06cb2b9..e90bab9afe78 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -46,6 +46,7 @@ void __bch2_fs_stop(struct bch_fs *);
void bch2_fs_free(struct bch_fs *);
void bch2_fs_stop(struct bch_fs *);
+int bch2_fs_init_rw(struct bch_fs *);
int bch2_fs_start(struct bch_fs *);
struct bch_fs *bch2_fs_open(darray_const_str *, struct bch_opts *);
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 1a55196d69f1..05848375cea2 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -26,6 +26,7 @@
#include "disk_groups.h"
#include "ec.h"
#include "enumerated_ref.h"
+#include "error.h"
#include "inode.h"
#include "journal.h"
#include "journal_reclaim.h"
@@ -37,6 +38,7 @@
#include "rebalance.h"
#include "recovery_passes.h"
#include "replicas.h"
+#include "sb-errors.h"
#include "super-io.h"
#include "tests.h"
@@ -143,6 +145,7 @@ do { \
write_attribute(trigger_gc);
write_attribute(trigger_discards);
write_attribute(trigger_invalidates);
+write_attribute(trigger_journal_commit);
write_attribute(trigger_journal_flush);
write_attribute(trigger_journal_writes);
write_attribute(trigger_btree_cache_shrink);
@@ -151,6 +154,7 @@ write_attribute(trigger_btree_updates);
write_attribute(trigger_freelist_wakeup);
write_attribute(trigger_recalc_capacity);
write_attribute(trigger_delete_dead_snapshots);
+write_attribute(trigger_emergency_read_only);
read_attribute(gc_gens_pos);
read_attribute(uuid);
@@ -172,6 +176,7 @@ read_attribute(btree_write_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
+read_attribute(errors);
read_attribute(journal_debug);
read_attribute(btree_cache);
read_attribute(btree_key_cache);
@@ -353,6 +358,9 @@ SHOW(bch2_fs)
if (attr == &sysfs_compression_stats)
bch2_compression_stats_to_text(out, c);
+ if (attr == &sysfs_errors)
+ bch2_fs_errors_to_text(out, c);
+
if (attr == &sysfs_new_stripes)
bch2_new_stripes_to_text(out, c);
@@ -428,6 +436,9 @@ STORE(bch2_fs)
if (attr == &sysfs_trigger_invalidates)
bch2_do_invalidates(c);
+ if (attr == &sysfs_trigger_journal_commit)
+ bch2_journal_flush(&c->journal);
+
if (attr == &sysfs_trigger_journal_flush) {
bch2_journal_flush_all_pins(&c->journal);
bch2_journal_meta(&c->journal);
@@ -448,6 +459,16 @@ STORE(bch2_fs)
if (attr == &sysfs_trigger_delete_dead_snapshots)
__bch2_delete_dead_snapshots(c);
+ if (attr == &sysfs_trigger_emergency_read_only) {
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+
+ prt_printf(&buf, "shutdown by sysfs\n");
+ bch2_fs_emergency_read_only2(c, &buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ }
+
#ifdef CONFIG_BCACHEFS_TESTS
if (attr == &sysfs_perf_test) {
char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
@@ -483,6 +504,7 @@ struct attribute *bch2_fs_files[] = {
&sysfs_recovery_status,
&sysfs_compression_stats,
+ &sysfs_errors,
#ifdef CONFIG_BCACHEFS_TESTS
&sysfs_perf_test,
@@ -571,6 +593,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_trigger_gc,
&sysfs_trigger_discards,
&sysfs_trigger_invalidates,
+ &sysfs_trigger_journal_commit,
&sysfs_trigger_journal_flush,
&sysfs_trigger_journal_writes,
&sysfs_trigger_btree_cache_shrink,
@@ -579,6 +602,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_trigger_freelist_wakeup,
&sysfs_trigger_recalc_capacity,
&sysfs_trigger_delete_dead_snapshots,
+ &sysfs_trigger_emergency_read_only,
&sysfs_gc_gens_pos,
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 8cb5b40704fd..9c5a9c551f03 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -199,6 +199,50 @@ DECLARE_EVENT_CLASS(bio,
(unsigned long long)__entry->sector, __entry->nr_sector)
);
+/* errors */
+
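+/* fires when an error is thrown via bch_err_throw() */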
+TRACE_EVENT(error_throw,
+ TP_PROTO(struct bch_fs *c, int bch_err, unsigned long ip),
+ TP_ARGS(c, bch_err, ip),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(int, err )
+ __array(char, err_str, 32 )
+ __array(char, ip, 32 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->err = bch_err;
+ strscpy(__entry->err_str, bch2_err_str(bch_err), sizeof(__entry->err_str));
+ snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
+ ),
+
+ TP_printk("%d,%d %s ret %s", MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ip, __entry->err_str)
+);
+
+TRACE_EVENT(error_downcast,
+ TP_PROTO(int bch_err, int std_err, unsigned long ip),
+ TP_ARGS(bch_err, std_err, ip),
+
+ TP_STRUCT__entry(
+ __array(char, bch_err, 32 )
+ __array(char, std_err, 32 )
+ __array(char, ip, 32 )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err));
+ strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err));
+ snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
+ ),
+
+ TP_printk("%s ret %s -> %s %s", __entry->ip,
+ __entry->bch_err, __entry->std_err, __entry->ip)
+);
+
/* disk_accounting.c */
TRACE_EVENT(accounting_mem_insert,
@@ -1036,34 +1080,14 @@ TRACE_EVENT(trans_blocked_journal_reclaim,
__entry->must_wait)
);
-TRACE_EVENT(trans_restart_journal_preres_get,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- unsigned flags),
- TP_ARGS(trans, caller_ip, flags),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(unsigned, flags )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->flags = flags;
- ),
-
- TP_printk("%s %pS %x", __entry->trans_fn,
- (void *) __entry->caller_ip,
- __entry->flags)
-);
-
+#if 0
+/* todo: bring back dynamic fault injection */
DEFINE_EVENT(transaction_event, trans_restart_fault_inject,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
TP_ARGS(trans, caller_ip)
);
+#endif
DEFINE_EVENT(transaction_event, trans_traverse_all,
TP_PROTO(struct btree_trans *trans,
@@ -1151,19 +1175,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill,
TP_ARGS(trans, caller_ip, path)
);
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_event, trans_restart_key_cache_upgrade,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
@@ -1185,13 +1196,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent,
TP_ARGS(trans, caller_ip, path)
);
-DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
@@ -1250,44 +1254,6 @@ TRACE_EVENT(trans_restart_mem_realloced,
__entry->bytes)
);
-TRACE_EVENT(trans_restart_key_cache_key_realloced,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path,
- unsigned old_u64s,
- unsigned new_u64s),
- TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(enum btree_id, btree_id )
- TRACE_BPOS_entries(pos)
- __field(u32, old_u64s )
- __field(u32, new_u64s )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
-
- __entry->btree_id = path->btree_id;
- TRACE_BPOS_assign(pos, path->pos);
- __entry->old_u64s = old_u64s;
- __entry->new_u64s = new_u64s;
- ),
-
- TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->old_u64s,
- __entry->new_u64s)
-);
-
DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
@@ -1431,28 +1397,44 @@ DEFINE_EVENT(fs_str, data_update,
TP_ARGS(c, str)
);
+DEFINE_EVENT(fs_str, io_move_pred,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
DEFINE_EVENT(fs_str, io_move_created_rebalance,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
-TRACE_EVENT(error_downcast,
- TP_PROTO(int bch_err, int std_err, unsigned long ip),
- TP_ARGS(bch_err, std_err, ip),
+DEFINE_EVENT(fs_str, io_move_evacuate_bucket,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
- TP_STRUCT__entry(
- __array(char, bch_err, 32 )
- __array(char, std_err, 32 )
- __array(char, ip, 32 )
- ),
+DEFINE_EVENT(fs_str, extent_trim_atomic,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
- TP_fast_assign(
- strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err));
- strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err));
- snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
- ),
+DEFINE_EVENT(fs_str, btree_iter_peek_slot,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+DEFINE_EVENT(fs_str, __btree_iter_peek,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
- TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip)
+DEFINE_EVENT(fs_str, btree_iter_peek_max,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+DEFINE_EVENT(fs_str, btree_iter_peek_prev_min,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
#ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS
@@ -1867,21 +1849,6 @@ TRACE_EVENT(btree_path_free,
__entry->dup_locked)
);
-TRACE_EVENT(btree_path_free_trans_begin,
- TP_PROTO(btree_path_idx_t path),
- TP_ARGS(path),
-
- TP_STRUCT__entry(
- __field(btree_path_idx_t, idx )
- ),
-
- TP_fast_assign(
- __entry->idx = path;
- ),
-
- TP_printk(" path %3u", __entry->idx)
-);
-
#else /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */
#ifndef _TRACE_BCACHEFS_H
@@ -1899,7 +1866,6 @@ static inline void trace_btree_path_traverse_start(struct btree_trans *trans, st
static inline void trace_btree_path_traverse_end(struct btree_trans *trans, struct btree_path *path) {}
static inline void trace_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {}
static inline void trace_btree_path_free(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup) {}
-static inline void trace_btree_path_free_trans_begin(btree_path_idx_t path) {}
#endif
#endif /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index dc3817f545fa..df9a6071fe18 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -262,8 +262,7 @@ static bool string_is_spaces(const char *str)
return true;
}
-void bch2_print_string_as_lines(const char *prefix, const char *lines,
- bool nonblocking)
+void bch2_print_string_as_lines(const char *prefix, const char *lines)
{
bool locked = false;
const char *p;
@@ -273,12 +272,7 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines,
return;
}
- if (!nonblocking) {
- console_lock();
- locked = true;
- } else {
- locked = console_trylock();
- }
+ locked = console_trylock();
while (*lines) {
p = strchrnul(lines, '\n');
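With the nonblocking parameter gone, bch2_print_string_as_lines() takes the trylock path unconditionally and therefore never sleeps; if the console lock is contended the lines are still printed, just without being serialized against other console output. A minimal sketch of the resulting pattern (not the full function):

static void print_lines_sketch(const char *prefix, const char *lines)
{
	bool locked = console_trylock();	/* never sleeps */

	while (*lines) {
		const char *p = strchrnul(lines, '\n');

		printk("%s%.*s\n", prefix, (int)(p - lines), lines);
		lines = *p ? p + 1 : p;
	}
	if (locked)
		console_unlock();
}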
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 25cf61ebd40c..6488f098d140 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -17,6 +17,7 @@
#include <linux/random.h>
#include <linux/ratelimit.h>
#include <linux/slab.h>
+#include <linux/sort.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
@@ -213,7 +214,7 @@ u64 bch2_read_flag_list(const char *, const char * const[]);
void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
void bch2_prt_u64_base2(struct printbuf *, u64);
-void bch2_print_string_as_lines(const char *, const char *, bool);
+void bch2_print_string_as_lines(const char *, const char *);
typedef DARRAY(unsigned long) bch_stacktrace;
int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t);
@@ -672,8 +673,6 @@ static inline void percpu_memset(void __percpu *p, int c, size_t bytes)
u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned);
-#define cmp_int(l, r) ((l > r) - (l < r))
-
static inline int u8_cmp(u8 l, u8 r)
{
return cmp_int(l, r);
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index fa66a09e496a..d33d6bde992b 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -27,7 +27,7 @@ const struct file_operations bfs_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.splice_read = filemap_splice_read,
};
@@ -170,9 +170,10 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to)
truncate_pagecache(inode, inode->i_size);
}
-static int bfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int bfs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
int ret;
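bfs_write_begin() gains the kiocb-based signature used across this series; an implementation that still needs the struct file can recover it from the iocb. A hedged sketch, not part of this patch (bfs_get_block is the filesystem's existing get_block helper; the pr_debug is illustrative only):

static int example_write_begin(const struct kiocb *iocb,
			       struct address_space *mapping,
			       loff_t pos, unsigned len,
			       struct folio **foliop, void **fsdata)
{
	struct file *file = iocb->ki_filp;	/* assumed recovery path */

	pr_debug("%pD: write_begin pos=%lld len=%u\n", file, pos, len);
	return block_write_begin(mapping, pos, len, foliop, bfs_get_block);
}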
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index a43363d593e5..264fba0d44bd 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -519,7 +519,7 @@ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex,
/* Sanity check the number of program headers... */
/* ...and their total size. */
size = sizeof(struct elf_phdr) * elf_ex->e_phnum;
- if (size == 0 || size > 65536 || size > ELF_MIN_ALIGN)
+ if (size == 0 || size > 65536)
goto out;
elf_phdata = kmalloc(size, GFP_KERNEL);
@@ -646,7 +646,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
if (!elf_check_arch(interp_elf_ex) ||
elf_check_fdpic(interp_elf_ex))
goto out;
- if (!interpreter->f_op->mmap)
+ if (!can_mmap_file(interpreter))
goto out;
total_size = total_mapping_size(interp_elf_phdata,
@@ -848,7 +848,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
goto out;
if (elf_check_fdpic(elf_ex))
goto out;
- if (!bprm->file->f_op->mmap)
+ if (!can_mmap_file(bprm->file))
goto out;
elf_phdata = load_elf_phdrs(elf_ex, bprm->file);
@@ -1450,8 +1450,8 @@ static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset)
phdr->p_align = 4;
}
-static void fill_note(struct memelfnote *note, const char *name, int type,
- unsigned int sz, void *data)
+static void __fill_note(struct memelfnote *note, const char *name, int type,
+ unsigned int sz, void *data)
{
note->name = name;
note->type = type;
@@ -1459,6 +1459,9 @@ static void fill_note(struct memelfnote *note, const char *name, int type,
note->data = data;
}
+#define fill_note(note, type, sz, data) \
+ __fill_note(note, NN_ ## type, NT_ ## type, sz, data)
+
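The macro pastes its bare type argument into both the NN_ (note name) and NT_ (note type) namespaces, so a call site can no longer pair a name constant with a mismatched type constant. For example:

/* fill_note(&t->notes[0], PRSTATUS, sz, data) expands to: */
__fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, sz, data);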
/*
* fill up all the fields in prstatus from the given task struct, except
* registers which need to be filled up separately.
@@ -1549,14 +1552,14 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm)
do
i += 2;
while (auxv[i - 2] != AT_NULL);
- fill_note(note, NN_AUXV, NT_AUXV, i * sizeof(elf_addr_t), auxv);
+ fill_note(note, AUXV, i * sizeof(elf_addr_t), auxv);
}
static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
const kernel_siginfo_t *siginfo)
{
copy_siginfo_to_external(csigdata, siginfo);
- fill_note(note, NN_SIGINFO, NT_SIGINFO, sizeof(*csigdata), csigdata);
+ fill_note(note, SIGINFO, sizeof(*csigdata), csigdata);
}
/*
@@ -1652,7 +1655,7 @@ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm
}
size = name_curpos - (char *)data;
- fill_note(note, NN_FILE, NT_FILE, size, data);
+ fill_note(note, FILE, size, data);
return 0;
}
@@ -1713,8 +1716,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
regset_get(t->task, &view->regsets[0],
sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg);
- fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS,
- PRSTATUS_SIZE, &t->prstatus);
+ fill_note(&t->notes[0], PRSTATUS, PRSTATUS_SIZE, &t->prstatus);
info->size += notesize(&t->notes[0]);
do_thread_regset_writeback(t->task, &view->regsets[0]);
@@ -1727,6 +1729,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
for (view_iter = 1; view_iter < view->n; ++view_iter) {
const struct user_regset *regset = &view->regsets[view_iter];
int note_type = regset->core_note_type;
+ const char *note_name = regset->core_note_name;
bool is_fpreg = note_type == NT_PRFPREG;
void *data;
int ret;
@@ -1747,8 +1750,16 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
if (is_fpreg)
SET_PR_FPVALID(&t->prstatus);
- fill_note(&t->notes[note_iter], is_fpreg ? NN_PRFPREG : "LINUX",
- note_type, ret, data);
+ /* There should be a note name, but if not, guess: */
+ if (WARN_ON_ONCE(!note_name))
+ note_name = "LINUX";
+ else
+ /* Warn on non-legacy-compatible names, for now. */
+ WARN_ON_ONCE(strcmp(note_name,
+ is_fpreg ? "CORE" : "LINUX"));
+
+ __fill_note(&t->notes[note_iter], note_name, note_type,
+ ret, data);
info->size += notesize(&t->notes[note_iter]);
note_iter++;
@@ -1767,8 +1778,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
fill_prstatus(&t->prstatus.common, p, signr);
elf_core_copy_task_regs(p, &t->prstatus.pr_reg);
- fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, sizeof(t->prstatus),
- &(t->prstatus));
+ fill_note(&t->notes[0], PRSTATUS, sizeof(t->prstatus), &t->prstatus);
info->size += notesize(&t->notes[0]);
fpu = kzalloc(sizeof(elf_fpregset_t), GFP_KERNEL);
@@ -1778,7 +1788,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
}
t->prstatus.pr_fpvalid = 1;
- fill_note(&t->notes[1], NN_PRFPREG, NT_PRFPREG, sizeof(*fpu), fpu);
+ fill_note(&t->notes[1], PRFPREG, sizeof(*fpu), fpu);
info->size += notesize(&t->notes[1]);
return 1;
@@ -1798,7 +1808,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
if (!psinfo)
return 0;
- fill_note(&info->psinfo, NN_PRPSINFO, NT_PRPSINFO, sizeof(*psinfo), psinfo);
+ fill_note(&info->psinfo, PRPSINFO, sizeof(*psinfo), psinfo);
#ifdef CORE_DUMP_USE_REGSET
view = task_user_regset_view(dump_task);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 9133f3827f90..48fd2de3bca0 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -109,7 +109,7 @@ static int is_elf(struct elfhdr *hdr, struct file *file)
return 0;
if (!elf_check_arch(hdr))
return 0;
- if (!file->f_op->mmap)
+ if (!can_mmap_file(file))
return 0;
return 1;
}
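can_mmap_file() replaces the open-coded f_op->mmap tests here and in binfmt_elf.c above; presumably it accepts a file that implements either the legacy ->mmap hook or the newer ->mmap_prepare hook (compare the bfs conversion earlier in this diff). A hypothetical sketch of the predicate, not the actual VFS helper:

static inline bool can_mmap_file_sketch(const struct file *file)
{
	return file->f_op->mmap_prepare || file->f_op->mmap;
}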
@@ -1275,8 +1275,8 @@ static inline void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offs
return;
}
-static inline void fill_note(struct memelfnote *note, const char *name, int type,
- unsigned int sz, void *data)
+static inline void __fill_note(struct memelfnote *note, const char *name, int type,
+ unsigned int sz, void *data)
{
note->name = name;
note->type = type;
@@ -1285,6 +1285,9 @@ static inline void fill_note(struct memelfnote *note, const char *name, int type
return;
}
+#define fill_note(note, type, sz, data) \
+ __fill_note(note, NN_ ## type, NT_ ## type, sz, data)
+
/*
* fill up all the fields in prstatus from the given task struct, except
* registers which need to be filled up separately.
@@ -1398,8 +1401,7 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_
regset_get(p, &view->regsets[0],
sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg);
- fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, sizeof(t->prstatus),
- &t->prstatus);
+ fill_note(&t->notes[0], PRSTATUS, sizeof(t->prstatus), &t->prstatus);
t->num_notes++;
*sz += notesize(&t->notes[0]);
@@ -1416,8 +1418,7 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_
}
if (t->prstatus.pr_fpvalid) {
- fill_note(&t->notes[1], NN_PRFPREG, NT_PRFPREG, sizeof(t->fpu),
- &t->fpu);
+ fill_note(&t->notes[1], PRFPREG, sizeof(t->fpu), &t->fpu);
t->num_notes++;
*sz += notesize(&t->notes[1]);
}
@@ -1531,7 +1532,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
*/
fill_psinfo(psinfo, current->group_leader, current->mm);
- fill_note(&psinfo_note, NN_PRPSINFO, NT_PRPSINFO, sizeof(*psinfo), psinfo);
+ fill_note(&psinfo_note, PRPSINFO, sizeof(*psinfo), psinfo);
thread_status_size += notesize(&psinfo_note);
auxv = (elf_addr_t *) current->mm->saved_auxv;
@@ -1539,7 +1540,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
do
i += 2;
while (auxv[i - 2] != AT_NULL);
- fill_note(&auxv_note, NN_AUXV, NT_AUXV, i * sizeof(elf_addr_t), auxv);
+ fill_note(&auxv_note, AUXV, i * sizeof(elf_addr_t), auxv);
thread_status_size += notesize(&auxv_note);
offset = sizeof(*elf); /* ELF header */
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 432fbf4fc334..a839f960cd4a 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -675,44 +675,6 @@ static void bm_evict_inode(struct inode *inode)
}
/**
- * unlink_binfmt_dentry - remove the dentry for the binary type handler
- * @dentry: dentry associated with the binary type handler
- *
- * Do the actual filesystem work to remove a dentry for a registered binary
- * type handler. Since binfmt_misc only allows simple files to be created
- * directly under the root dentry of the filesystem we ensure that we are
- * indeed passed a dentry directly beneath the root dentry, that the inode
- * associated with the root dentry is locked, and that it is a regular file we
- * are asked to remove.
- */
-static void unlink_binfmt_dentry(struct dentry *dentry)
-{
- struct dentry *parent = dentry->d_parent;
- struct inode *inode, *parent_inode;
-
- /* All entries are immediate descendants of the root dentry. */
- if (WARN_ON_ONCE(dentry->d_sb->s_root != parent))
- return;
-
- /* We only expect to be called on regular files. */
- inode = d_inode(dentry);
- if (WARN_ON_ONCE(!S_ISREG(inode->i_mode)))
- return;
-
- /* The parent inode must be locked. */
- parent_inode = d_inode(parent);
- if (WARN_ON_ONCE(!inode_is_locked(parent_inode)))
- return;
-
- if (simple_positive(dentry)) {
- dget(dentry);
- simple_unlink(parent_inode, dentry);
- d_delete(dentry);
- dput(dentry);
- }
-}
-
-/**
* remove_binfmt_handler - remove a binary type handler
* @misc: handle to binfmt_misc instance
* @e: binary type handler to remove
@@ -729,7 +691,7 @@ static void remove_binfmt_handler(struct binfmt_misc *misc, Node *e)
write_lock(&misc->entries_lock);
list_del_init(&e->list);
write_unlock(&misc->entries_lock);
- unlink_binfmt_dentry(e->dentry);
+ locked_recursive_removal(e->dentry, NULL);
}
/* /<entry> */
@@ -772,7 +734,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
case 3:
/* Delete this handler. */
inode = d_inode(inode->i_sb->s_root);
- inode_lock(inode);
+ inode_lock_nested(inode, I_MUTEX_PARENT);
/*
* In order to add new element or remove elements from the list
@@ -922,7 +884,7 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer,
case 3:
/* Delete all handlers. */
inode = d_inode(file_inode(file)->i_sb->s_root);
- inode_lock(inode);
+ inode_lock_nested(inode, I_MUTEX_PARENT);
/*
* In order to add new element or remove elements from the list
diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c
index 08412532db1b..1e36a12b88f7 100644
--- a/fs/bpf_fs_kfuncs.c
+++ b/fs/bpf_fs_kfuncs.c
@@ -9,6 +9,7 @@
#include <linux/fs.h>
#include <linux/fsnotify.h>
#include <linux/file.h>
+#include <linux/kernfs.h>
#include <linux/mm.h>
#include <linux/xattr.h>
@@ -322,6 +323,39 @@ __bpf_kfunc int bpf_remove_dentry_xattr(struct dentry *dentry, const char *name_
return ret;
}
+#ifdef CONFIG_CGROUPS
+/**
+ * bpf_cgroup_read_xattr - read xattr of a cgroup's node in cgroupfs
+ * @cgroup: cgroup to get xattr from
+ * @name__str: name of the xattr
+ * @value_p: output buffer of the xattr value
+ *
+ * Get xattr *name__str* of *cgroup* and store the output in *value_p*.
+ *
+ * For security reasons, only *name__str* with prefix "user." is allowed.
+ *
+ * Return: length of the xattr value on success, a negative value on error.
+ */
+__bpf_kfunc int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__str,
+ struct bpf_dynptr *value_p)
+{
+ struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p;
+ u32 value_len;
+ void *value;
+
+ /* Only allow reading "user.*" xattrs */
+ if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+ return -EPERM;
+
+ value_len = __bpf_dynptr_size(value_ptr);
+ value = __bpf_dynptr_data_rw(value_ptr, value_len);
+ if (!value)
+ return -EINVAL;
+
+ return kernfs_xattr_get(cgroup->kn, name__str, value, value_len);
+}
+#endif /* CONFIG_CGROUPS */
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(bpf_fs_kfunc_set_ids)
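On the BPF side the kfunc is handed a dynptr wrapping the caller's output buffer, and the value length is taken from the dynptr's size. A hedged usage fragment ("user.myattr" and the cgrp pointer are assumptions, and the kfunc must be declared for the program type in question):

	char value[64];
	struct bpf_dynptr ptr;
	int len;

	bpf_dynptr_from_mem(value, sizeof(value), 0, &ptr);
	len = bpf_cgroup_read_xattr(cgrp, "user.myattr", &ptr);
	if (len < 0)
		return 0;	/* -EPERM for non-"user." names, -EINVAL for a bad dynptr */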
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index c352f3ae0385..ea95c90c8474 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -114,6 +114,8 @@ config BTRFS_EXPERIMENTAL
- extent tree v2 - complex rework of extent tracking
+ - large folio support
+
If unsure, say N.
config BTRFS_FS_REF_VERIFY
diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c
index e3716516ca38..861c7d92c437 100644
--- a/fs/btrfs/accessors.c
+++ b/fs/btrfs/accessors.c
@@ -9,27 +9,24 @@
#include "fs.h"
#include "accessors.h"
-static bool check_setget_bounds(const struct extent_buffer *eb,
- const void *ptr, unsigned off, int size)
+static void __cold report_setget_bounds(const struct extent_buffer *eb,
+ const void *ptr, unsigned off, int size)
{
- const unsigned long member_offset = (unsigned long)ptr + off;
+ unsigned long member_offset = (unsigned long)ptr + off;
- if (unlikely(member_offset + size > eb->len)) {
- btrfs_warn(eb->fs_info,
- "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d",
- (member_offset > eb->len ? "start" : "end"),
- (unsigned long)ptr, eb->start, member_offset, size);
- return false;
- }
-
- return true;
+ btrfs_warn(eb->fs_info,
+ "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d",
+ (member_offset > eb->len ? "start" : "end"),
+ (unsigned long)ptr, eb->start, member_offset, size);
}
-void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb)
+/* Copy bytes from @src1 and @src2 to @dest. */
+static __always_inline void memcpy_split_src(char *dest, const char *src1,
+ const char *src2, const size_t len1,
+ const size_t total)
{
- token->eb = eb;
- token->kaddr = folio_address(eb->folios[0]);
- token->offset = 0;
+ memcpy(dest, src1, len1);
+ memcpy(dest + len1, src2, total - len1);
}
/*
@@ -41,11 +38,6 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e
* - btrfs_set_8 (for 8/16/32/64)
* - btrfs_get_8 (for 8/16/32/64)
*
- * Generic helpers with a token (cached address of the most recently accessed
- * page):
- * - btrfs_set_token_8 (for 8/16/32/64)
- * - btrfs_get_token_8 (for 8/16/32/64)
- *
* The set/get functions handle data spanning two pages transparently, in case
* metadata block size is larger than page. Every pointer to metadata items is
* an offset into the extent buffer page array, cast to a specific type. This
@@ -57,118 +49,66 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e
*/
#define DEFINE_BTRFS_SETGET_BITS(bits) \
-u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
- const void *ptr, unsigned long off) \
-{ \
- const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
- const unsigned long oil = get_eb_offset_in_folio(token->eb, \
- member_offset);\
- const int unit_size = token->eb->folio_size; \
- const int unit_shift = token->eb->folio_shift; \
- const int size = sizeof(u##bits); \
- u8 lebytes[sizeof(u##bits)]; \
- const int part = unit_size - oil; \
- \
- ASSERT(token); \
- ASSERT(token->kaddr); \
- ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \
- if (token->offset <= member_offset && \
- member_offset + size <= token->offset + unit_size) { \
- return get_unaligned_le##bits(token->kaddr + oil); \
- } \
- token->kaddr = folio_address(token->eb->folios[idx]); \
- token->offset = idx << unit_shift; \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \
- return get_unaligned_le##bits(token->kaddr + oil); \
- \
- memcpy(lebytes, token->kaddr + oil, part); \
- token->kaddr = folio_address(token->eb->folios[idx + 1]); \
- token->offset = (idx + 1) << unit_shift; \
- memcpy(lebytes + part, token->kaddr, size - part); \
- return get_unaligned_le##bits(lebytes); \
-} \
u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
const void *ptr, unsigned long off) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
const unsigned long idx = get_eb_folio_index(eb, member_offset);\
- const unsigned long oil = get_eb_offset_in_folio(eb, \
- member_offset);\
- const int unit_size = eb->folio_size; \
- char *kaddr = folio_address(eb->folios[idx]); \
- const int size = sizeof(u##bits); \
- const int part = unit_size - oil; \
- u8 lebytes[sizeof(u##bits)]; \
- \
- ASSERT(check_setget_bounds(eb, ptr, off, size)); \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \
- return get_unaligned_le##bits(kaddr + oil); \
- \
- memcpy(lebytes, kaddr + oil, part); \
- kaddr = folio_address(eb->folios[idx + 1]); \
- memcpy(lebytes + part, kaddr, size - part); \
- return get_unaligned_le##bits(lebytes); \
-} \
-void btrfs_set_token_##bits(struct btrfs_map_token *token, \
- const void *ptr, unsigned long off, \
- u##bits val) \
-{ \
- const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
- const unsigned long oil = get_eb_offset_in_folio(token->eb, \
+ const unsigned long oif = get_eb_offset_in_folio(eb, \
member_offset);\
- const int unit_size = token->eb->folio_size; \
- const int unit_shift = token->eb->folio_shift; \
- const int size = sizeof(u##bits); \
+ char *kaddr = folio_address(eb->folios[idx]) + oif; \
+ const int part = eb->folio_size - oif; \
u8 lebytes[sizeof(u##bits)]; \
- const int part = unit_size - oil; \
\
- ASSERT(token); \
- ASSERT(token->kaddr); \
- ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \
- if (token->offset <= member_offset && \
- member_offset + size <= token->offset + unit_size) { \
- put_unaligned_le##bits(val, token->kaddr + oil); \
- return; \
+ if (unlikely(member_offset + sizeof(u##bits) > eb->len)) { \
+ report_setget_bounds(eb, ptr, off, sizeof(u##bits)); \
+ return 0; \
} \
- token->kaddr = folio_address(token->eb->folios[idx]); \
- token->offset = idx << unit_shift; \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || \
- oil + size <= unit_size) { \
- put_unaligned_le##bits(val, token->kaddr + oil); \
- return; \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || sizeof(u##bits) == 1 || \
+ likely(sizeof(u##bits) <= part)) \
+ return get_unaligned_le##bits(kaddr); \
+ \
+ if (sizeof(u##bits) == 2) { \
+ lebytes[0] = *kaddr; \
+ kaddr = folio_address(eb->folios[idx + 1]); \
+ lebytes[1] = *kaddr; \
+ } else { \
+ memcpy_split_src(lebytes, kaddr, \
+ folio_address(eb->folios[idx + 1]), \
+ part, sizeof(u##bits)); \
} \
- put_unaligned_le##bits(val, lebytes); \
- memcpy(token->kaddr + oil, lebytes, part); \
- token->kaddr = folio_address(token->eb->folios[idx + 1]); \
- token->offset = (idx + 1) << unit_shift; \
- memcpy(token->kaddr, lebytes + part, size - part); \
+ return get_unaligned_le##bits(lebytes); \
} \
void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
unsigned long off, u##bits val) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
const unsigned long idx = get_eb_folio_index(eb, member_offset);\
- const unsigned long oil = get_eb_offset_in_folio(eb, \
+ const unsigned long oif = get_eb_offset_in_folio(eb, \
member_offset);\
- const int unit_size = eb->folio_size; \
- char *kaddr = folio_address(eb->folios[idx]); \
- const int size = sizeof(u##bits); \
- const int part = unit_size - oil; \
+ char *kaddr = folio_address(eb->folios[idx]) + oif; \
+ const int part = eb->folio_size - oif; \
u8 lebytes[sizeof(u##bits)]; \
\
- ASSERT(check_setget_bounds(eb, ptr, off, size)); \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || \
- oil + size <= unit_size) { \
- put_unaligned_le##bits(val, kaddr + oil); \
+ if (unlikely(member_offset + sizeof(u##bits) > eb->len)) { \
+ report_setget_bounds(eb, ptr, off, sizeof(u##bits)); \
+ return; \
+ } \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || sizeof(u##bits) == 1 || \
+ likely(sizeof(u##bits) <= part)) { \
+ put_unaligned_le##bits(val, kaddr); \
return; \
} \
- \
put_unaligned_le##bits(val, lebytes); \
- memcpy(kaddr + oil, lebytes, part); \
- kaddr = folio_address(eb->folios[idx + 1]); \
- memcpy(kaddr, lebytes + part, size - part); \
+ if (sizeof(u##bits) == 2) { \
+ *kaddr = lebytes[0]; \
+ kaddr = folio_address(eb->folios[idx + 1]); \
+ *kaddr = lebytes[1]; \
+ } else { \
+ memcpy(kaddr, lebytes, part); \
+ kaddr = folio_address(eb->folios[idx + 1]); \
+ memcpy(kaddr, lebytes + part, sizeof(u##bits) - part); \
+ } \
}
DEFINE_BTRFS_SETGET_BITS(8)
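When a value straddles a folio boundary, the rewritten getters gather its bytes through a small stack buffer: the two-byte case copies each byte individually, while anything wider goes through memcpy_split_src(). Conceptually, for a u64 with "part" bytes left in the first folio:

	u8 lebytes[sizeof(u64)];

	memcpy_split_src(lebytes, kaddr,			/* tail of folio idx     */
			 folio_address(eb->folios[idx + 1]),	/* head of folio idx + 1 */
			 part, sizeof(u64));
	return get_unaligned_le64(lebytes);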
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index 15ea6348800b..99b3ced12805 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -16,14 +16,6 @@
struct extent_buffer;
-struct btrfs_map_token {
- struct extent_buffer *eb;
- char *kaddr;
- unsigned long offset;
-};
-
-void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb);
-
/*
* Some macros to generate set/get functions for the struct fields. This
* assumes there is a lefoo_to_cpu for every type, so lets make a simple one
@@ -56,11 +48,6 @@ static inline void put_unaligned_le8(u8 val, void *p)
sizeof_field(type, member)))
#define DECLARE_BTRFS_SETGET_BITS(bits) \
-u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
- const void *ptr, unsigned long off); \
-void btrfs_set_token_##bits(struct btrfs_map_token *token, \
- const void *ptr, unsigned long off, \
- u##bits val); \
u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
const void *ptr, unsigned long off); \
void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
@@ -83,18 +70,6 @@ static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \
{ \
static_assert(sizeof(u##bits) == sizeof_field(type, member)); \
btrfs_set_##bits(eb, s, offsetof(type, member), val); \
-} \
-static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \
- const type *s) \
-{ \
- static_assert(sizeof(u##bits) == sizeof_field(type, member)); \
- return btrfs_get_token_##bits(token, s, offsetof(type, member));\
-} \
-static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
- type *s, u##bits val) \
-{ \
- static_assert(sizeof(u##bits) == sizeof_field(type, member)); \
- btrfs_set_token_##bits(token, s, offsetof(type, member), val); \
}
#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
@@ -479,18 +454,6 @@ static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \
int slot, u32 val) \
{ \
btrfs_set_raw_item_##member(eb, btrfs_item_nr(eb, slot), val); \
-} \
-static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \
- int slot) \
-{ \
- struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \
- return btrfs_token_raw_item_##member(token, item); \
-} \
-static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \
- int slot, u32 val) \
-{ \
- struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \
- btrfs_set_token_raw_item_##member(token, item, val); \
}
BTRFS_ITEM_SETGET_FUNCS(offset)
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ed497f5f8d1b..6a450be293b1 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -733,7 +733,6 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx,
struct preftrees *preftrees,
struct share_check *sc)
{
- int err;
int ret = 0;
struct ulist *parents;
struct ulist_node *node;
@@ -752,6 +751,7 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx,
*/
while ((rnode = rb_first_cached(&preftrees->indirect.root))) {
struct prelim_ref *ref;
+ int ret2;
ref = rb_entry(rnode, struct prelim_ref, rbnode);
if (WARN(ref->parent,
@@ -773,18 +773,18 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx,
ret = BACKREF_FOUND_SHARED;
goto out;
}
- err = resolve_indirect_ref(ctx, path, preftrees, ref, parents);
+ ret2 = resolve_indirect_ref(ctx, path, preftrees, ref, parents);
/*
* we can only tolerate ENOENT, otherwise we should catch the error
* and return directly.
*/
- if (err == -ENOENT) {
+ if (ret2 == -ENOENT) {
prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref,
NULL);
continue;
- } else if (err) {
+ } else if (ret2) {
free_pref(ref);
- ret = err;
+ ret = ret2;
goto out;
}
@@ -2201,7 +2201,6 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
int ret;
u64 flags;
u64 size = 0;
- u32 item_size;
const struct extent_buffer *eb;
struct btrfs_extent_item *ei;
struct btrfs_key key;
@@ -2244,7 +2243,6 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
}
eb = path->nodes[0];
- item_size = btrfs_item_size(eb, path->slots[0]);
ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
flags = btrfs_extent_flags(eb, ei);
@@ -2252,7 +2250,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
btrfs_debug(fs_info,
"logical %llu is at position %llu within the extent (%llu EXTENT_ITEM %llu) flags %#llx size %u",
logical, logical - found_key->objectid, found_key->objectid,
- found_key->offset, flags, item_size);
+ found_key->offset, flags, btrfs_item_size(eb, path->slots[0]));
WARN_ON(!flags_ret);
if (flags_ret) {
@@ -2548,17 +2546,20 @@ static int build_ino_list(u64 inum, u64 offset, u64 num_bytes, u64 root, void *c
}
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
void *ctx, bool ignore_offset)
{
struct btrfs_backref_walk_ctx walk_ctx = { 0 };
int ret;
u64 flags = 0;
struct btrfs_key found_key;
- int search_commit_root = path->search_commit_root;
+ struct btrfs_path *path;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
- btrfs_release_path(path);
+ btrfs_free_path(path);
if (ret < 0)
return ret;
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
@@ -2571,8 +2572,7 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
walk_ctx.extent_item_pos = logical - found_key.objectid;
walk_ctx.fs_info = fs_info;
- return iterate_extent_inodes(&walk_ctx, search_commit_root,
- build_ino_list, ctx);
+ return iterate_extent_inodes(&walk_ctx, false, build_ino_list, ctx);
}
static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
@@ -3161,18 +3161,14 @@ void btrfs_backref_release_cache(struct btrfs_backref_cache *cache)
ASSERT(!cache->nr_edges);
}
-void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
- struct btrfs_backref_node *lower,
- struct btrfs_backref_node *upper,
- int link_which)
+static void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
+ struct btrfs_backref_node *lower,
+ struct btrfs_backref_node *upper)
{
ASSERT(upper && lower && upper->level == lower->level + 1);
edge->node[LOWER] = lower;
edge->node[UPPER] = upper;
- if (link_which & LINK_LOWER)
- list_add_tail(&edge->list[LOWER], &lower->upper);
- if (link_which & LINK_UPPER)
- list_add_tail(&edge->list[UPPER], &upper->lower);
+ list_add_tail(&edge->list[LOWER], &lower->upper);
}
/*
* Handle direct tree backref
@@ -3242,7 +3238,7 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache,
ASSERT(upper->checked);
INIT_LIST_HEAD(&edge->list[UPPER]);
}
- btrfs_backref_link_edge(edge, cur, upper, LINK_LOWER);
+ btrfs_backref_link_edge(edge, cur, upper);
return 0;
}
@@ -3412,7 +3408,7 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
if (!upper->owner)
upper->owner = btrfs_header_owner(eb);
}
- btrfs_backref_link_edge(edge, lower, upper, LINK_LOWER);
+ btrfs_backref_link_edge(edge, lower, upper);
if (rb_node) {
btrfs_put_root(root);
@@ -3570,7 +3566,7 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
ASSERT(start->checked);
- rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, &start->rb_node);
+ rb_node = rb_simple_insert(&cache->rb_root, &start->simple_node);
if (rb_node)
btrfs_backref_panic(cache->fs_info, start->bytenr, -EEXIST);
@@ -3621,8 +3617,7 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
return -EUCLEAN;
}
- rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr,
- &upper->rb_node);
+ rb_node = rb_simple_insert(&cache->rb_root, &upper->simple_node);
if (unlikely(rb_node)) {
btrfs_backref_panic(cache->fs_info, upper->bytenr, -EEXIST);
return -EUCLEAN;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 953637115956..34b0193a181c 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -226,8 +226,7 @@ int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx,
iterate_extent_inodes_t *iterate, void *user_ctx);
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
- struct btrfs_path *path, void *ctx,
- bool ignore_offset);
+ void *ctx, bool ignore_offset);
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
@@ -313,10 +312,15 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter);
* Represent a tree block in the backref cache
*/
struct btrfs_backref_node {
- struct {
- struct rb_node rb_node;
- u64 bytenr;
- }; /* Use rb_simple_node for search/insert */
+	union {
+ /* Use rb_simple_node for search/insert */
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ };
+
+ struct rb_simple_node simple_node;
+ };
/*
* This is a sanity check, whenever we COW a block we will update
@@ -423,13 +427,6 @@ struct btrfs_backref_node *btrfs_backref_alloc_node(
struct btrfs_backref_edge *btrfs_backref_alloc_edge(
struct btrfs_backref_cache *cache);
-#define LINK_LOWER (1U << 0)
-#define LINK_UPPER (1U << 1)
-
-void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
- struct btrfs_backref_node *lower,
- struct btrfs_backref_node *upper,
- int link_which);
void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
struct btrfs_backref_node *node);
void btrfs_backref_free_edge(struct btrfs_backref_cache *cache,
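The union introduced in struct btrfs_backref_node above is safe only because struct rb_simple_node mirrors the anonymous struct it aliases, which is what lets rb_simple_insert() take &node->simple_node directly in the backref.c hunks. Its assumed definition (it lives elsewhere in btrfs, e.g. misc.h):

struct rb_simple_node {
	struct rb_node rb_node;
	u64 bytenr;
};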
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index f7d8958b7327..50b5fc1c06d7 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -27,12 +27,12 @@ struct btrfs_failed_bio {
};
/* Is this a data path I/O that needs storage layer checksum and repair? */
-static inline bool is_data_bbio(struct btrfs_bio *bbio)
+static inline bool is_data_bbio(const struct btrfs_bio *bbio)
{
return bbio->inode && is_data_inode(bbio->inode);
}
-static bool bbio_has_ordered_extent(struct btrfs_bio *bbio)
+static bool bbio_has_ordered_extent(const struct btrfs_bio *bbio)
{
return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE;
}
@@ -134,14 +134,14 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
}
}
-static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
+static int next_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror)
{
if (cur_mirror == fbio->num_copies)
return cur_mirror + 1 - fbio->num_copies;
return cur_mirror + 1;
}
-static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
+static int prev_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror)
{
if (cur_mirror == 1)
return fbio->num_copies;
@@ -165,12 +165,6 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio);
int mirror = repair_bbio->mirror_num;
- /*
- * We can only trigger this for data bio, which doesn't support larger
- * folios yet.
- */
- ASSERT(folio_order(page_folio(bv->bv_page)) == 0);
-
if (repair_bbio->bio.bi_status ||
!btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) {
bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
@@ -301,7 +295,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de
btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}
-static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
+static void btrfs_log_dev_io_error(const struct bio *bio, struct btrfs_device *dev)
{
if (!dev || !dev->bdev)
return;
@@ -316,8 +310,8 @@ static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
}
-static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
- struct bio *bio)
+static struct workqueue_struct *btrfs_end_io_wq(const struct btrfs_fs_info *fs_info,
+ const struct bio *bio)
{
if (bio->bi_opf & REQ_META)
return fs_info->endio_meta_workers;
@@ -439,7 +433,7 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
ASSERT(btrfs_dev_is_sequential(dev, physical));
bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
}
- btrfs_debug_in_rcu(dev->fs_info,
+ btrfs_debug(dev->fs_info,
"%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
__func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
@@ -845,7 +839,7 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
goto out_bio_uninit;
}
- btrfs_info_rl_in_rcu(fs_info,
+ btrfs_info_rl(fs_info,
"read error corrected: ino %llu off %llu (dev %s sector %llu)",
ino, start, btrfs_dev_name(smap.dev),
smap.physical >> SECTOR_SHIFT);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 5b0cb04b2b93..9bf282d2453c 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -34,6 +34,19 @@ int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group
}
#endif
+static inline bool has_unwritten_metadata(struct btrfs_block_group *block_group)
+{
+ /* The meta_write_pointer is available only on the zoned setup. */
+ if (!btrfs_is_zoned(block_group->fs_info))
+ return false;
+
+ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
+ return false;
+
+ return block_group->start + block_group->alloc_offset >
+ block_group->meta_write_pointer;
+}
+
/*
* Return target flags in extended format or 0 if restripe for this chunk_type
* is not in progress
@@ -832,8 +845,8 @@ out:
static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg)
{
- btrfs_clear_extent_bits(&bg->fs_info->excluded_extents, bg->start,
- bg->start + bg->length - 1, EXTENT_DIRTY);
+ btrfs_clear_extent_bit(&bg->fs_info->excluded_extents, bg->start,
+ bg->start + bg->length - 1, EXTENT_DIRTY, NULL);
}
static noinline void caching_thread(struct btrfs_work *work)
@@ -877,7 +890,7 @@ static noinline void caching_thread(struct btrfs_work *work)
*/
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
!(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
- ret = load_free_space_tree(caching_ctl);
+ ret = btrfs_load_free_space_tree(caching_ctl);
else
ret = load_extent_tree_free(caching_ctl);
done:
@@ -1235,7 +1248,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* another task to attempt to create another block group with the same
* item key (and failing with -EEXIST and a transaction abort).
*/
- ret = remove_block_group_free_space(trans, block_group);
+ ret = btrfs_remove_block_group_free_space(trans, block_group);
if (ret)
goto out;
@@ -1244,6 +1257,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
goto out;
spin_lock(&block_group->lock);
+ /*
+ * Hitting this WARN means we removed a block group with an unwritten
+ * region. It will cause "unable to find chunk map for logical" errors.
+ */
+ if (WARN_ON(has_unwritten_metadata(block_group)))
+ btrfs_warn(fs_info,
+ "block group %llu is removed before metadata write out",
+ block_group->start);
+
set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);
/*
@@ -1403,7 +1425,7 @@ out:
if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
btrfs_info(cache->fs_info,
"unable to make block group %llu ro", cache->start);
- btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
+ btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, false);
}
return ret;
}
@@ -1436,14 +1458,14 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
*/
mutex_lock(&fs_info->unused_bg_unpin_mutex);
if (prev_trans) {
- ret = btrfs_clear_extent_bits(&prev_trans->pinned_extents, start, end,
- EXTENT_DIRTY);
+ ret = btrfs_clear_extent_bit(&prev_trans->pinned_extents, start, end,
+ EXTENT_DIRTY, NULL);
if (ret)
goto out;
}
- ret = btrfs_clear_extent_bits(&trans->transaction->pinned_extents, start, end,
- EXTENT_DIRTY);
+ ret = btrfs_clear_extent_bit(&trans->transaction->pinned_extents, start, end,
+ EXTENT_DIRTY, NULL);
out:
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
if (prev_trans)
@@ -1586,8 +1608,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* needing to allocate extents from the block group.
*/
used = btrfs_space_info_used(space_info, true);
- if (space_info->total_bytes - block_group->length < used &&
- block_group->zone_unusable < block_group->length) {
+ if ((space_info->total_bytes - block_group->length < used &&
+ block_group->zone_unusable < block_group->length) ||
+ has_unwritten_metadata(block_group)) {
/*
* Add a reference for the list, compensate for the ref
* drop under the "next" label for the
@@ -1616,8 +1639,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
ret = btrfs_zone_finish(block_group);
if (ret < 0) {
btrfs_dec_block_group_ro(block_group);
- if (ret == -EAGAIN)
+ if (ret == -EAGAIN) {
+ btrfs_link_bg_list(block_group, &retry_list);
ret = 0;
+ }
goto next;
}
@@ -1843,7 +1868,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
*/
list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
while (!list_empty(&fs_info->reclaim_bgs)) {
- u64 zone_unusable;
u64 used;
u64 reserved;
int ret = 0;
@@ -1910,16 +1934,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
goto next;
}
- /*
- * Cache the zone_unusable value before turning the block group
- * to read only. As soon as the block group is read only it's
- * zone_unusable value gets moved to the block group's read-only
- * bytes and isn't available for calculations anymore. We also
- * cache it before unlocking the block group, to prevent races
- * (reports from KCSAN and such tools) with tasks updating it.
- */
- zone_unusable = bg->zone_unusable;
-
spin_unlock(&bg->lock);
spin_unlock(&space_info->lock);
@@ -1963,14 +1977,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
reserved = bg->reserved;
spin_unlock(&bg->lock);
- btrfs_info(fs_info,
- "reclaiming chunk %llu with %llu%% used %llu%% reserved %llu%% unusable",
- bg->start,
- div64_u64(used * 100, bg->length),
- div64_u64(reserved * 100, bg->length),
- div64_u64(zone_unusable * 100, bg->length));
trace_btrfs_reclaim_block_group(bg);
- ret = btrfs_relocate_chunk(fs_info, bg->start);
+ ret = btrfs_relocate_chunk(fs_info, bg->start, false);
if (ret) {
btrfs_dec_block_group_ro(bg);
btrfs_err(fs_info, "error relocating chunk %llu",
@@ -2372,7 +2380,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
cache->space_info = btrfs_find_space_info(info, cache->flags);
- set_free_space_tree_thresholds(cache);
+ btrfs_set_free_space_tree_thresholds(cache);
if (need_clear) {
/*
@@ -2791,7 +2799,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
block_group->length);
if (ret)
btrfs_abort_transaction(trans, ret);
- add_block_group_free_space(trans, block_group);
+ btrfs_add_block_group_free_space(trans, block_group);
/*
* If we restriped during balance, we may have added a new raid
@@ -2889,7 +2897,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags);
cache->length = size;
- set_free_space_tree_thresholds(cache);
+ btrfs_set_free_space_tree_thresholds(cache);
cache->flags = type;
cache->cached = BTRFS_CACHE_FINISHED;
cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
@@ -3636,9 +3644,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
ret = update_block_group_item(trans, path, cache);
- }
- if (ret)
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ } else if (ret) {
btrfs_abort_transaction(trans, ret);
+ }
}
/* If its not on the io list, we need to put the block group */
@@ -4298,7 +4308,7 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans,
if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
left, bytes, type);
- btrfs_dump_space_info(fs_info, info, 0, 0);
+ btrfs_dump_space_info(fs_info, info, 0, false);
}
if (left < bytes) {
@@ -4443,7 +4453,7 @@ static void check_removing_space_info(struct btrfs_space_info *space_info)
* indicates a real bug if this happens.
*/
if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_may_use > 0))
- btrfs_dump_space_info(info, space_info, 0, 0);
+ btrfs_dump_space_info(info, space_info, 0, false);
/*
* If there was a failure to cleanup a log tree, very likely due to an
@@ -4454,7 +4464,7 @@ static void check_removing_space_info(struct btrfs_space_info *space_info)
if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
!BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
if (WARN_ON(space_info->bytes_reserved > 0))
- btrfs_dump_space_info(info, space_info, 0, 0);
+ btrfs_dump_space_info(info, space_info, 0, false);
}
WARN_ON(space_info->reclaim_size > 0);
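Both new has_unwritten_metadata() call sites in this file keep a zoned metadata block group alive while it still holds allocated-but-unwritten extents; removing it earlier would later trigger the "unable to find chunk map for logical" errors mentioned in the WARN comment above. The invariant reduces to a single comparison:

	/* Zoned-only: everything allocated must be behind the write pointer. */
	bool unwritten = bg->start + bg->alloc_offset > bg->meta_write_pointer;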
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 9de356bcb411..a8bb8429c966 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -83,6 +83,8 @@ enum btrfs_block_group_flags {
BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
/* Does the block group need to be added to the free space tree? */
BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
+ /* Set after we add a new block group to the free space tree. */
+ BLOCK_GROUP_FLAG_FREE_SPACE_ADDED,
/* Indicate that the block group is placed on a sequential zone */
BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE,
/*
@@ -244,6 +246,11 @@ struct btrfs_block_group {
/* Lock for free space tree operations. */
struct mutex free_space_lock;
+ /* Protected by @free_space_lock. */
+ bool using_free_space_bitmaps;
+ /* Protected by @free_space_lock. */
+ bool using_free_space_bitmaps_cached;
+
/*
* Number of extents in this block group used for swap files.
* All accesses protected by the spinlock 'lock'.
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index a79fa0726f1d..b99fb0273292 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -525,6 +525,19 @@ static inline void btrfs_update_inode_mapping_flags(struct btrfs_inode *inode)
mapping_set_stable_writes(inode->vfs_inode.i_mapping);
}
+static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode)
+{
+ /* Metadata inode should not reach here. */
+ ASSERT(is_data_inode(inode));
+
+ /* We only allow BITS_PER_LONG blocks for each bitmap. */
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ mapping_set_folio_order_range(inode->vfs_inode.i_mapping, 0,
+ ilog2(((BITS_PER_LONG << inode->root->fs_info->sectorsize_bits)
+ >> PAGE_SHIFT)));
+#endif
+}
+
/* Array of bytes with variable length, hexadecimal format 0x1234 */
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 48d07939fee4..d09d622016ef 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -282,8 +282,8 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb)
{
struct inode *inode = &cb->bbio.inode->vfs_inode;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- unsigned long index = cb->start >> PAGE_SHIFT;
- unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
+ pgoff_t index = cb->start >> PAGE_SHIFT;
+ const pgoff_t end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
struct folio_batch fbatch;
int i;
int ret;
@@ -415,7 +415,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
int *memstall, unsigned long *pflags)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- unsigned long end_index;
+ pgoff_t end_index;
struct bio *orig_bio = &cb->orig_bbio->bio;
u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size;
u64 isize = i_size_read(inode);
@@ -446,8 +446,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
while (cur < compressed_end) {
- u64 page_end;
- u64 pg_index = cur >> PAGE_SHIFT;
+ pgoff_t page_end;
+ pgoff_t pg_index = cur >> PAGE_SHIFT;
u32 add_size;
if (pg_index > end_index)
@@ -789,8 +789,8 @@ static void btrfs_init_workspace_manager(int type)
*/
workspace = alloc_workspace(type, 0);
if (IS_ERR(workspace)) {
- pr_warn(
- "BTRFS: cannot preallocate compression workspace, will try later\n");
+ btrfs_warn(NULL,
+ "cannot preallocate compression workspace, will try later");
} else {
atomic_set(&wsm->total_ws, 1);
wsm->free_ws = 1;
@@ -888,9 +888,9 @@ again:
/* once per minute */ 60 * HZ,
/* no burst */ 1);
- if (__ratelimit(&_rs)) {
- pr_warn("BTRFS: no compression workspaces, low memory, retrying\n");
- }
+ if (__ratelimit(&_rs))
+ btrfs_warn(NULL,
+ "no compression workspaces, low memory, retrying");
}
goto again;
}
@@ -975,7 +975,7 @@ static int btrfs_compress_set_level(unsigned int type, int level)
if (level == 0)
level = ops->default_level;
else
- level = min(max(level, ops->min_level), ops->max_level);
+ level = clamp(level, ops->min_level, ops->max_level);
return level;
}
@@ -1482,7 +1482,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
struct heuristic_ws *ws)
{
struct page *page;
- u64 index, index_end;
+ pgoff_t index, index_end;
u32 i, curr_sample_pos;
u8 *in_data;
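The clamp() conversion in the earlier hunk is behavior-preserving; for any input level the two spellings agree:

	level = clamp(level, ops->min_level, ops->max_level);
	/* == min(max(level, ops->min_level), ops->max_level) */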
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index d34c4341eaf4..1b38e707bbd9 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -13,6 +13,7 @@
#include <linux/wait.h>
#include <linux/pagemap.h>
#include "bio.h"
+#include "fs.h"
#include "messages.h"
struct address_space;
@@ -77,12 +78,10 @@ struct compressed_bio {
/* @range_end must be exclusive. */
static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u64 cur)
{
- const u64 folio_end = folio_pos(folio) + folio_size(folio);
-
/* @cur must be inside the folio. */
ASSERT(folio_pos(folio) <= cur);
- ASSERT(cur < folio_end);
- return min(range_end, folio_end) - cur;
+ ASSERT(cur < folio_end(folio));
+ return min(range_end, folio_end(folio)) - cur;
}
int __init btrfs_init_compress(void);
@@ -114,6 +113,8 @@ enum btrfs_compression_type {
BTRFS_COMPRESS_LZO = 2,
BTRFS_COMPRESS_ZSTD = 3,
BTRFS_NR_COMPRESS_TYPES = 4,
+
+ BTRFS_DEFRAG_DONT_COMPRESS,
};
struct workspace_manager {
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index a2e7979372cc..74e6d7f3d266 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -198,7 +198,7 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
* the inc_not_zero dance and if it doesn't work then
* synchronize_rcu and try again.
*/
- if (atomic_inc_not_zero(&eb->refs)) {
+ if (refcount_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
break;
}
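eb->refs moves from atomic_t to refcount_t throughout this file; in the RCU lookup above, refcount_inc_not_zero() is what prevents a reader from resurrecting a buffer whose last reference was already dropped. The pattern, sketched without the synchronize_rcu retry loop the surrounding code wraps around it:

	rcu_read_lock();
	eb = rcu_dereference(root->node);
	if (!refcount_inc_not_zero(&eb->refs))
		eb = NULL;	/* lost the race with the final put */
	rcu_read_unlock();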
@@ -283,15 +283,26 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
- WARN_ON(btrfs_header_generation(buf) > trans->transid);
- if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+ if (unlikely(btrfs_header_generation(buf) > trans->transid)) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
ret = btrfs_inc_ref(trans, root, cow, 1);
- else
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ } else {
ret = btrfs_inc_ref(trans, root, cow, 0);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
if (ret) {
btrfs_tree_unlock(cow);
free_extent_buffer(cow);
- btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -303,9 +314,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
/*
* check if the tree block can be shared by multiple trees
*/
-bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf)
+bool btrfs_block_can_be_shared(const struct btrfs_trans_handle *trans,
+ const struct btrfs_root *root,
+ const struct extent_buffer *buf)
{
const u64 buf_gen = btrfs_header_generation(buf);
@@ -549,7 +560,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
- atomic_inc(&cow->refs);
+ refcount_inc(&cow->refs);
rcu_assign_pointer(root->node, cow);
ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
@@ -602,9 +613,9 @@ error_unlock_cow:
return ret;
}
-static inline int should_cow_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf)
+static inline int should_cow_block(const struct btrfs_trans_handle *trans,
+ const struct btrfs_root *root,
+ const struct extent_buffer *buf)
{
if (btrfs_is_testing(root->fs_info))
return 0;
@@ -724,7 +735,7 @@ int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_ke
* Slot may point to the total number of items (i.e. one position beyond the last
* key) if the key is bigger than the last key in the extent buffer.
*/
-int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
+int btrfs_bin_search(const struct extent_buffer *eb, int first_slot,
const struct btrfs_key *key, int *slot)
{
unsigned long p;
@@ -1081,7 +1092,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* update the path */
if (left) {
if (btrfs_header_nritems(left) > orig_slot) {
- atomic_inc(&left->refs);
+ refcount_inc(&left->refs);
/* left was locked after cow */
path->nodes[level] = left;
path->slots[level + 1] -= 1;
@@ -1268,7 +1279,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
* to the block in 'slot', and triggering ra on them.
*/
static void reada_for_search(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
+ const struct btrfs_path *path,
int level, int slot, u64 objectid)
{
struct extent_buffer *node;
@@ -1350,7 +1361,7 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
}
}
-static noinline void reada_for_balance(struct btrfs_path *path, int level)
+static noinline void reada_for_balance(const struct btrfs_path *path, int level)
{
struct extent_buffer *parent;
int slot;
@@ -1446,8 +1457,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
u64 blocknr;
struct extent_buffer *tmp = NULL;
int ret = 0;
+ int ret2;
int parent_level;
- int err;
bool read_tmp = false;
bool tmp_locked = false;
bool path_released = false;
@@ -1505,9 +1516,9 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
}
/* Now we're allowed to do a blocking uptodate check. */
- err = btrfs_read_extent_buffer(tmp, &check);
- if (err) {
- ret = err;
+ ret2 = btrfs_read_extent_buffer(tmp, &check);
+ if (ret2) {
+ ret = ret2;
goto out;
}
@@ -1548,9 +1559,9 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
}
/* Now we're allowed to do a blocking uptodate check. */
- err = btrfs_read_extent_buffer(tmp, &check);
- if (err) {
- ret = err;
+ ret2 = btrfs_read_extent_buffer(tmp, &check);
+ if (ret2) {
+ ret = ret2;
goto out;
}
@@ -1685,7 +1696,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
if (p->search_commit_root) {
b = root->commit_root;
- atomic_inc(&b->refs);
+ refcount_inc(&b->refs);
level = btrfs_header_level(b);
/*
* Ensure that all callers have set skip_locking when
@@ -1794,7 +1805,7 @@ static int finish_need_commit_sem_search(struct btrfs_path *path)
return 0;
}
-static inline int search_for_key_slot(struct extent_buffer *eb,
+static inline int search_for_key_slot(const struct extent_buffer *eb,
int search_low_slot,
const struct btrfs_key *key,
int prev_cmp,
@@ -1928,15 +1939,14 @@ static int search_leaf(struct btrfs_trans_handle *trans,
ASSERT(leaf_free_space >= 0);
if (leaf_free_space < ins_len) {
- int err;
-
- err = split_leaf(trans, root, key, path, ins_len,
- (ret == 0));
- ASSERT(err <= 0);
- if (WARN_ON(err > 0))
- err = -EUCLEAN;
- if (err)
- ret = err;
+ int ret2;
+
+ ret2 = split_leaf(trans, root, key, path, ins_len, (ret == 0));
+ ASSERT(ret2 <= 0);
+ if (WARN_ON(ret2 > 0))
+ ret2 = -EUCLEAN;
+ if (ret2)
+ ret = ret2;
}
}
@@ -1982,7 +1992,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *b;
int slot;
int ret;
- int err;
int level;
int lowest_unlock = 1;
/* everything at write_lock_level or lower must be write locked */
@@ -2053,6 +2062,7 @@ again:
while (b) {
int dec = 0;
+ int ret2;
level = btrfs_header_level(b);
@@ -2081,16 +2091,15 @@ again:
}
if (last_level)
- err = btrfs_cow_block(trans, root, b, NULL, 0,
- &b,
- BTRFS_NESTING_COW);
+ ret2 = btrfs_cow_block(trans, root, b, NULL, 0,
+ &b, BTRFS_NESTING_COW);
else
- err = btrfs_cow_block(trans, root, b,
- p->nodes[level + 1],
- p->slots[level + 1], &b,
- BTRFS_NESTING_COW);
- if (err) {
- ret = err;
+ ret2 = btrfs_cow_block(trans, root, b,
+ p->nodes[level + 1],
+ p->slots[level + 1], &b,
+ BTRFS_NESTING_COW);
+ if (ret2) {
+ ret = ret2;
goto done;
}
}
@@ -2138,12 +2147,12 @@ cow_done:
slot--;
}
p->slots[level] = slot;
- err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
- &write_lock_level);
- if (err == -EAGAIN)
+ ret2 = setup_nodes_for_search(trans, root, p, b, level, ins_len,
+ &write_lock_level);
+ if (ret2 == -EAGAIN)
goto again;
- if (err) {
- ret = err;
+ if (ret2) {
+ ret = ret2;
goto done;
}
b = p->nodes[level];
@@ -2169,11 +2178,11 @@ cow_done:
goto done;
}
- err = read_block_for_search(root, p, &b, slot, key);
- if (err == -EAGAIN && !p->nowait)
+ ret2 = read_block_for_search(root, p, &b, slot, key);
+ if (ret2 == -EAGAIN && !p->nowait)
goto again;
- if (err) {
- ret = err;
+ if (ret2) {
+ ret = ret2;
goto done;
}
@@ -2236,7 +2245,6 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
struct extent_buffer *b;
int slot;
int ret;
- int err;
int level;
int lowest_unlock = 1;
u8 lowest_level = 0;
@@ -2261,6 +2269,7 @@ again:
while (b) {
int dec = 0;
+ int ret2;
level = btrfs_header_level(b);
p->nodes[level] = b;
@@ -2296,11 +2305,11 @@ again:
goto done;
}
- err = read_block_for_search(root, p, &b, slot, key);
- if (err == -EAGAIN && !p->nowait)
+ ret2 = read_block_for_search(root, p, &b, slot, key);
+ if (ret2 == -EAGAIN && !p->nowait)
goto again;
- if (err) {
- ret = err;
+ if (ret2) {
+ ret = ret2;
goto done;
}
@@ -2872,6 +2881,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
if (ret < 0) {
int ret2;
+ btrfs_clear_buffer_dirty(trans, c);
ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
if (ret2 < 0)
btrfs_abort_transaction(trans, ret2);
@@ -2885,7 +2895,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
free_extent_buffer(old);
add_root_to_dirty_list(root);
- atomic_inc(&c->refs);
+ refcount_inc(&c->refs);
path->nodes[level] = c;
path->locks[level] = BTRFS_WRITE_LOCK;
path->slots[level] = 0;
@@ -3100,7 +3110,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = right->fs_info;
struct extent_buffer *left = path->nodes[0];
struct extent_buffer *upper = path->nodes[1];
- struct btrfs_map_token token;
struct btrfs_disk_key disk_key;
int slot;
u32 i;
@@ -3174,13 +3183,12 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
copy_leaf_items(right, left, 0, left_nritems - push_items, push_items);
/* update the item pointers */
- btrfs_init_map_token(&token, right);
right_nritems += push_items;
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
for (i = 0; i < right_nritems; i++) {
- push_space -= btrfs_token_item_size(&token, i);
- btrfs_set_token_item_offset(&token, i, push_space);
+ push_space -= btrfs_item_size(right, i);
+ btrfs_set_item_offset(right, i, push_space);
}
left_nritems -= push_items;
@@ -3323,7 +3331,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
int ret = 0;
u32 this_item_size;
u32 old_left_item_size;
- struct btrfs_map_token token;
if (empty)
nr = min(right_nritems, max_slot);
@@ -3371,13 +3378,12 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
old_left_nritems = btrfs_header_nritems(left);
BUG_ON(old_left_nritems <= 0);
- btrfs_init_map_token(&token, left);
old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
u32 ioff;
- ioff = btrfs_token_item_offset(&token, i);
- btrfs_set_token_item_offset(&token, i,
+ ioff = btrfs_item_offset(left, i);
+ btrfs_set_item_offset(left, i,
ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
}
btrfs_set_header_nritems(left, old_left_nritems + push_items);
@@ -3398,13 +3404,12 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
btrfs_header_nritems(right) - push_items);
}
- btrfs_init_map_token(&token, right);
right_nritems -= push_items;
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
for (i = 0; i < right_nritems; i++) {
- push_space = push_space - btrfs_token_item_size(&token, i);
- btrfs_set_token_item_offset(&token, i, push_space);
+ push_space = push_space - btrfs_item_size(right, i);
+ btrfs_set_item_offset(right, i, push_space);
}
btrfs_mark_buffer_dirty(trans, left);
@@ -3518,7 +3523,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
int i;
int ret;
struct btrfs_disk_key disk_key;
- struct btrfs_map_token token;
nritems = nritems - mid;
btrfs_set_header_nritems(right, nritems);
@@ -3531,12 +3535,11 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
- btrfs_init_map_token(&token, right);
for (i = 0; i < nritems; i++) {
u32 ioff;
- ioff = btrfs_token_item_offset(&token, i);
- btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
+ ioff = btrfs_item_offset(right, i);
+ btrfs_set_item_offset(right, i, ioff + rt_data_off);
}
btrfs_set_header_nritems(l, mid);
@@ -4002,7 +4005,6 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans,
unsigned int old_size;
unsigned int size_diff;
int i;
- struct btrfs_map_token token;
leaf = path->nodes[0];
slot = path->slots[0];
@@ -4025,12 +4027,11 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans,
* item0..itemN ... dataN.offset..dataN.size .. data0.size
*/
/* first correct the data pointers */
- btrfs_init_map_token(&token, leaf);
for (i = slot; i < nritems; i++) {
u32 ioff;
- ioff = btrfs_token_item_offset(&token, i);
- btrfs_set_token_item_offset(&token, i, ioff + size_diff);
+ ioff = btrfs_item_offset(leaf, i);
+ btrfs_set_item_offset(leaf, i, ioff + size_diff);
}
/* shift the data */
@@ -4093,7 +4094,6 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
unsigned int old_data;
unsigned int old_size;
int i;
- struct btrfs_map_token token;
leaf = path->nodes[0];
@@ -4119,12 +4119,11 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
* item0..itemN ... dataN.offset..dataN.size .. data0.size
*/
/* first correct the data pointers */
- btrfs_init_map_token(&token, leaf);
for (i = slot; i < nritems; i++) {
u32 ioff;
- ioff = btrfs_token_item_offset(&token, i);
- btrfs_set_token_item_offset(&token, i, ioff - data_size);
+ ioff = btrfs_item_offset(leaf, i);
+ btrfs_set_item_offset(leaf, i, ioff - data_size);
}
/* shift the data */
@@ -4164,7 +4163,6 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
struct extent_buffer *leaf;
int slot;
- struct btrfs_map_token token;
u32 total_size;
/*
@@ -4192,7 +4190,6 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans,
BUG();
}
- btrfs_init_map_token(&token, leaf);
if (slot != nritems) {
unsigned int old_data = btrfs_item_data_end(leaf, slot);
@@ -4210,8 +4207,8 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans,
for (i = slot; i < nritems; i++) {
u32 ioff;
- ioff = btrfs_token_item_offset(&token, i);
- btrfs_set_token_item_offset(&token, i,
+ ioff = btrfs_item_offset(leaf, i);
+ btrfs_set_item_offset(leaf, i,
ioff - batch->total_data_size);
}
/* shift the items */
@@ -4228,8 +4225,8 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans,
btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
btrfs_set_item_key(leaf, &disk_key, slot + i);
data_end -= batch->data_sizes[i];
- btrfs_set_token_item_offset(&token, slot + i, data_end);
- btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
+ btrfs_set_item_offset(leaf, slot + i, data_end);
+ btrfs_set_item_size(leaf, slot + i, batch->data_sizes[i]);
}
btrfs_set_header_nritems(leaf, nritems + batch->nr);
@@ -4442,7 +4439,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
root_sub_used_bytes(root);
- atomic_inc(&leaf->refs);
+ refcount_inc(&leaf->refs);
ret = btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
free_extent_buffer_stale(leaf);
if (ret < 0)
@@ -4469,7 +4466,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (slot + nr != nritems) {
const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
const int data_end = leaf_data_end(leaf);
- struct btrfs_map_token token;
u32 dsize = 0;
int i;
@@ -4479,12 +4475,11 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
memmove_leaf_data(leaf, data_end + dsize, data_end,
last_off - data_end);
- btrfs_init_map_token(&token, leaf);
for (i = slot + nr; i < nritems; i++) {
u32 ioff;
- ioff = btrfs_token_item_offset(&token, i);
- btrfs_set_token_item_offset(&token, i, ioff + dsize);
+ ioff = btrfs_item_offset(leaf, i);
+ btrfs_set_item_offset(leaf, i, ioff + dsize);
}
memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr);
@@ -4527,7 +4522,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
* for possible call to btrfs_del_ptr below
*/
slot = path->slots[1];
- atomic_inc(&leaf->refs);
+ refcount_inc(&leaf->refs);
/*
* We want to be able to at least push one item to the
* left neighbour leaf, and that's the first item.
@@ -4585,16 +4580,13 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
/*
* A helper function to walk down the tree starting at min_key, and looking
- * for nodes or leaves that are have a minimum transaction id.
+ * for leaves that have a minimum transaction id.
* This is used by the btree defrag code, and tree logging
*
* This does not cow, but it does stuff the starting key it finds back
* into min_key, so you can call btrfs_search_slot with cow=1 on the
* key and get a writable path.
*
- * This honors path->lowest_level to prevent descent past a given level
- * of the tree.
- *
* min_trans indicates the oldest transaction that you are interested
* in walking through. Any nodes or leaves older than min_trans are
* skipped over (without reading them).
@@ -4615,6 +4607,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
int keep_locks = path->keep_locks;
ASSERT(!path->nowait);
+ ASSERT(path->lowest_level == 0);
path->keep_locks = 1;
again:
cur = btrfs_read_lock_root_node(root);
@@ -4636,8 +4629,8 @@ again:
goto out;
}
- /* at the lowest level, we're done, setup the path and exit */
- if (level == path->lowest_level) {
+ /* At level 0 we're done, setup the path and exit. */
+ if (level == 0) {
if (slot >= nritems)
goto find_next_key;
ret = 0;
@@ -4678,12 +4671,6 @@ find_next_key:
goto out;
}
}
- if (level == path->lowest_level) {
- ret = 0;
- /* Save our key for returning back. */
- btrfs_node_key_to_cpu(cur, min_key, slot);
- goto out;
- }
cur = btrfs_read_node_slot(cur, slot);
if (IS_ERR(cur)) {
ret = PTR_ERR(cur);
@@ -4699,7 +4686,7 @@ find_next_key:
out:
path->keep_locks = keep_locks;
if (ret == 0)
- btrfs_unlock_up_safe(path, path->lowest_level + 1);
+ btrfs_unlock_up_safe(path, 1);
return ret;
}
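
The ctree.c hunks above repeatedly drop struct btrfs_map_token in favor of the plain item accessors, substituting btrfs_item_offset()/btrfs_set_item_offset() one-for-one for the token variants. A minimal sketch of the resulting pattern, distilled from those hunks (demo_shift_item_offsets is a hypothetical name, not btrfs code):

/* Shift the data offsets of items [slot, nritems) by @diff, without a
 * map token; same on-disk effect as the removed token-based loops. */
static void demo_shift_item_offsets(struct extent_buffer *leaf, int slot,
				    u32 nritems, u32 diff)
{
	for (u32 i = slot; i < nritems; i++)
		btrfs_set_item_offset(leaf, i,
				      btrfs_item_offset(leaf, i) + diff);
}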
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 71fa42ca04fe..fe70b593c7cd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -224,16 +224,10 @@ struct btrfs_root {
struct list_head root_list;
- /*
- * Xarray that keeps track of in-memory inodes, protected by the lock
- * @inode_lock.
- */
+ /* Xarray that keeps track of in-memory inodes. */
struct xarray inodes;
- /*
- * Xarray that keeps track of delayed nodes of every inode, protected
- * by @inode_lock.
- */
+ /* Xarray that keeps track of delayed nodes of every inode. */
struct xarray delayed_nodes;
/*
* right now this just gets used so that a root has its own devid
@@ -508,7 +502,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
int __init btrfs_ctree_init(void);
void __cold btrfs_ctree_exit(void);
-int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
+int btrfs_bin_search(const struct extent_buffer *eb, int first_slot,
const struct btrfs_key *key, int *slot);
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2);
@@ -576,9 +570,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
struct extent_buffer **cow_ret, u64 new_root_objectid);
-bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf);
+bool btrfs_block_can_be_shared(const struct btrfs_trans_handle *trans,
+ const struct btrfs_root *root,
+ const struct extent_buffer *buf);
int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int level, int slot);
void btrfs_extend_item(struct btrfs_trans_handle *trans,
@@ -727,13 +721,18 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
}
int btrfs_leaf_free_space(const struct extent_buffer *leaf);
-static inline int is_fstree(u64 rootid)
+static inline bool btrfs_is_fstree(u64 rootid)
{
- if (rootid == BTRFS_FS_TREE_OBJECTID ||
- ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID &&
- !btrfs_qgroup_level(rootid)))
- return 1;
- return 0;
+ if (rootid == BTRFS_FS_TREE_OBJECTID)
+ return true;
+
+ if ((s64)rootid < (s64)BTRFS_FIRST_FREE_OBJECTID)
+ return false;
+
+ if (btrfs_qgroup_level(rootid) != 0)
+ return false;
+
+ return true;
}
static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
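
is_fstree() becomes btrfs_is_fstree() with a bool return, and the single compound condition unfolds into three early returns: the default fs tree qualifies, anything below BTRFS_FIRST_FREE_OBJECTID (as s64) does not, and subvolume ids qualify only at qgroup level zero. A hedged usage sketch mirroring the call sites converted later in this diff (demo_root_is_user_visible is hypothetical):

static bool demo_root_is_user_visible(const struct btrfs_root *root)
{
	/* Mirrors the converted call sites in disk-io.c further below. */
	return btrfs_is_fstree(btrfs_root_id(root)) &&
	       btrfs_root_refs(&root->root_item) > 0;
}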
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 1831618579cb..738179a5e170 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -60,6 +60,14 @@ static int compare_inode_defrag(const struct inode_defrag *defrag1,
return 0;
}
+static int inode_defrag_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ const struct inode_defrag *new_defrag = rb_entry(new, struct inode_defrag, rb_node);
+ const struct inode_defrag *existing_defrag = rb_entry(existing, struct inode_defrag, rb_node);
+
+ return compare_inode_defrag(new_defrag, existing_defrag);
+}
+
/*
* Insert a record for an inode into the defrag tree. The lock must be held
* already.
@@ -71,37 +79,23 @@ static int btrfs_insert_inode_defrag(struct btrfs_inode *inode,
struct inode_defrag *defrag)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct inode_defrag *entry;
- struct rb_node **p;
- struct rb_node *parent = NULL;
- int ret;
+ struct rb_node *node;
- p = &fs_info->defrag_inodes.rb_node;
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct inode_defrag, rb_node);
+ node = rb_find_add(&defrag->rb_node, &fs_info->defrag_inodes, inode_defrag_cmp);
+ if (node) {
+ struct inode_defrag *entry;
- ret = compare_inode_defrag(defrag, entry);
- if (ret < 0)
- p = &parent->rb_left;
- else if (ret > 0)
- p = &parent->rb_right;
- else {
- /*
- * If we're reinserting an entry for an old defrag run,
- * make sure to lower the transid of our existing
- * record.
- */
- if (defrag->transid < entry->transid)
- entry->transid = defrag->transid;
- entry->extent_thresh = min(defrag->extent_thresh,
- entry->extent_thresh);
- return -EEXIST;
- }
+ entry = rb_entry(node, struct inode_defrag, rb_node);
+ /*
+ * If we're reinserting an entry for an old defrag run, make
+ * sure to lower the transid of our existing record.
+ */
+ if (defrag->transid < entry->transid)
+ entry->transid = defrag->transid;
+ entry->extent_thresh = min(defrag->extent_thresh, entry->extent_thresh);
+ return -EEXIST;
}
set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
- rb_link_node(&defrag->rb_node, parent, p);
- rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
return 0;
}
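
The insertion rewrite above replaces an open-coded rbtree descent with rb_find_add(), which walks the tree using the supplied comparator, links and rebalances the new node when no match exists, and returns the already-present node on a collision. A self-contained sketch of the pattern; demo_node, demo_cmp and demo_insert are hypothetical illustrations, not btrfs code:

#include <linux/rbtree.h>
#include <linux/types.h>

struct demo_node {
	struct rb_node rb;
	u64 key;
};

static int demo_cmp(struct rb_node *new, const struct rb_node *existing)
{
	const struct demo_node *a = rb_entry(new, struct demo_node, rb);
	const struct demo_node *b = rb_entry(existing, struct demo_node, rb);

	if (a->key < b->key)
		return -1;
	if (a->key > b->key)
		return 1;
	return 0;
}

static bool demo_insert(struct rb_root *root, struct demo_node *node)
{
	/* rb_find_add() links and rebalances on success, or returns the
	 * existing node on a key collision without inserting. */
	return rb_find_add(&node->rb, root, demo_cmp) == NULL;
}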
@@ -854,8 +848,8 @@ static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t
{
struct address_space *mapping = inode->vfs_inode.i_mapping;
gfp_t mask = btrfs_alloc_write_mask(mapping);
- u64 folio_start;
- u64 folio_end;
+ u64 lock_start;
+ u64 lock_end;
struct extent_state *cached_state = NULL;
struct folio *folio;
int ret;
@@ -891,15 +885,15 @@ again:
return ERR_PTR(ret);
}
- folio_start = folio_pos(folio);
- folio_end = folio_pos(folio) + folio_size(folio) - 1;
+ lock_start = folio_pos(folio);
+ lock_end = folio_end(folio) - 1;
/* Wait for any existing ordered extent in the range */
while (1) {
struct btrfs_ordered_extent *ordered;
- btrfs_lock_extent(&inode->io_tree, folio_start, folio_end, &cached_state);
- ordered = btrfs_lookup_ordered_range(inode, folio_start, folio_size(folio));
- btrfs_unlock_extent(&inode->io_tree, folio_start, folio_end, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, lock_start, lock_end, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, lock_start, folio_size(folio));
+ btrfs_unlock_extent(&inode->io_tree, lock_start, lock_end, &cached_state);
if (!ordered)
break;
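
These hunks, and several later ones, substitute folio_end(folio) for folio_pos(folio) + folio_size(folio); the renamed locals lock_start/lock_end also make clear the range describes the extent lock, not the folio. Judging purely from that one-for-one replacement, the helper is presumably equivalent to:

/* Presumed definition, inferred from the substitution in these hunks. */
static inline u64 folio_end(struct folio *folio)
{
	return folio_pos(folio) + folio_size(folio);
}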
@@ -953,7 +947,7 @@ struct defrag_target_range {
* @extent_thresh: file extent size threshold, any extent size >= this value
* will be ignored
* @newer_than: only defrag extents newer than this value
- * @do_compress: whether the defrag is doing compression
+ * @do_compress: whether the defrag is doing compression or no-compression
* if true, @extent_thresh will be ignored and all regular
* file extents meeting @newer_than will be targets.
* @locked: if the range has already held extent lock
@@ -1184,8 +1178,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
if (!folio)
break;
- if (start >= folio_pos(folio) + folio_size(folio) ||
- start + len <= folio_pos(folio))
+ if (start >= folio_end(folio) || start + len <= folio_pos(folio))
continue;
btrfs_folio_clamp_clear_checked(fs_info, folio, start, len);
btrfs_folio_clamp_set_dirty(fs_info, folio, start, len);
@@ -1226,7 +1219,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
folios[i] = NULL;
goto free_folios;
}
- cur = folio_pos(folios[i]) + folio_size(folios[i]);
+ cur = folio_end(folios[i]);
}
for (int i = 0; i < nr_pages; i++) {
if (!folios[i])
@@ -1371,6 +1364,7 @@ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
u64 cur;
u64 last_byte;
bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
+ bool no_compress = (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS);
int compress_type = BTRFS_COMPRESS_ZLIB;
int compress_level = 0;
int ret = 0;
@@ -1401,6 +1395,9 @@ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
if (range->compress_type)
compress_type = range->compress_type;
}
+ } else if (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS) {
+ compress_type = BTRFS_DEFRAG_DONT_COMPRESS;
+ compress_level = 1;
}
if (extent_thresh == 0)
@@ -1451,13 +1448,14 @@ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
btrfs_inode_unlock(inode, 0);
break;
}
- if (do_compress) {
+ if (do_compress || no_compress) {
inode->defrag_compress = compress_type;
inode->defrag_compress_level = compress_level;
}
ret = defrag_one_cluster(inode, ra, cur,
cluster_end + 1 - cur, extent_thresh,
- newer_than, do_compress, &sectors_defragged,
+ newer_than, do_compress || no_compress,
+ &sectors_defragged,
max_to_defrag, &last_scanned);
if (sectors_defragged > prev_sectors_defragged)
@@ -1496,7 +1494,7 @@ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
ret = sectors_defragged;
}
- if (do_compress) {
+ if (do_compress || no_compress) {
btrfs_inode_lock(inode, 0);
inode->defrag_compress = BTRFS_COMPRESS_NONE;
btrfs_inode_unlock(inode, 0);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index c7cc24a5dd5e..0f8d8e275143 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -334,6 +334,20 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
return item;
}
+static int delayed_item_index_cmp(const void *key, const struct rb_node *node)
+{
+ const u64 *index = key;
+ const struct btrfs_delayed_item *delayed_item = rb_entry(node,
+ struct btrfs_delayed_item, rb_node);
+
+ if (delayed_item->index < *index)
+ return 1;
+ else if (delayed_item->index > *index)
+ return -1;
+
+ return 0;
+}
+
/*
* Look up the delayed item by key.
*
@@ -347,21 +361,10 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
struct rb_root *root,
u64 index)
{
- struct rb_node *node = root->rb_node;
- struct btrfs_delayed_item *delayed_item = NULL;
+ struct rb_node *node;
- while (node) {
- delayed_item = rb_entry(node, struct btrfs_delayed_item,
- rb_node);
- if (delayed_item->index < index)
- node = node->rb_right;
- else if (delayed_item->index > index)
- node = node->rb_left;
- else
- return delayed_item;
- }
-
- return NULL;
+ node = rb_find(&index, root, delayed_item_index_cmp);
+ return rb_entry_safe(node, struct btrfs_delayed_item, rb_node);
}
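
The lookup above becomes a thin wrapper around rb_find(), whose comparator receives the search key and a tree node and returns the sign of (key - node): delayed_item_index_cmp() returns 1 when the node's index is below the key (descend right) and -1 when above (descend left). rb_entry_safe() then maps a NULL miss straight to a NULL item. A companion sketch, reusing the hypothetical demo_node from the rb_find_add() example earlier:

static int demo_key_cmp(const void *key, const struct rb_node *node)
{
	const u64 *k = key;
	const struct demo_node *item = rb_entry(node, struct demo_node, rb);

	if (item->key < *k)
		return 1;	/* key sorts after this node */
	if (item->key > *k)
		return -1;	/* key sorts before this node */
	return 0;
}

static struct demo_node *demo_lookup(struct rb_root *root, u64 key)
{
	/* rb_entry_safe() turns a NULL rb_node into a NULL demo_node. */
	return rb_entry_safe(rb_find(&key, root, demo_key_cmp),
			     struct demo_node, rb);
}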
static int btrfs_delayed_item_cmp(const struct rb_node *new,
@@ -369,14 +372,8 @@ static int btrfs_delayed_item_cmp(const struct rb_node *new,
{
const struct btrfs_delayed_item *new_item =
rb_entry(new, struct btrfs_delayed_item, rb_node);
- const struct btrfs_delayed_item *exist_item =
- rb_entry(exist, struct btrfs_delayed_item, rb_node);
- if (new_item->index < exist_item->index)
- return -1;
- if (new_item->index > exist_item->index)
- return 1;
- return 0;
+ return delayed_item_index_cmp(&new_item->index, exist);
}
static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
@@ -1008,8 +1005,16 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_inode(trans, root, path, &key, mod);
if (ret > 0)
ret = -ENOENT;
- if (ret < 0)
+ if (ret < 0) {
+ /*
+ * If we fail to update the delayed inode we need to abort the
+ * transaction, because we could leave the inode with the
+ * improper counts behind.
+ */
+ if (ret != -ENOENT)
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
leaf = path->nodes[0];
inode_item = btrfs_item_ptr(leaf, path->slots[0],
@@ -1034,8 +1039,10 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
goto err_out;
+ }
ASSERT(ret > 0);
ASSERT(path->slots[0] > 0);
ret = 0;
@@ -1057,21 +1064,14 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
* in the same item doesn't exist.
*/
ret = btrfs_del_item(trans, root, path);
+ if (ret < 0)
+ btrfs_abort_transaction(trans, ret);
out:
btrfs_release_delayed_iref(node);
btrfs_release_path(path);
err_out:
btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0));
btrfs_release_delayed_inode(node);
-
- /*
- * If we fail to update the delayed inode we need to abort the
- * transaction, because we could leave the inode with the improper
- * counts behind.
- */
- if (ret && ret != -ENOENT)
- btrfs_abort_transaction(trans, ret);
-
return ret;
}
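
Net effect of this hunk series: the single abort at the end of __btrfs_update_delayed_inode() is replaced by aborts at each failure site, so the recorded abort location points at the operation that actually failed; -ENOENT from the initial lookup stays non-fatal, as before. The pattern, as a hedged sketch (demo_update is a hypothetical name):

static int demo_update(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *path)
{
	int ret;

	ret = btrfs_del_item(trans, root, path);
	if (ret < 0)
		btrfs_abort_transaction(trans, ret);	/* abort where it failed */
	return ret;
}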
@@ -1377,7 +1377,10 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info)
{
- WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root));
+ struct btrfs_delayed_node *node = btrfs_first_delayed_node(fs_info->delayed_root);
+
+ if (WARN_ON(node))
+ refcount_dec(&node->refs);
}
static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
@@ -1537,8 +1540,8 @@ release_node:
return ret;
}
-static int btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node,
- u64 index)
+static bool btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node,
+ u64 index)
{
struct btrfs_delayed_item *item;
@@ -1546,7 +1549,7 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node,
item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index);
if (!item) {
mutex_unlock(&node->mutex);
- return 1;
+ return false;
}
/*
@@ -1581,7 +1584,7 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node,
}
mutex_unlock(&node->mutex);
- return 0;
+ return true;
}
int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
@@ -1595,9 +1598,10 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
if (IS_ERR(node))
return PTR_ERR(node);
- ret = btrfs_delete_delayed_insertion_item(node, index);
- if (!ret)
+ if (btrfs_delete_delayed_insertion_item(node, index)) {
+ ret = 0;
goto end;
+ }
item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM);
if (!item) {
@@ -1614,7 +1618,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
*/
if (ret < 0) {
btrfs_err(trans->fs_info,
-"metadata reservation failed for delayed dir item deltiona, should have been reserved");
+"metadata reservation failed for delayed dir item deletion, index: %llu, root: %llu, inode: %llu, error: %d",
+ index, btrfs_root_id(node->root), node->inode_id, ret);
btrfs_release_delayed_item(item);
goto end;
}
@@ -1623,9 +1628,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
ret = __btrfs_add_delayed_item(node, item);
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
- "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
- index, btrfs_root_id(node->root),
- node->inode_id, ret);
+"failed to add delayed dir index item, root: %llu, inode: %llu, index: %llu, error: %d",
+ btrfs_root_id(node->root), node->inode_id, index, ret);
btrfs_delayed_item_release_metadata(dir->root, item);
btrfs_release_delayed_item(item);
}
@@ -1730,17 +1734,16 @@ void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode,
downgrade_write(&inode->vfs_inode.i_rwsem);
}
-int btrfs_should_delete_dir_index(const struct list_head *del_list,
- u64 index)
+bool btrfs_should_delete_dir_index(const struct list_head *del_list, u64 index)
{
struct btrfs_delayed_item *curr;
- int ret = 0;
+ bool ret = false;
list_for_each_entry(curr, del_list, readdir_list) {
if (curr->index > index)
break;
if (curr->index == index) {
- ret = 1;
+ ret = true;
break;
}
}
@@ -1750,15 +1753,14 @@ int btrfs_should_delete_dir_index(const struct list_head *del_list,
/*
* Read dir info stored in the delayed tree.
*/
-int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
- const struct list_head *ins_list)
+bool btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
+ const struct list_head *ins_list)
{
struct btrfs_dir_item *di;
struct btrfs_delayed_item *curr, *next;
struct btrfs_key location;
char *name;
int name_len;
- int over = 0;
unsigned char d_type;
/*
@@ -1767,6 +1769,8 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
* directory, nobody can delete any directory indexes now.
*/
list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
+ bool over;
+
list_del(&curr->readdir_list);
if (curr->index < ctx->pos) {
@@ -1784,17 +1788,16 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
d_type = fs_ftype_to_dtype(btrfs_dir_flags_to_ftype(di->type));
btrfs_disk_key_to_cpu(&location, &di->location);
- over = !dir_emit(ctx, name, name_len,
- location.objectid, d_type);
+ over = !dir_emit(ctx, name, name_len, location.objectid, d_type);
if (refcount_dec_and_test(&curr->refs))
kfree(curr);
if (over)
- return 1;
+ return true;
ctx->pos++;
}
- return 0;
+ return false;
}
static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
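
Three delayed-inode helpers switch from int to bool in this file. Note the polarity flip in btrfs_delete_delayed_insertion_item(): it used to return 0 after consuming a matching insertion item and 1 when none existed, whereas it now returns true when it consumed the item, so the caller's test inverts (see the btrfs_delete_delayed_dir_index() hunk above). A comment-only summary of the new contracts, inferred from the hunks:

/*
 * btrfs_delete_delayed_insertion_item(): true  -> matching insertion item
 *                                                 found and dropped, done
 *                                        false -> must queue a deletion item
 * btrfs_should_delete_dir_index():       true  -> index is pending deletion
 * btrfs_readdir_delayed_dir_index():     true  -> dir_emit() said stop
 */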
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index c4b4ba122beb..e6e763ad2d42 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -150,10 +150,9 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode,
void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode,
struct list_head *ins_list,
struct list_head *del_list);
-int btrfs_should_delete_dir_index(const struct list_head *del_list,
- u64 index);
-int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
- const struct list_head *ins_list);
+bool btrfs_should_delete_dir_index(const struct list_head *del_list, u64 index);
+bool btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
+ const struct list_head *ins_list);
/* Used during directory logging. */
void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 739c9e29aaa3..ca382c5b186f 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -928,7 +928,7 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
- if (is_fstree(generic_ref->ref_root))
+ if (btrfs_is_fstree(generic_ref->ref_root))
seq = atomic64_read(&fs_info->tree_mod_seq);
refcount_set(&ref->refs, 1);
@@ -958,8 +958,8 @@ void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root,
#endif
generic_ref->tree_ref.level = level;
generic_ref->type = BTRFS_REF_METADATA;
- if (skip_qgroup || !(is_fstree(generic_ref->ref_root) &&
- (!mod_root || is_fstree(mod_root))))
+ if (skip_qgroup || !(btrfs_is_fstree(generic_ref->ref_root) &&
+ (!mod_root || btrfs_is_fstree(mod_root))))
generic_ref->skip_qgroup = true;
else
generic_ref->skip_qgroup = false;
@@ -976,8 +976,8 @@ void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset,
generic_ref->data_ref.objectid = ino;
generic_ref->data_ref.offset = offset;
generic_ref->type = BTRFS_REF_DATA;
- if (skip_qgroup || !(is_fstree(generic_ref->ref_root) &&
- (!mod_root || is_fstree(mod_root))))
+ if (skip_qgroup || !(btrfs_is_fstree(generic_ref->ref_root) &&
+ (!mod_root || btrfs_is_fstree(mod_root))))
generic_ref->skip_qgroup = true;
else
generic_ref->skip_qgroup = false;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 78cc23837610..552ec4fa645d 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -420,7 +420,7 @@ bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
u64 root, u64 parent);
void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans);
-static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node)
+static inline u64 btrfs_delayed_ref_owner(const struct btrfs_delayed_ref_node *node)
{
if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
node->type == BTRFS_SHARED_DATA_REF_KEY)
@@ -428,7 +428,7 @@ static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node)
return node->tree_ref.level;
}
-static inline u64 btrfs_delayed_ref_offset(struct btrfs_delayed_ref_node *node)
+static inline u64 btrfs_delayed_ref_offset(const struct btrfs_delayed_ref_node *node)
{
if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
node->type == BTRFS_SHARED_DATA_REF_KEY)
@@ -436,7 +436,7 @@ static inline u64 btrfs_delayed_ref_offset(struct btrfs_delayed_ref_node *node)
return 0;
}
-static inline u8 btrfs_ref_type(struct btrfs_ref *ref)
+static inline u8 btrfs_ref_type(const struct btrfs_ref *ref)
{
ASSERT(ref->type == BTRFS_REF_DATA || ref->type == BTRFS_REF_METADATA);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 2decb9fff445..4675bcd5f92e 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -250,7 +250,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
}
bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
- fs_info->bdev_holder, NULL);
+ fs_info->sb, &fs_holder_ops);
if (IS_ERR(bdev_file)) {
btrfs_err(fs_info, "target device %s is invalid!", device_path);
return PTR_ERR(bdev_file);
@@ -327,7 +327,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return 0;
error:
- fput(bdev_file);
+ bdev_fput(bdev_file);
return ret;
}
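
The replace target is now opened with the superblock as holder and fs_holder_ops, and the error path releases it with bdev_fput() instead of plain fput(), keeping the holder-aware open and close paired. A trimmed sketch of that pairing (demo_open_target is hypothetical; the device-use step is elided):

#include <linux/blkdev.h>

static int demo_open_target(struct super_block *sb, const char *path)
{
	struct file *bdev_file;

	bdev_file = bdev_file_open_by_path(path, BLK_OPEN_WRITE,
					   sb, &fs_holder_ops);
	if (IS_ERR(bdev_file))
		return PTR_ERR(bdev_file);

	/* ... set up the device as a replace target ... */

	bdev_fput(bdev_file);	/* pairs with the holder-aware open */
	return 0;
}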
@@ -600,7 +600,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
return PTR_ERR(src_device);
if (btrfs_pinned_by_swapfile(fs_info, src_device)) {
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"cannot replace device %s (devid %llu) due to active swapfile",
btrfs_dev_name(src_device), src_device->devid);
return -ETXTBSY;
@@ -647,7 +647,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
dev_replace->srcdev = src_device;
dev_replace->tgtdev = tgt_device;
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"dev_replace from %s (devid %llu) to %s started",
btrfs_dev_name(src_device),
src_device->devid,
@@ -943,7 +943,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
tgt_device);
} else {
if (scrub_ret != -ECANCELED)
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"btrfs_scrub_dev(%s, %llu, %s) failed %d",
btrfs_dev_name(src_device),
src_device->devid,
@@ -961,7 +961,7 @@ error:
return scrub_ret;
}
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"dev_replace from %s (devid %llu) to %s finished",
btrfs_dev_name(src_device),
src_device->devid,
@@ -1109,7 +1109,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
* btrfs_dev_replace_finishing() will handle the
* cleanup part
*/
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"dev_replace from %s (devid %llu) to %s canceled",
btrfs_dev_name(src_device), src_device->devid,
btrfs_dev_name(tgt_device));
@@ -1143,7 +1143,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
ret = btrfs_commit_transaction(trans);
WARN_ON(ret);
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"suspended dev_replace from %s (devid %llu) to %s canceled",
btrfs_dev_name(src_device), src_device->devid,
btrfs_dev_name(tgt_device));
@@ -1247,7 +1247,7 @@ static int btrfs_dev_replace_kthread(void *data)
progress = btrfs_dev_replace_progress(fs_info);
progress = div_u64(progress, 10);
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"continuing dev_replace from %s (devid %llu) to target %s @%u%%",
btrfs_dev_name(dev_replace->srcdev),
dev_replace->srcdev->devid,
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index b29cc31a7c4a..69863e398e22 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -227,7 +227,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
return di;
}
-int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir_ino,
const struct fscrypt_str *name)
{
int ret;
@@ -242,7 +242,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
if (!path)
return -ENOMEM;
- key.objectid = dir;
+ key.objectid = dir_ino;
key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name->name, name->len);
diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h
index 8462579a95f4..e52174a8baf9 100644
--- a/fs/btrfs/dir-item.h
+++ b/fs/btrfs/dir-item.h
@@ -14,7 +14,7 @@ struct btrfs_inode;
struct btrfs_root;
struct btrfs_trans_handle;
-int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir_ino,
const struct fscrypt_str *name);
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
const struct fscrypt_str *name, struct btrfs_inode *dir,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1beb9458f622..70fc4e7cc5a0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -884,7 +884,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
btrfs_set_root_used(&root->root_item, leaf->len);
btrfs_set_root_last_snapshot(&root->root_item, 0);
btrfs_set_root_dirid(&root->root_item, 0);
- if (is_fstree(objectid))
+ if (btrfs_is_fstree(objectid))
generate_random_guid(root->root_item.uuid);
else
export_guid(root->root_item.uuid, &guid_null);
@@ -1104,7 +1104,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
if (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
!btrfs_is_data_reloc_root(root) &&
- is_fstree(btrfs_root_id(root))) {
+ btrfs_is_fstree(btrfs_root_id(root))) {
set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
btrfs_check_and_init_root_item(&root->root_item);
}
@@ -1113,7 +1113,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
* Don't assign anonymous block device to roots that are not exposed to
* userspace, the id pool is limited to 1M
*/
- if (is_fstree(btrfs_root_id(root)) &&
+ if (btrfs_is_fstree(btrfs_root_id(root)) &&
btrfs_root_refs(&root->root_item) > 0) {
if (!anon_dev) {
ret = get_anon_bdev(&root->anon_dev);
@@ -1246,6 +1246,8 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
{
struct percpu_counter *em_counter = &fs_info->evictable_extent_maps;
+ if (fs_info->fs_devices)
+ btrfs_close_devices(fs_info->fs_devices);
percpu_counter_destroy(&fs_info->stats_read_blocks);
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
@@ -1315,7 +1317,7 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
* This is namely for free-space-tree and quota tree, which can change
* at runtime and should only be grabbed from fs_info.
*/
- if (!is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (!btrfs_is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
return ERR_PTR(-ENOENT);
again:
root = btrfs_lookup_fs_root(fs_info, objectid);
@@ -1835,6 +1837,8 @@ void btrfs_put_root(struct btrfs_root *root)
if (refcount_dec_and_test(&root->refs)) {
if (WARN_ON(!xa_empty(&root->inodes)))
xa_destroy(&root->inodes);
+ if (WARN_ON(!xa_empty(&root->delayed_nodes)))
+ xa_destroy(&root->delayed_nodes);
WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
if (root->anon_dev)
free_anon_bdev(root->anon_dev);
@@ -1945,7 +1949,6 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
fs_info->qgroup_tree = RB_ROOT;
INIT_LIST_HEAD(&fs_info->dirty_qgroups);
fs_info->qgroup_seq = 1;
- fs_info->qgroup_ulist = NULL;
fs_info->qgroup_rescan_running = false;
fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT;
mutex_init(&fs_info->qgroup_rescan_lock);
@@ -2026,14 +2029,10 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
fs_info->csum_shash = csum_shash;
- /*
- * Check if the checksum implementation is a fast accelerated one.
- * As-is this is a bit of a hack and should be replaced once the csum
- * implementations provide that information themselves.
- */
+ /* Check if the checksum implementation is a fast accelerated one. */
switch (csum_type) {
case BTRFS_CSUM_TYPE_CRC32:
- if (!strstr(crypto_shash_driver_name(csum_shash), "generic"))
+ if (crc32_optimizations() & CRC32C_OPTIMIZATION)
set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
break;
case BTRFS_CSUM_TYPE_XXHASH:
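
The fast-checksum probe stops string-matching the crypto driver name and asks the CRC library directly. Assuming crc32_optimizations() returns a bitmask of hardware-accelerated variants, as the hunk implies, the check reduces to:

#include <linux/crc32.h>

/* Sketch: true when the crc32c implementation is hardware accelerated. */
static bool demo_crc32c_is_fast(void)
{
	return crc32_optimizations() & CRC32C_OPTIMIZATION;
}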
@@ -2156,8 +2155,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
found = true;
root = read_tree_root_path(tree_root, path, &key);
if (IS_ERR(root)) {
- if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
- ret = PTR_ERR(root);
+ ret = PTR_ERR(root);
break;
}
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
@@ -3395,6 +3393,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
fs_info->nodesize = nodesize;
+ fs_info->nodesize_bits = ilog2(nodesize);
fs_info->sectorsize = sectorsize;
fs_info->sectorsize_bits = ilog2(sectorsize);
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
@@ -3560,6 +3559,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_sysfs;
}
+ btrfs_zoned_reserve_data_reloc_bg(fs_info);
btrfs_free_zone_cache(fs_info);
btrfs_check_active_zone_reservation(fs_info);
@@ -3680,7 +3680,6 @@ fail_alloc:
iput(fs_info->btree_inode);
fail:
- btrfs_close_devices(fs_info->fs_devices);
ASSERT(ret < 0);
return ret;
}
@@ -3693,7 +3692,7 @@ static void btrfs_end_super_write(struct bio *bio)
bio_for_each_folio_all(fi, bio) {
if (bio->bi_status) {
- btrfs_warn_rl_in_rcu(device->fs_info,
+ btrfs_warn_rl(device->fs_info,
"lost super block write due to IO error on %s (%d)",
btrfs_dev_name(device),
blk_status_to_errno(bio->bi_status));
@@ -3991,7 +3990,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
}
if (min_tolerated == INT_MAX) {
- pr_warn("BTRFS: unknown raid flag: %llu", flags);
+ btrfs_warn(NULL, "unknown raid flag: %llu", flags);
min_tolerated = 0;
}
@@ -4310,8 +4309,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
*
* So wait for all ongoing ordered extents to complete and then run
* delayed iputs. This works because once we reach this point no one
- * can either create new ordered extents nor create delayed iputs
- * through some other means.
+ * can create new ordered extents, but delayed iputs can still be added
+ * by a reclaim worker (see comments further below).
*
* Also note that btrfs_wait_ordered_roots() is not safe here, because
* it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
@@ -4322,15 +4321,29 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_flush_workqueue(fs_info->endio_write_workers);
/* Ordered extents for free space inodes. */
btrfs_flush_workqueue(fs_info->endio_freespace_worker);
+ /*
+ * Run delayed iputs in case an async reclaim worker is waiting for them
+ * to be run as mentioned above.
+ */
btrfs_run_delayed_iputs(fs_info);
- /* There should be no more workload to generate new delayed iputs. */
- set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state);
cancel_work_sync(&fs_info->async_reclaim_work);
cancel_work_sync(&fs_info->async_data_reclaim_work);
cancel_work_sync(&fs_info->preempt_reclaim_work);
cancel_work_sync(&fs_info->em_shrinker_work);
+ /*
+ * Run delayed iputs again because an async reclaim worker may have
+ * added new ones if it was flushing delalloc:
+ *
+ * shrink_delalloc() -> btrfs_start_delalloc_roots() ->
+ * start_delalloc_inodes() -> btrfs_add_delayed_iput()
+ */
+ btrfs_run_delayed_iputs(fs_info);
+
+ /* There should be no more workload to generate new delayed iputs. */
+ set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state);
+
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
@@ -4413,7 +4426,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
iput(fs_info->btree_inode);
btrfs_mapping_tree_free(fs_info);
- btrfs_close_devices(fs_info->fs_devices);
}
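
Taken together, the close_ctree()/btrfs_free_fs_info() hunks rework shutdown so that delayed iputs queued by the reclaim workers are not missed and device closing happens exactly once, in the final free path. The resulting order, reconstructed from the hunks above:

/*
 * close_ctree():
 *   flush endio_write / endio_freespace workqueues
 *   btrfs_run_delayed_iputs()             // first pass
 *   cancel reclaim + em-shrinker workers  // flushing delalloc here may
 *                                         // queue more delayed iputs
 *   btrfs_run_delayed_iputs()             // second pass drains those
 *   set BTRFS_FS_STATE_NO_DELAYED_IPUT    // only now is it safe
 *   ...
 * btrfs_free_fs_info():
 *   btrfs_close_devices()                 // moved here from close_ctree()
 *                                         // and the open_ctree() fail path
 */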
void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
@@ -4625,7 +4637,7 @@ static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end,
mark, NULL)) {
- btrfs_clear_extent_bits(dirty_pages, start, end, mark);
+ btrfs_clear_extent_bit(dirty_pages, start, end, mark, NULL);
while (start <= end) {
eb = find_extent_buffer(fs_info, start);
start += fs_info->nodesize;
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index b1b96eb5f64e..66361325f6dc 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -43,7 +43,8 @@ static inline void btrfs_extent_state_leak_debug_check(void)
while (!list_empty(&states)) {
state = list_first_entry(&states, struct extent_state, leak_list);
- pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
+ btrfs_err(NULL,
+ "state leak: start %llu end %llu state %u in tree %d refs %d",
state->start, state->end, state->state,
extent_state_in_tree(state),
refcount_read(&state->refs));
@@ -1882,12 +1883,11 @@ int btrfs_clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 e
bool btrfs_try_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_state **cached)
{
- int err;
+ int ret;
u64 failed_start;
- err = set_extent_bit(tree, start, end, bits, &failed_start, NULL,
- cached, NULL);
- if (err == -EEXIST) {
+ ret = set_extent_bit(tree, start, end, bits, &failed_start, NULL, cached, NULL);
+ if (ret == -EEXIST) {
if (failed_start > start)
btrfs_clear_extent_bit(tree, start, failed_start - 1,
bits, cached);
@@ -1904,21 +1904,21 @@ int btrfs_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32
struct extent_state **cached_state)
{
struct extent_state *failed_state = NULL;
- int err;
+ int ret;
u64 failed_start;
- err = set_extent_bit(tree, start, end, bits, &failed_start,
+ ret = set_extent_bit(tree, start, end, bits, &failed_start,
&failed_state, cached_state, NULL);
- while (err == -EEXIST) {
+ while (ret == -EEXIST) {
if (failed_start != start)
btrfs_clear_extent_bit(tree, start, failed_start - 1,
bits, cached_state);
wait_extent_bit(tree, failed_start, end, bits, &failed_state);
- err = set_extent_bit(tree, start, end, bits, &failed_start,
+ ret = set_extent_bit(tree, start, end, bits, &failed_start,
&failed_state, cached_state, NULL);
}
- return err;
+ return ret;
}
/*
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 0a18ca9c59c3..36facca37973 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -19,7 +19,8 @@ enum {
ENUM_BIT(EXTENT_DIRTY),
ENUM_BIT(EXTENT_LOCKED),
ENUM_BIT(EXTENT_DIO_LOCKED),
- ENUM_BIT(EXTENT_NEW),
+ ENUM_BIT(EXTENT_DIRTY_LOG1),
+ ENUM_BIT(EXTENT_DIRTY_LOG2),
ENUM_BIT(EXTENT_DELALLOC),
ENUM_BIT(EXTENT_DEFRAG),
ENUM_BIT(EXTENT_BOUNDARY),
@@ -191,12 +192,6 @@ static inline int btrfs_unlock_extent(struct extent_io_tree *tree, u64 start, u6
cached, NULL);
}
-static inline int btrfs_clear_extent_bits(struct extent_io_tree *tree, u64 start,
- u64 end, u32 bits)
-{
- return btrfs_clear_extent_bit(tree, start, end, bits, NULL);
-}
-
int btrfs_set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset);
int btrfs_set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cb6128778a83..97d517cdf2df 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -46,7 +46,7 @@
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
struct extent_buffer *leaf,
@@ -56,12 +56,12 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 flags, u64 owner, u64 offset,
struct btrfs_key *ins, int ref_mod, u64 oref_root);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op);
-static int find_next_key(struct btrfs_path *path, int level,
+static int find_next_key(const struct btrfs_path *path, int level,
struct btrfs_key *key);
-static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
+static int block_group_bits(const struct btrfs_block_group *cache, u64 bits)
{
return (cache->flags & bits) == bits;
}
@@ -329,7 +329,7 @@ search_again:
* is_data == BTRFS_REF_TYPE_ANY, either type is OK.
*/
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
- struct btrfs_extent_inline_ref *iref,
+ const struct btrfs_extent_inline_ref *iref,
enum btrfs_inline_ref_type is_data)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
@@ -401,16 +401,16 @@ u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
return ((u64)high_crc << 31) ^ (u64)low_crc;
}
-static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
- struct btrfs_extent_data_ref *ref)
+static u64 hash_extent_data_ref_item(const struct extent_buffer *leaf,
+ const struct btrfs_extent_data_ref *ref)
{
return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
btrfs_extent_data_ref_objectid(leaf, ref),
btrfs_extent_data_ref_offset(leaf, ref));
}
-static bool match_extent_data_ref(struct extent_buffer *leaf,
- struct btrfs_extent_data_ref *ref,
+static bool match_extent_data_ref(const struct extent_buffer *leaf,
+ const struct btrfs_extent_data_ref *ref,
u64 root_objectid, u64 owner, u64 offset)
{
if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
@@ -497,7 +497,7 @@ fail:
static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
u64 bytenr)
{
struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
@@ -617,13 +617,13 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
return ret;
}
-static noinline u32 extent_data_ref_count(struct btrfs_path *path,
- struct btrfs_extent_inline_ref *iref)
+static noinline u32 extent_data_ref_count(const struct btrfs_path *path,
+ const struct btrfs_extent_inline_ref *iref)
{
struct btrfs_key key;
struct extent_buffer *leaf;
- struct btrfs_extent_data_ref *ref1;
- struct btrfs_shared_data_ref *ref2;
+ const struct btrfs_extent_data_ref *ref1;
+ const struct btrfs_shared_data_ref *ref2;
u32 num_refs = 0;
int type;
@@ -638,10 +638,10 @@ static noinline u32 extent_data_ref_count(struct btrfs_path *path,
type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
ASSERT(type != BTRFS_REF_TYPE_INVALID);
if (type == BTRFS_EXTENT_DATA_REF_KEY) {
- ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
+ ref1 = (const struct btrfs_extent_data_ref *)(&iref->offset);
num_refs = btrfs_extent_data_ref_count(leaf, ref1);
} else {
- ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
+ ref2 = (const struct btrfs_shared_data_ref *)(iref + 1);
num_refs = btrfs_shared_data_ref_count(leaf, ref2);
}
} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
@@ -684,7 +684,7 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
u64 bytenr)
{
struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
@@ -722,7 +722,7 @@ static inline int extent_ref_type(u64 parent, u64 owner)
return type;
}
-static int find_next_key(struct btrfs_path *path, int level,
+static int find_next_key(const struct btrfs_path *path, int level,
struct btrfs_key *key)
{
@@ -1480,7 +1480,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
*
*/
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op)
{
BTRFS_PATH_AUTO_FREE(path);
@@ -1522,19 +1522,21 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
/* now insert the actual backref */
- if (owner < BTRFS_FIRST_FREE_OBJECTID)
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
ret = insert_tree_block_ref(trans, path, node, bytenr);
- else
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ } else {
ret = insert_extent_data_ref(trans, path, node, bytenr);
-
- if (ret)
- btrfs_abort_transaction(trans, ret);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
return ret;
}
static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_head *href)
+ const struct btrfs_delayed_ref_head *href)
{
u64 root = href->owning_root;
@@ -1543,7 +1545,7 @@ static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
* where it has already been unset.
*/
if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE ||
- !href->is_data || !is_fstree(root))
+ !href->is_data || !btrfs_is_fstree(root))
return;
btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes,
@@ -1552,7 +1554,7 @@ static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
{
@@ -1620,7 +1622,7 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
}
static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_head *head,
+ const struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -1707,7 +1709,7 @@ again:
static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
{
@@ -1754,7 +1756,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
/* helper function to actually process a single delayed ref entry */
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
{
@@ -2998,7 +3000,7 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
return ret;
}
- ret = add_to_free_space_tree(trans, bytenr, num_bytes);
+ ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -3079,7 +3081,7 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
*/
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *info = trans->fs_info;
@@ -3649,6 +3651,21 @@ btrfs_release_block_group(struct btrfs_block_group *cache,
btrfs_put_block_group(cache);
}
+static bool find_free_extent_check_size_class(const struct find_free_extent_ctl *ffe_ctl,
+ const struct btrfs_block_group *bg)
+{
+ if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
+ return true;
+ if (!btrfs_block_group_should_use_size_class(bg))
+ return true;
+ if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
+ return true;
+ if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
+ bg->size_class == BTRFS_BG_SZ_NONE)
+ return true;
+ return ffe_ctl->size_class == bg->size_class;
+}
+
/*
* Helper function for find_free_extent().
*
@@ -3670,7 +3687,8 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg,
if (!cluster_bg)
goto refill_cluster;
if (cluster_bg != bg && (cluster_bg->ro ||
- !block_group_bits(cluster_bg, ffe_ctl->flags)))
+ !block_group_bits(cluster_bg, ffe_ctl->flags) ||
+ !find_free_extent_check_size_class(ffe_ctl, cluster_bg)))
goto release_cluster;
offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
@@ -4227,21 +4245,6 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
return -ENOSPC;
}
-static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl,
- struct btrfs_block_group *bg)
-{
- if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
- return true;
- if (!btrfs_block_group_should_use_size_class(bg))
- return true;
- if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
- return true;
- if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
- bg->size_class == BTRFS_BG_SZ_NONE)
- return true;
- return ffe_ctl->size_class == bg->size_class;
-}
-
static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_space_info *space_info,
@@ -4782,7 +4785,7 @@ static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
- ret = remove_from_free_space_tree(trans, bytenr, num_bytes);
+ ret = btrfs_remove_from_free_space_tree(trans, bytenr, num_bytes);
if (ret)
return ret;
@@ -4873,7 +4876,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
}
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -4961,7 +4964,7 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID);
- if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root))
+ if (btrfs_is_data_reloc_root(root) && btrfs_is_fstree(root->relocation_src_root))
generic_ref.owning_root = root->relocation_src_root;
btrfs_init_data_ref(&generic_ref, owner, offset, 0, false);
@@ -4983,7 +4986,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
int ret;
struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
- struct btrfs_squota_delta delta = {
+ const struct btrfs_squota_delta delta = {
.root = root_objectid,
.num_bytes = ins->offset,
.generation = trans->transid,
@@ -5111,11 +5114,11 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (buf->log_index == 0)
btrfs_set_extent_bit(&root->dirty_log_pages, buf->start,
buf->start + buf->len - 1,
- EXTENT_DIRTY, NULL);
+ EXTENT_DIRTY_LOG1, NULL);
else
btrfs_set_extent_bit(&root->dirty_log_pages, buf->start,
buf->start + buf->len - 1,
- EXTENT_NEW, NULL);
+ EXTENT_DIRTY_LOG2, NULL);
} else {
buf->log_index = -1;
btrfs_set_extent_bit(&trans->transaction->dirty_pages, buf->start,
@@ -5552,7 +5555,7 @@ again:
goto again;
}
- exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent);
+ exists = btrfs_find_delayed_tree_ref(head, btrfs_root_id(root), parent);
mutex_unlock(&head->mutex);
out:
spin_unlock(&delayed_refs->lock);
@@ -5872,15 +5875,20 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (wc->refs[level] == 1) {
if (level == 0) {
- if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
ret = btrfs_dec_ref(trans, root, eb, 1);
- else
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ } else {
ret = btrfs_dec_ref(trans, root, eb, 0);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- return ret;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
}
- if (is_fstree(btrfs_root_id(root))) {
+ if (btrfs_is_fstree(btrfs_root_id(root))) {
ret = btrfs_qgroup_trace_leaf_items(trans, eb);
if (ret) {
btrfs_err_rl(fs_info,
@@ -6341,7 +6349,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
btrfs_assert_tree_write_locked(parent);
parent_level = btrfs_header_level(parent);
- atomic_inc(&parent->refs);
+ refcount_inc(&parent->refs);
path->nodes[parent_level] = parent;
path->slots[parent_level] = btrfs_header_nritems(parent);
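
Alongside the const-correctness sweep, this file (like ctree.c and extent_io.c elsewhere in the diff) converts extent-buffer reference bumps from atomic_inc() to refcount_inc(), implying eb->refs is now a refcount_t: increments from zero and overflows saturate and WARN instead of silently wrapping. A minimal sketch of the semantics, using a hypothetical struct rather than btrfs code:

#include <linux/refcount.h>

struct demo_buffer {
	refcount_t refs;
};

static void demo_get(struct demo_buffer *b)
{
	refcount_inc(&b->refs);	/* WARNs on 0->1, saturates on overflow */
}

static bool demo_put(struct demo_buffer *b)
{
	/* true exactly once, when the last reference is dropped */
	return refcount_dec_and_test(&b->refs);
}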
@@ -6442,7 +6450,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
/* Check if there are any CHUNK_* bits left */
if (start > device->total_bytes) {
DEBUG_WARN();
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
start, end - start + 1,
btrfs_dev_name(device),
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index 72914074c304..82d3a82dc712 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -97,7 +97,7 @@ enum btrfs_inline_ref_type {
};
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
- struct btrfs_extent_inline_ref *iref,
+ const struct btrfs_extent_inline_ref *iref,
enum btrfs_inline_ref_type is_data);
u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 849199768664..835b0deef9bb 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -75,9 +75,9 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
while (!list_empty(&fs_info->allocated_ebs)) {
eb = list_first_entry(&fs_info->allocated_ebs,
struct extent_buffer, leak_list);
- pr_err(
- "BTRFS: buffer leak start %llu len %u refs %d bflags %lu owner %llu\n",
- eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
+ btrfs_err(fs_info,
+ "buffer leak start %llu len %u refs %d bflags %lu owner %llu",
+ eb->start, eb->len, refcount_read(&eb->refs), eb->bflags,
btrfs_header_owner(eb));
list_del(&eb->leak_list);
WARN_ON_ONCE(1);
@@ -110,6 +110,7 @@ struct btrfs_bio_ctrl {
* This is to avoid touching ranges covered by compression/inline.
*/
unsigned long submit_bitmap;
+ struct readahead_control *ractl;
};
static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
@@ -266,8 +267,7 @@ static noinline int lock_delalloc_folios(struct inode *inode,
goto out;
}
range_start = max_t(u64, folio_pos(folio), start);
- range_len = min_t(u64, folio_pos(folio) + folio_size(folio),
- end + 1) - range_start;
+ range_len = min_t(u64, folio_end(folio), end + 1) - range_start;
btrfs_folio_set_lock(fs_info, folio, range_start, range_len);
processed_end = range_start + range_len - 1;
@@ -321,7 +321,7 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
ASSERT(orig_end > orig_start);
/* The range should at least cover part of the folio */
- ASSERT(!(orig_start >= folio_pos(locked_folio) + folio_size(locked_folio) ||
+ ASSERT(!(orig_start >= folio_end(locked_folio) ||
orig_end <= folio_pos(locked_folio)));
again:
/* step one, find a bunch of delalloc bytes starting at start */
@@ -419,7 +419,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
ASSERT(folio_pos(folio) <= start &&
- start + len <= folio_pos(folio) + folio_size(folio));
+ start + len <= folio_end(folio));
if (uptodate && btrfs_verify_folio(folio, start, len))
btrfs_folio_set_uptodate(fs_info, folio, start, len);
@@ -782,7 +782,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
static int attach_extent_buffer_folio(struct extent_buffer *eb,
struct folio *folio,
- struct btrfs_subpage *prealloc)
+ struct btrfs_folio_state *prealloc)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
int ret = 0;
@@ -806,7 +806,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb,
/* Already mapped, just free prealloc */
if (folio_test_private(folio)) {
- btrfs_free_subpage(prealloc);
+ btrfs_free_folio_state(prealloc);
return 0;
}
@@ -815,7 +815,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb,
folio_attach_private(folio, prealloc);
else
/* Do new allocation to attach subpage */
- ret = btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA);
+ ret = btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA);
return ret;
}
@@ -831,7 +831,7 @@ int set_folio_extent_mapped(struct folio *folio)
fs_info = folio_to_fs_info(folio);
if (btrfs_is_subpage(fs_info, folio))
- return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA);
+ return btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA);
folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE);
return 0;
@@ -848,7 +848,7 @@ void clear_folio_extent_mapped(struct folio *folio)
fs_info = folio_to_fs_info(folio);
if (btrfs_is_subpage(fs_info, folio))
- return btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA);
+ return btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA);
folio_detach_private(folio);
}
@@ -882,6 +882,25 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode,
return em;
}
+
+static void btrfs_readahead_expand(struct readahead_control *ractl,
+ const struct extent_map *em)
+{
+ const u64 ra_pos = readahead_pos(ractl);
+ const u64 ra_end = ra_pos + readahead_length(ractl);
+ const u64 em_end = em->start + em->ram_bytes;
+
+ /* No expansion for holes and inline extents. */
+ if (em->disk_bytenr > EXTENT_MAP_LAST_BYTE)
+ return;
+
+ ASSERT(em_end >= ra_pos,
+ "extent_map %llu %llu ends before current readahead position %llu",
+ em->start, em->len, ra_pos);
+ if (em_end > ra_end)
+ readahead_expand(ractl, ra_pos, em_end - ra_pos);
+}
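/*
 * Worked example for the helper above, with hypothetical numbers: a 16K
 * readahead window at offset 0 hits a compressed extent map with
 * em->start = 0 and em->ram_bytes = 128K.
 *
 *	ra_pos = 0, ra_end = 16K, em_end = 128K
 *	em_end > ra_end  =>  readahead_expand(ractl, 0, 128K)
 *
 * The window then covers the whole compressed extent, so the folios that
 * add_ra_bio_pages() wants to populate are already present in the cache.
 */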
+
/*
* basic readpage implementation. Locked extent state structs are inserted
* into the tree that are removed when the IO is done (by the end_io
@@ -945,6 +964,16 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
compress_type = btrfs_extent_map_compression(em);
+ /*
+ * Only expand readahead for extents whose pages are already being
+ * created anyway by add_ra_bio_pages(), i.e. compressed extents in
+ * the non-subpage case.
+ */
+ if (bio_ctrl->ractl &&
+ !btrfs_is_subpage(fs_info, folio) &&
+ compress_type != BTRFS_COMPRESS_NONE)
+ btrfs_readahead_expand(bio_ctrl->ractl, em);
+
if (compress_type != BTRFS_COMPRESS_NONE)
disk_bytenr = em->disk_bytenr;
else
@@ -1086,7 +1115,7 @@ static bool can_skip_one_ordered_range(struct btrfs_inode *inode,
* finished our folio read and unlocked the folio.
*/
if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) {
- u64 range_len = min(folio_pos(folio) + folio_size(folio),
+ u64 range_len = min(folio_end(folio),
ordered->file_offset + ordered->num_bytes) - cur;
ret = true;
@@ -1108,7 +1137,7 @@ static bool can_skip_one_ordered_range(struct btrfs_inode *inode,
* So we return true and update @next_ret to the OE/folio boundary.
*/
if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
- u64 range_len = min(folio_pos(folio) + folio_size(folio),
+ u64 range_len = min(folio_end(folio),
ordered->file_offset + ordered->num_bytes) - cur;
/*
@@ -1663,7 +1692,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
int ret;
size_t pg_offset;
loff_t i_size = i_size_read(&inode->vfs_inode);
- unsigned long end_index = i_size >> PAGE_SHIFT;
+ const pgoff_t end_index = i_size >> PAGE_SHIFT;
const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc);
@@ -1704,7 +1733,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
btrfs_err_rl(fs_info,
"root %lld ino %llu folio %llu is marked dirty without notifying the fs",
- inode->root->root_key.objectid,
+ btrfs_root_id(inode->root),
btrfs_ino(inode), folio_pos(folio));
ret = -EUCLEAN;
goto done;
@@ -1774,7 +1803,7 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e
*/
spin_lock(&eb->refs_lock);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
- XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
+ XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
unsigned long flags;
set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
@@ -1874,7 +1903,7 @@ static void set_btree_ioerr(struct extent_buffer *eb)
static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
+ XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
unsigned long flags;
xas_lock_irqsave(&xas, flags);
@@ -1886,7 +1915,7 @@ static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark)
static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits);
+ XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
unsigned long flags;
xas_lock_irqsave(&xas, flags);
@@ -1961,7 +1990,7 @@ retry:
if (!eb)
return NULL;
- if (!atomic_inc_not_zero(&eb->refs)) {
+ if (!refcount_inc_not_zero(&eb->refs)) {
xas_reset(xas);
goto retry;
}
@@ -1986,7 +2015,7 @@ static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info,
rcu_read_lock();
while ((eb = find_get_eb(&xas, end, tag)) != NULL) {
if (!eb_batch_add(batch, eb)) {
- *start = ((eb->start + eb->len) >> fs_info->sectorsize_bits);
+ *start = ((eb->start + eb->len) >> fs_info->nodesize_bits);
goto out;
}
}
@@ -2008,11 +2037,11 @@ static struct extent_buffer *find_extent_buffer_nolock(
struct btrfs_fs_info *fs_info, u64 start)
{
struct extent_buffer *eb;
- unsigned long index = (start >> fs_info->sectorsize_bits);
+ unsigned long index = (start >> fs_info->nodesize_bits);
rcu_read_lock();
eb = xa_load(&fs_info->buffer_tree, index);
- if (eb && !atomic_inc_not_zero(&eb->refs))
+ if (eb && !refcount_inc_not_zero(&eb->refs))
eb = NULL;
rcu_read_unlock();
return eb;
@@ -2031,10 +2060,7 @@ static void end_bbio_meta_write(struct btrfs_bio *bbio)
}
buffer_tree_clear_mark(eb, PAGECACHE_TAG_WRITEBACK);
- clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
- smp_mb__after_atomic();
- wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
-
+ clear_and_wake_up_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
bio_put(&bbio->bio);
}
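
For reference, the helper substituted here expands to roughly the following (per include/linux/wait_bit.h); note the unlock-flavoured clear, so the replacement is at least as strongly ordered as the open-coded clear_bit() sequence it replaces:

static inline void clear_and_wake_up_bit(int bit, unsigned long *word)
{
	clear_bit_unlock(bit, word);	/* clear with release semantics */
	smp_mb__after_atomic();		/* order the clear before the wakeup */
	wake_up_bit(word, bit);
}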
@@ -2085,7 +2111,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio = eb->folios[i];
u64 range_start = max_t(u64, eb->start, folio_pos(folio));
- u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio),
+ u32 range_len = min_t(u64, folio_end(folio),
eb->start + eb->len) - range_start;
folio_lock(folio);
@@ -2114,8 +2140,8 @@ void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start,
u64 end)
{
struct eb_batch batch;
- unsigned long start_index = (start >> fs_info->sectorsize_bits);
- unsigned long end_index = (end >> fs_info->sectorsize_bits);
+ unsigned long start_index = (start >> fs_info->nodesize_bits);
+ unsigned long end_index = (end >> fs_info->nodesize_bits);
eb_batch_init(&batch);
while (start_index <= end_index) {
@@ -2151,7 +2177,7 @@ int btree_write_cache_pages(struct address_space *mapping,
eb_batch_init(&batch);
if (wbc->range_cyclic) {
- index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->sectorsize_bits);
+ index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->nodesize_bits);
end = -1;
/*
@@ -2160,8 +2186,8 @@ int btree_write_cache_pages(struct address_space *mapping,
*/
scanned = (index == 0);
} else {
- index = (wbc->range_start >> fs_info->sectorsize_bits);
- end = (wbc->range_end >> fs_info->sectorsize_bits);
+ index = (wbc->range_start >> fs_info->nodesize_bits);
+ end = (wbc->range_end >> fs_info->nodesize_bits);
scanned = 1;
}
@@ -2489,7 +2515,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
continue;
}
- cur_end = min_t(u64, folio_pos(folio) + folio_size(folio) - 1, end);
+ cur_end = min_t(u64, folio_end(folio) - 1, end);
cur_len = cur_end + 1 - cur;
ASSERT(folio_test_locked(folio));
@@ -2541,7 +2567,10 @@ int btrfs_writepages(struct address_space *mapping, struct writeback_control *wb
void btrfs_readahead(struct readahead_control *rac)
{
- struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD };
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .opf = REQ_OP_READ | REQ_RAHEAD,
+ .ractl = rac
+ };
struct folio *folio;
struct btrfs_inode *inode = BTRFS_I(rac->mapping->host);
const u64 start = readahead_pos(rac);
@@ -2731,13 +2760,13 @@ static int extent_buffer_under_io(const struct extent_buffer *eb)
static bool folio_range_has_eb(struct folio *folio)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
lockdep_assert_held(&folio->mapping->i_private_lock);
if (folio_test_private(folio)) {
- subpage = folio_get_private(folio);
- if (atomic_read(&subpage->eb_refs))
+ bfs = folio_get_private(folio);
+ if (atomic_read(&bfs->eb_refs))
return true;
}
return false;
@@ -2787,7 +2816,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
* attached to one dummy eb, no sharing.
*/
if (!mapped) {
- btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA);
+ btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA);
return;
}
@@ -2798,7 +2827,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
* page range and no unfinished IO.
*/
if (!folio_range_has_eb(folio))
- btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA);
+ btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA);
spin_unlock(&mapping->i_private_lock);
}
@@ -2842,7 +2871,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info
btrfs_leak_debug_add_eb(eb);
spin_lock_init(&eb->refs_lock);
- atomic_set(&eb->refs, 1);
+ refcount_set(&eb->refs, 1);
ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE);
@@ -2975,13 +3004,13 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
* once io is initiated, TREE_REF can no longer be cleared, so that is
* the moment at which any such race is best fixed.
*/
- refs = atomic_read(&eb->refs);
+ refs = refcount_read(&eb->refs);
if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
return;
spin_lock(&eb->refs_lock);
if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
- atomic_inc(&eb->refs);
+ refcount_inc(&eb->refs);
spin_unlock(&eb->refs_lock);
}
@@ -3038,7 +3067,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
eb->fs_info = fs_info;
again:
xa_lock_irq(&fs_info->buffer_tree);
- exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->sectorsize_bits,
+ exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->nodesize_bits,
NULL, eb, GFP_NOFS);
if (xa_is_err(exists)) {
ret = xa_err(exists);
@@ -3047,7 +3076,7 @@ again:
return ERR_PTR(ret);
}
if (exists) {
- if (!atomic_inc_not_zero(&exists->refs)) {
+ if (!refcount_inc_not_zero(&exists->refs)) {
/* The extent buffer is being freed, retry. */
xa_unlock_irq(&fs_info->buffer_tree);
goto again;
@@ -3092,7 +3121,7 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info,
* just overwrite folio private.
*/
exists = folio_get_private(folio);
- if (atomic_inc_not_zero(&exists->refs))
+ if (refcount_inc_not_zero(&exists->refs))
return exists;
WARN_ON(folio_test_dirty(folio));
@@ -3141,13 +3170,13 @@ static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
* The caller needs to free the existing folios and retry using the same order.
*/
static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
- struct btrfs_subpage *prealloc,
+ struct btrfs_folio_state *prealloc,
struct extent_buffer **found_eb_ret)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
- const unsigned long index = eb->start >> PAGE_SHIFT;
+ const pgoff_t index = eb->start >> PAGE_SHIFT;
struct folio *existing_folio;
int ret;
@@ -3224,7 +3253,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
int attached = 0;
struct extent_buffer *eb;
struct extent_buffer *existing_eb = NULL;
- struct btrfs_subpage *prealloc = NULL;
+ struct btrfs_folio_state *prealloc = NULL;
u64 lockdep_owner = owner_root;
bool page_contig = true;
int uptodate = 1;
@@ -3269,7 +3298,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
* manually if we exit earlier.
*/
if (btrfs_meta_is_subpage(fs_info)) {
- prealloc = btrfs_alloc_subpage(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA);
+ prealloc = btrfs_alloc_folio_state(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA);
if (IS_ERR(prealloc)) {
ret = PTR_ERR(prealloc);
goto out;
@@ -3280,7 +3309,7 @@ reallocate:
/* Allocate all pages first. */
ret = alloc_eb_folio_array(eb, true);
if (ret < 0) {
- btrfs_free_subpage(prealloc);
+ btrfs_free_folio_state(prealloc);
goto out;
}
@@ -3354,7 +3383,7 @@ reallocate:
again:
xa_lock_irq(&fs_info->buffer_tree);
existing_eb = __xa_cmpxchg(&fs_info->buffer_tree,
- start >> fs_info->sectorsize_bits, NULL, eb,
+ start >> fs_info->nodesize_bits, NULL, eb,
GFP_NOFS);
if (xa_is_err(existing_eb)) {
ret = xa_err(existing_eb);
@@ -3362,7 +3391,7 @@ again:
goto out;
}
if (existing_eb) {
- if (!atomic_inc_not_zero(&existing_eb->refs)) {
+ if (!refcount_inc_not_zero(&existing_eb->refs)) {
xa_unlock_irq(&fs_info->buffer_tree);
goto again;
}
@@ -3391,7 +3420,7 @@ again:
return eb;
out:
- WARN_ON(!atomic_dec_and_test(&eb->refs));
+ WARN_ON(!refcount_dec_and_test(&eb->refs));
/*
* Any attached folios need to be detached before we unlock them. This
@@ -3437,8 +3466,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
{
lockdep_assert_held(&eb->refs_lock);
- WARN_ON(atomic_read(&eb->refs) == 0);
- if (atomic_dec_and_test(&eb->refs)) {
+ if (refcount_dec_and_test(&eb->refs)) {
struct btrfs_fs_info *fs_info = eb->fs_info;
spin_unlock(&eb->refs_lock);
@@ -3458,7 +3486,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
* in this case.
*/
xa_cmpxchg_irq(&fs_info->buffer_tree,
- eb->start >> fs_info->sectorsize_bits, eb, NULL,
+ eb->start >> fs_info->nodesize_bits, eb, NULL,
GFP_ATOMIC);
btrfs_leak_debug_del_eb(eb);
@@ -3484,22 +3512,26 @@ void free_extent_buffer(struct extent_buffer *eb)
if (!eb)
return;
- refs = atomic_read(&eb->refs);
+ refs = refcount_read(&eb->refs);
while (1) {
- if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
- || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
- refs == 1))
+ if (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)) {
+ if (refs == 1)
+ break;
+ } else if (refs <= 3) {
break;
- if (atomic_try_cmpxchg(&eb->refs, &refs, refs - 1))
+ }
+
+ /* Optimization to avoid locking eb->refs_lock. */
+ if (atomic_try_cmpxchg(&eb->refs.refs, &refs, refs - 1))
return;
}
spin_lock(&eb->refs_lock);
- if (atomic_read(&eb->refs) == 2 &&
+ if (refcount_read(&eb->refs) == 2 &&
test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
!extent_buffer_under_io(eb) &&
test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
- atomic_dec(&eb->refs);
+ refcount_dec(&eb->refs);
/*
* I know this is terrible, but it's temporary until we stop tracking
@@ -3516,9 +3548,9 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
spin_lock(&eb->refs_lock);
set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
- if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
+ if (refcount_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
- atomic_dec(&eb->refs);
+ refcount_dec(&eb->refs);
release_extent_buffer(eb);
}
@@ -3576,7 +3608,7 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
btree_clear_folio_dirty_tag(folio);
folio_unlock(folio);
}
- WARN_ON(atomic_read(&eb->refs) == 0);
+ WARN_ON(refcount_read(&eb->refs) == 0);
}
void set_extent_buffer_dirty(struct extent_buffer *eb)
@@ -3587,7 +3619,7 @@ void set_extent_buffer_dirty(struct extent_buffer *eb)
was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
- WARN_ON(atomic_read(&eb->refs) == 0);
+ WARN_ON(refcount_read(&eb->refs) == 0);
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags));
@@ -3646,9 +3678,7 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb)
static void clear_extent_buffer_reading(struct extent_buffer *eb)
{
- clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
- smp_mb__after_atomic();
- wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
+ clear_and_wake_up_bit(EXTENT_BUFFER_READING, &eb->bflags);
}
static void end_bbio_meta_read(struct btrfs_bio *bbio)
@@ -3713,7 +3743,7 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num,
eb->read_mirror = 0;
check_buffer_tree_ref(eb);
- atomic_inc(&eb->refs);
+ refcount_inc(&eb->refs);
bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
REQ_OP_READ | REQ_META, eb->fs_info,
@@ -3725,7 +3755,7 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num,
for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio = eb->folios[i];
u64 range_start = max_t(u64, eb->start, folio_pos(folio));
- u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio),
+ u32 range_len = min_t(u64, folio_end(folio),
eb->start + eb->len) - range_start;
bio_add_folio_nofail(&bbio->bio, folio, range_len,
@@ -4104,8 +4134,8 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
* @start: offset of the bitmap item in the extent buffer
* @nr: bit number to test
*/
-int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
- unsigned long nr)
+bool extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
+ unsigned long nr)
{
unsigned long i;
size_t offset;
@@ -4296,9 +4326,9 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
{
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
struct extent_buffer *eb;
- unsigned long start = (folio_pos(folio) >> fs_info->sectorsize_bits);
+ unsigned long start = (folio_pos(folio) >> fs_info->nodesize_bits);
unsigned long index = start;
- unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1;
+ unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1;
int ret;
xa_lock_irq(&fs_info->buffer_tree);
@@ -4308,11 +4338,10 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
* won't disappear out from under us.
*/
spin_lock(&eb->refs_lock);
- if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+ if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
spin_unlock(&eb->refs_lock);
continue;
}
- xa_unlock_irq(&fs_info->buffer_tree);
/*
* If tree ref isn't set then we know the ref on this eb is a
@@ -4329,6 +4358,7 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
* check the folio private at the end. And
* release_extent_buffer() will release the refs_lock.
*/
+ xa_unlock_irq(&fs_info->buffer_tree);
release_extent_buffer(eb);
xa_lock_irq(&fs_info->buffer_tree);
}
@@ -4374,7 +4404,7 @@ int try_release_extent_buffer(struct folio *folio)
* this page.
*/
spin_lock(&eb->refs_lock);
- if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+ if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
spin_unlock(&eb->refs_lock);
spin_unlock(&folio->mapping->i_private_lock);
return 0;
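
The recurring pattern behind the lookup changes above, condensed into one sketch: buffer_tree is now indexed in nodesize units, and a lookup only succeeds if it wins the race against release_extent_buffer() by taking a reference while the count is still non-zero. RCU keeps the eb memory valid across that window, so a failed increment is simply treated as a miss:

/* Condensed sketch of the lockless extent buffer lookup pattern. */
static struct extent_buffer *eb_lookup_sketch(struct btrfs_fs_info *fs_info,
					      u64 start)
{
	struct extent_buffer *eb;

	rcu_read_lock();
	eb = xa_load(&fs_info->buffer_tree, start >> fs_info->nodesize_bits);
	/* Fails if the eb already dropped to zero and is being freed. */
	if (eb && !refcount_inc_not_zero(&eb->refs))
		eb = NULL;
	rcu_read_unlock();
	return eb;
}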
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index e36e8d6a00bc..61130786b9a3 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -98,7 +98,7 @@ struct extent_buffer {
void *addr;
spinlock_t refs_lock;
- atomic_t refs;
+ refcount_t refs;
int read_mirror;
/* >= 0 if eb belongs to a log tree, -1 otherwise */
s8 log_index;
@@ -345,8 +345,8 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
unsigned long len);
void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
unsigned long len);
-int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
- unsigned long pos);
+bool extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
+ unsigned long pos);
void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
unsigned long pos, unsigned long len);
void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
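
The atomic_t to refcount_t switch for eb->refs maps one-to-one onto the standard <linux/refcount.h> API:

/*
 *	atomic_set(&eb->refs, 1)        ->  refcount_set(&eb->refs, 1)
 *	atomic_read(&eb->refs)          ->  refcount_read(&eb->refs)
 *	atomic_inc(&eb->refs)           ->  refcount_inc(&eb->refs)
 *	atomic_inc_not_zero(&eb->refs)  ->  refcount_inc_not_zero(&eb->refs)
 *	atomic_dec(&eb->refs)           ->  refcount_dec(&eb->refs)
 *	atomic_dec_and_test(&eb->refs)  ->  refcount_dec_and_test(&eb->refs)
 *
 * refcount_t warns on underflow and on 0 -> 1 increments, which is why the
 * explicit WARN_ON(atomic_read(&eb->refs) == 0) before the final decrement
 * could be dropped. The one call with no refcount_t equivalent, the cmpxchg
 * fast path in free_extent_buffer(), reaches into the raw counter via
 * atomic_try_cmpxchg(&eb->refs.refs, ...).
 */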
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 02bfdb976e40..57f52585a6dd 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -84,7 +84,7 @@ static void remove_em(struct btrfs_inode *inode, struct extent_map *em)
rb_erase(&em->rb_node, &inode->extent_tree.root);
RB_CLEAR_NODE(&em->rb_node);
- if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root)))
+ if (!btrfs_is_testing(fs_info) && btrfs_is_fstree(btrfs_root_id(inode->root)))
percpu_counter_dec(&fs_info->evictable_extent_maps);
}
@@ -502,7 +502,7 @@ static int add_extent_mapping(struct btrfs_inode *inode,
setup_extent_mapping(inode, em, modified);
- if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(root)))
+ if (!btrfs_is_testing(fs_info) && btrfs_is_fstree(btrfs_root_id(root)))
percpu_counter_inc(&fs_info->evictable_extent_maps);
return 0;
@@ -1337,7 +1337,7 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work)
if (!root)
continue;
- if (is_fstree(btrfs_root_id(root)))
+ if (btrfs_is_fstree(btrfs_root_id(root)))
nr_dropped += btrfs_scan_root(root, &ctx);
btrfs_put_root(root);
diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c
index 43bf0979fd53..7935586a9dbd 100644
--- a/fs/btrfs/fiemap.c
+++ b/fs/btrfs/fiemap.c
@@ -320,7 +320,7 @@ static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *p
* the cost of allocating a new one.
*/
ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags));
- atomic_inc(&clone->refs);
+ refcount_inc(&clone->refs);
ret = btrfs_next_leaf(inode->root, path);
if (ret != 0)
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 54d523d4f421..c09fbc257634 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -427,7 +427,7 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
memset(csum_dst, 0, csum_size);
count = 1;
- if (btrfs_root_id(inode->root) == BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ if (btrfs_is_data_reloc_root(inode->root)) {
u64 file_offset = bbio->file_offset + bio_offset;
btrfs_set_extent_bit(&inode->io_tree, file_offset,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8ce6f45f45e0..204674934795 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -89,8 +89,7 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos
num_bytes = round_up(write_bytes + pos - start_pos,
fs_info->sectorsize);
ASSERT(num_bytes <= U32_MAX);
- ASSERT(folio_pos(folio) <= pos &&
- folio_pos(folio) + folio_size(folio) >= pos + write_bytes);
+ ASSERT(folio_pos(folio) <= pos && folio_end(folio) >= pos + write_bytes);
end_of_last_block = start_pos + num_bytes - 1;
@@ -801,7 +800,7 @@ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64
u64 len)
{
u64 clamp_start = max_t(u64, pos, folio_pos(folio));
- u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio));
+ u64 clamp_end = min_t(u64, pos + len, folio_end(folio));
const u32 blocksize = inode_to_fs_info(inode)->sectorsize;
int ret = 0;
@@ -857,7 +856,7 @@ static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_
loff_t pos, size_t write_bytes,
bool nowait)
{
- unsigned long index = pos >> PAGE_SHIFT;
+ const pgoff_t index = pos >> PAGE_SHIFT;
gfp_t mask = get_prepare_gfp_flags(inode, nowait);
fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) |
fgf_set_order(write_bytes);
@@ -963,6 +962,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
* @pos: File offset.
* @write_bytes: The length to write, will be updated to the nocow writeable
* range.
+ * @nowait: True if we are in a non-blocking IO context and must not block.
*
* This function will flush ordered extents in the range to ensure proper
* nocow checks.
@@ -971,7 +971,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
* > 0 If we can nocow, and updates @write_bytes.
* 0 If we can't do a nocow write.
* -EAGAIN If we can't do a nocow write because snapshotting of the inode's
- * root is in progress.
+ * root is in progress or because we are in a non-blocking IO
+ * context and would need to block (@nowait is true).
* < 0 If an error happened.
*
* NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
@@ -983,8 +984,8 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
struct btrfs_root *root = inode->root;
struct extent_state *cached_state = NULL;
u64 lockstart, lockend;
- u64 num_bytes;
- int ret;
+ u64 cur_offset;
+ int ret = 0;
if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
return 0;
@@ -995,7 +996,6 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
lockstart = round_down(pos, fs_info->sectorsize);
lockend = round_up(pos + *write_bytes,
fs_info->sectorsize) - 1;
- num_bytes = lockend - lockstart + 1;
if (nowait) {
if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
@@ -1007,14 +1007,36 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
&cached_state);
}
- ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, nowait);
- if (ret <= 0)
- btrfs_drew_write_unlock(&root->snapshot_lock);
- else
- *write_bytes = min_t(size_t, *write_bytes ,
- num_bytes - pos + lockstart);
+
+ cur_offset = lockstart;
+ while (cur_offset < lockend) {
+ u64 num_bytes = lockend - cur_offset + 1;
+
+ ret = can_nocow_extent(inode, cur_offset, &num_bytes, NULL, nowait);
+ if (ret <= 0) {
+ /*
+ * If cur_offset == lockstart it means we haven't found
+ * any extent against which we can NOCOW, so unlock the
+ * snapshot lock.
+ */
+ if (cur_offset == lockstart)
+ btrfs_drew_write_unlock(&root->snapshot_lock);
+ break;
+ }
+ cur_offset += num_bytes;
+ }
+
btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+ /*
+ * cur_offset > lockstart means there's at least a partial range we can
+ * NOCOW, and that range can cover one or more extents.
+ */
+ if (cur_offset > lockstart) {
+ *write_bytes = min_t(size_t, *write_bytes, cur_offset - pos);
+ return 1;
+ }
+
return ret;
}
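
A worked example of the new loop, assuming a hypothetical extent layout:

/*
 * pos = lockstart = 0, lockend = 256K - 1; extent [0, 128K) can be
 * written NOCOW, extent [128K, 256K) is shared and must be COWed.
 *
 *	pass 1: cur_offset = 0,    can_nocow_extent() = 1, num_bytes = 128K
 *	pass 2: cur_offset = 128K, can_nocow_extent() = 0, break
 *
 * cur_offset (128K) > lockstart, so we return 1 with *write_bytes clamped
 * to 128K and the snapshot lock still held. The old code called
 * can_nocow_extent() only once, so it could never extend the NOCOW range
 * across several adjacent extents.
 */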
@@ -1233,8 +1255,8 @@ again:
* The reserved range goes beyond the current folio, shrink the reserved
* space to the folio boundary.
*/
- if (reserved_start + reserved_len > folio_pos(folio) + folio_size(folio)) {
- const u64 last_block = folio_pos(folio) + folio_size(folio);
+ if (reserved_start + reserved_len > folio_end(folio)) {
+ const u64 last_block = folio_end(folio);
shrink_reserved_space(inode, *data_reserved, reserved_start,
reserved_len, last_block - reserved_start,
@@ -1832,9 +1854,9 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
{
struct page *page = vmf->page;
struct folio *folio = page_folio(page);
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_inode *inode = BTRFS_I(file_inode(vmf->vma->vm_file));
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
@@ -1842,6 +1864,7 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
loff_t size;
size_t fsize = folio_size(folio);
int ret;
+ bool only_release_metadata = false;
u64 reserved_space;
u64 page_start;
u64 page_end;
@@ -1849,7 +1872,7 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
reserved_space = fsize;
- sb_start_pagefault(inode->i_sb);
+ sb_start_pagefault(inode->vfs_inode.i_sb);
page_start = folio_pos(folio);
page_end = page_start + folio_size(folio) - 1;
end = page_end;
@@ -1862,20 +1885,43 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
* end up waiting indefinitely to get a lock on the page currently
* being processed by btrfs_page_mkwrite() function.
*/
- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- page_start, reserved_space);
- if (ret < 0)
+ ret = btrfs_check_data_free_space(inode, &data_reserved, page_start,
+ reserved_space, false);
+ if (ret < 0) {
+ size_t write_bytes = reserved_space;
+
+ if (btrfs_check_nocow_lock(inode, page_start, &write_bytes, false) <= 0)
+ goto out_noreserve;
+
+ only_release_metadata = true;
+
+ /*
+ * Can't write the whole range: there may be shared extents or
+ * holes in it. Bail out with @only_release_metadata set to true
+ * so that we unlock the nocow lock before returning the error.
+ */
+ if (write_bytes < reserved_space)
+ goto out_noreserve;
+ }
+ ret = btrfs_delalloc_reserve_metadata(inode, reserved_space,
+ reserved_space, false);
+ if (ret < 0) {
+ if (!only_release_metadata)
+ btrfs_free_reserved_data_space(inode, data_reserved,
+ page_start, reserved_space);
goto out_noreserve;
+ }
ret = file_update_time(vmf->vma->vm_file);
if (ret < 0)
goto out;
again:
- down_read(&BTRFS_I(inode)->i_mmap_lock);
+ down_read(&inode->i_mmap_lock);
folio_lock(folio);
- size = i_size_read(inode);
+ size = i_size_read(&inode->vfs_inode);
- if ((folio->mapping != inode->i_mapping) ||
+ if ((folio->mapping != inode->vfs_inode.i_mapping) ||
(page_start >= size)) {
/* Page got truncated out from underneath us. */
goto out_unlock;
@@ -1893,11 +1939,11 @@ again:
* We can't set the delalloc bits if there are pending ordered
* extents. Drop our locks and wait for them to finish.
*/
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, fsize);
+ ordered = btrfs_lookup_ordered_range(inode, page_start, fsize);
if (ordered) {
btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
folio_unlock(folio);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
+ up_read(&inode->i_mmap_lock);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
goto again;
@@ -1906,10 +1952,14 @@ again:
if (folio_contains(folio, (size - 1) >> PAGE_SHIFT)) {
reserved_space = round_up(size - page_start, fs_info->sectorsize);
if (reserved_space < fsize) {
+ const u64 to_free = fsize - reserved_space;
+
end = page_start + reserved_space - 1;
- btrfs_delalloc_release_space(BTRFS_I(inode),
- data_reserved, end + 1,
- fsize - reserved_space, true);
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode, to_free, true);
+ else
+ btrfs_delalloc_release_space(inode, data_reserved,
+ end + 1, to_free, true);
}
}
@@ -1920,12 +1970,11 @@ again:
* clear any delalloc bits within this page range since we have to
* reserve data&meta space before lock_page() (see above comments).
*/
- btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
+ btrfs_clear_extent_bit(io_tree, page_start, end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, &cached_state);
- ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
- &cached_state);
+ ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state);
if (ret < 0) {
btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
goto out_unlock;
@@ -1944,26 +1993,38 @@ again:
btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
- btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
+ btrfs_set_inode_last_sub_trans(inode);
+
+ if (only_release_metadata)
+ btrfs_set_extent_bit(io_tree, page_start, end, EXTENT_NORESERVE,
+ &cached_state);
btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
+ up_read(&inode->i_mmap_lock);
- btrfs_delalloc_release_extents(BTRFS_I(inode), fsize);
- sb_end_pagefault(inode->i_sb);
+ btrfs_delalloc_release_extents(inode, fsize);
+ if (only_release_metadata)
+ btrfs_check_nocow_unlock(inode);
+ sb_end_pagefault(inode->vfs_inode.i_sb);
extent_changeset_free(data_reserved);
return VM_FAULT_LOCKED;
out_unlock:
folio_unlock(folio);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
+ up_read(&inode->i_mmap_lock);
out:
- btrfs_delalloc_release_extents(BTRFS_I(inode), fsize);
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
- reserved_space, true);
+ btrfs_delalloc_release_extents(inode, fsize);
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode, reserved_space, true);
+ else
+ btrfs_delalloc_release_space(inode, data_reserved, page_start,
+ reserved_space, true);
extent_changeset_free(data_reserved);
out_noreserve:
- sb_end_pagefault(inode->i_sb);
+ if (only_release_metadata)
+ btrfs_check_nocow_unlock(inode);
+
+ sb_end_pagefault(inode->vfs_inode.i_sb);
if (ret < 0)
return vmf_error(ret);
@@ -1978,15 +2039,16 @@ static const struct vm_operations_struct btrfs_file_vm_ops = {
.page_mkwrite = btrfs_page_mkwrite,
};
-static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
+static int btrfs_file_mmap_prepare(struct vm_area_desc *desc)
{
+ struct file *filp = desc->file;
struct address_space *mapping = filp->f_mapping;
if (!mapping->a_ops->read_folio)
return -ENOEXEC;
file_accessed(filp);
- vma->vm_ops = &btrfs_file_vm_ops;
+ desc->vm_ops = &btrfs_file_vm_ops;
return 0;
}
@@ -2195,7 +2257,7 @@ static bool check_range_has_page(struct inode *inode, u64 start, u64 end)
if (folio->index < start_index)
continue;
/* A large folio extends beyond the end. Not a target. */
- if (folio->index + folio_nr_pages(folio) > end_index)
+ if (folio_next_index(folio) > end_index)
continue;
/* A folio doesn't cover the head/tail index. Found a target. */
ret = true;
@@ -2341,7 +2403,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
struct btrfs_trans_handle *trans = NULL;
- struct btrfs_block_rsv *rsv;
+ struct btrfs_block_rsv rsv;
unsigned int rsv_count;
u64 cur_offset;
u64 len = end - start;
@@ -2350,13 +2412,9 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
if (end <= start)
return -EINVAL;
- rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
- if (!rsv) {
- ret = -ENOMEM;
- goto out;
- }
- rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
- rsv->failfast = true;
+ btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
+ rsv.size = btrfs_calc_insert_metadata_size(fs_info, 1);
+ rsv.failfast = true;
/*
* 1 - update the inode
@@ -2373,14 +2431,14 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
- goto out_free;
+ goto out_release;
}
- ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
+ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv,
min_size, false);
if (WARN_ON(ret))
goto out_trans;
- trans->block_rsv = rsv;
+ trans->block_rsv = &rsv;
cur_offset = start;
drop_args.path = path;
@@ -2496,10 +2554,10 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
}
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
- rsv, min_size, false);
+ &rsv, min_size, false);
if (WARN_ON(ret))
break;
- trans->block_rsv = rsv;
+ trans->block_rsv = &rsv;
cur_offset = drop_args.drop_end;
len = end - cur_offset;
@@ -2576,16 +2634,15 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
out_trans:
if (!trans)
- goto out_free;
+ goto out_release;
trans->block_rsv = &fs_info->trans_block_rsv;
if (ret)
btrfs_end_transaction(trans);
else
*trans_out = trans;
-out_free:
- btrfs_free_block_rsv(fs_info, rsv);
-out:
+out_release:
+ btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL);
return ret;
}
@@ -3765,7 +3822,7 @@ const struct file_operations btrfs_file_operations = {
.splice_read = filemap_splice_read,
.write_iter = btrfs_file_write_iter,
.splice_write = iter_file_splice_write,
- .mmap = btrfs_file_mmap,
+ .mmap_prepare = btrfs_file_mmap_prepare,
.open = btrfs_file_open,
.release = btrfs_release_file,
.get_unmapped_area = thp_get_unmapped_area,
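
The ->mmap to ->mmap_prepare switch follows the newer VFS convention where the hook runs before the VMA is created; a sketch of the conversion shape (desc->vm_flags is assumed available for handlers that need to adjust flags):

/*
 *	Before:  .mmap         = btrfs_file_mmap          (struct file *, struct vm_area_struct *)
 *	After:   .mmap_prepare = btrfs_file_mmap_prepare  (struct vm_area_desc *)
 *
 * With ->mmap_prepare there is no vm_area_struct to mutate yet: the handler
 * validates the mapping and fills in desc->vm_ops (and, where needed,
 * desc->vm_flags), and the VFS builds the VMA from the descriptor afterwards.
 */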
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4b34ea1f01c2..5d8d1570a5c9 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -366,7 +366,7 @@ fail:
static void readahead_cache(struct inode *inode)
{
struct file_ra_state ra;
- unsigned long last_index;
+ pgoff_t last_index;
file_ra_state_init(&ra, inode->i_mapping);
last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
@@ -3192,7 +3192,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
u64 *max_extent_size)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
- int err;
+ int ret2;
u64 search_start = cluster->window_start;
u64 search_bytes = bytes;
u64 ret = 0;
@@ -3200,8 +3200,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
search_start = min_start;
search_bytes = bytes;
- err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
- if (err) {
+ ret2 = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
+ if (ret2) {
*max_extent_size = max(get_max_extent_size(entry),
*max_extent_size);
return 0;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 0c573d46639a..eba7f22ae49c 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -35,7 +35,7 @@ static struct btrfs_root *btrfs_free_space_root(
return btrfs_global_root(block_group->fs_info, &key);
}
-void set_free_space_tree_thresholds(struct btrfs_block_group *cache)
+void btrfs_set_free_space_tree_thresholds(struct btrfs_block_group *cache)
{
u32 bitmap_range;
size_t bitmap_size;
@@ -82,22 +82,19 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*info));
if (ret)
- goto out;
+ return ret;
leaf = path->nodes[0];
info = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_free_space_info);
btrfs_set_free_space_extent_count(leaf, info, 0);
btrfs_set_free_space_flags(leaf, info, 0);
-
- ret = 0;
-out:
btrfs_release_path(path);
- return ret;
+ return 0;
}
EXPORT_FOR_TESTS
-struct btrfs_free_space_info *search_free_space_info(
+struct btrfs_free_space_info *btrfs_search_free_space_info(
struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path, int cow)
@@ -201,9 +198,9 @@ static void le_bitmap_set(unsigned long *map, unsigned int start, int len)
}
EXPORT_FOR_TESTS
-int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path)
+int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = btrfs_free_space_root(block_group);
@@ -281,7 +278,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
}
- info = search_free_space_info(trans, block_group, path, 1);
+ info = btrfs_search_free_space_info(trans, block_group, path, 1);
if (IS_ERR(info)) {
ret = PTR_ERR(info);
btrfs_abort_transaction(trans, ret);
@@ -290,6 +287,8 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
flags = btrfs_free_space_flags(leaf, info);
flags |= BTRFS_FREE_SPACE_USING_BITMAPS;
+ block_group->using_free_space_bitmaps = true;
+ block_group->using_free_space_bitmaps_cached = true;
btrfs_set_free_space_flags(leaf, info, flags);
expected_extent_count = btrfs_free_space_extent_count(leaf, info);
btrfs_release_path(path);
@@ -343,9 +342,9 @@ out:
}
EXPORT_FOR_TESTS
-int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path)
+int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = btrfs_free_space_root(block_group);
@@ -409,12 +408,12 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
data_size = free_space_bitmap_size(fs_info,
found_key.offset);
- ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1);
+ path->slots[0]--;
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
read_extent_buffer(leaf, bitmap_cursor, ptr,
data_size);
nr++;
- path->slots[0]--;
} else {
ASSERT(0);
}
@@ -428,7 +427,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
}
- info = search_free_space_info(trans, block_group, path, 1);
+ info = btrfs_search_free_space_info(trans, block_group, path, 1);
if (IS_ERR(info)) {
ret = PTR_ERR(info);
btrfs_abort_transaction(trans, ret);
@@ -437,20 +436,22 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
flags = btrfs_free_space_flags(leaf, info);
flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS;
+ block_group->using_free_space_bitmaps = false;
+ block_group->using_free_space_bitmaps_cached = true;
btrfs_set_free_space_flags(leaf, info, flags);
expected_extent_count = btrfs_free_space_extent_count(leaf, info);
btrfs_release_path(path);
- nrbits = block_group->length >> block_group->fs_info->sectorsize_bits;
+ nrbits = block_group->length >> fs_info->sectorsize_bits;
start_bit = find_next_bit_le(bitmap, nrbits, 0);
while (start_bit < nrbits) {
end_bit = find_next_zero_bit_le(bitmap, nrbits, start_bit);
ASSERT(start_bit < end_bit);
- key.objectid = start + start_bit * block_group->fs_info->sectorsize;
+ key.objectid = start + start_bit * fs_info->sectorsize;
key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
- key.offset = (end_bit - start_bit) * block_group->fs_info->sectorsize;
+ key.offset = (end_bit - start_bit) * fs_info->sectorsize;
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
if (ret) {
@@ -493,11 +494,10 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
if (new_extents == 0)
return 0;
- info = search_free_space_info(trans, block_group, path, 1);
- if (IS_ERR(info)) {
- ret = PTR_ERR(info);
- goto out;
- }
+ info = btrfs_search_free_space_info(trans, block_group, path, 1);
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+
flags = btrfs_free_space_flags(path->nodes[0], info);
extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
@@ -507,19 +507,18 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
extent_count > block_group->bitmap_high_thresh) {
- ret = convert_free_space_to_bitmaps(trans, block_group, path);
+ ret = btrfs_convert_free_space_to_bitmaps(trans, block_group, path);
} else if ((flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
extent_count < block_group->bitmap_low_thresh) {
- ret = convert_free_space_to_extents(trans, block_group, path);
+ ret = btrfs_convert_free_space_to_extents(trans, block_group, path);
}
-out:
return ret;
}
EXPORT_FOR_TESTS
-int free_space_test_bit(struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 offset)
+bool btrfs_free_space_test_bit(struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 offset)
{
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -537,13 +536,13 @@ int free_space_test_bit(struct btrfs_block_group *block_group,
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
i = div_u64(offset - found_start,
block_group->fs_info->sectorsize);
- return !!extent_buffer_test_bit(leaf, ptr, i);
+ return extent_buffer_test_bit(leaf, ptr, i);
}
-static void free_space_set_bits(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 *start, u64 *size,
- int bit)
+static void free_space_modify_bits(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 *start, u64 *size,
+ bool set_bits)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct extent_buffer *leaf;
@@ -567,7 +566,7 @@ static void free_space_set_bits(struct btrfs_trans_handle *trans,
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
first = (*start - found_start) >> fs_info->sectorsize_bits;
last = (end - found_start) >> fs_info->sectorsize_bits;
- if (bit)
+ if (set_bits)
extent_buffer_bitmap_set(leaf, ptr, first, last - first);
else
extent_buffer_bitmap_clear(leaf, ptr, first, last - first);
@@ -611,13 +610,14 @@ static int free_space_next_bitmap(struct btrfs_trans_handle *trans,
static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path,
- u64 start, u64 size, int remove)
+ u64 start, u64 size, bool remove)
{
struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_key key;
u64 end = start + size;
u64 cur_start, cur_size;
- int prev_bit, next_bit;
+ bool prev_bit_set = false;
+ bool next_bit_set = false;
int new_extents;
int ret;
@@ -634,16 +634,16 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1);
if (ret)
- goto out;
+ return ret;
- prev_bit = free_space_test_bit(block_group, path, prev_block);
+ prev_bit_set = btrfs_free_space_test_bit(block_group, path, prev_block);
/* The previous block may have been in the previous bitmap. */
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (start >= key.objectid + key.offset) {
ret = free_space_next_bitmap(trans, root, path);
if (ret)
- goto out;
+ return ret;
}
} else {
key.objectid = start;
@@ -652,9 +652,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1);
if (ret)
- goto out;
-
- prev_bit = -1;
+ return ret;
}
/*
@@ -664,13 +662,13 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
cur_start = start;
cur_size = size;
while (1) {
- free_space_set_bits(trans, block_group, path, &cur_start, &cur_size,
- !remove);
+ free_space_modify_bits(trans, block_group, path, &cur_start,
+ &cur_size, !remove);
if (cur_size == 0)
break;
ret = free_space_next_bitmap(trans, root, path);
if (ret)
- goto out;
+ return ret;
}
/*
@@ -683,42 +681,36 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
if (end >= key.objectid + key.offset) {
ret = free_space_next_bitmap(trans, root, path);
if (ret)
- goto out;
+ return ret;
}
- next_bit = free_space_test_bit(block_group, path, end);
- } else {
- next_bit = -1;
+ next_bit_set = btrfs_free_space_test_bit(block_group, path, end);
}
if (remove) {
new_extents = -1;
- if (prev_bit == 1) {
+ if (prev_bit_set) {
/* Leftover on the left. */
new_extents++;
}
- if (next_bit == 1) {
+ if (next_bit_set) {
/* Leftover on the right. */
new_extents++;
}
} else {
new_extents = 1;
- if (prev_bit == 1) {
+ if (prev_bit_set) {
/* Merging with neighbor on the left. */
new_extents--;
}
- if (next_bit == 1) {
+ if (next_bit_set) {
/* Merging with neighbor on the right. */
new_extents--;
}
}
btrfs_release_path(path);
- ret = update_free_space_extent_count(trans, block_group, path,
- new_extents);
-
-out:
- return ret;
+ return update_free_space_extent_count(trans, block_group, path, new_extents);
}
static int remove_free_space_extent(struct btrfs_trans_handle *trans,
@@ -739,7 +731,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans,
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
if (ret)
- goto out;
+ return ret;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -771,7 +763,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans,
/* Delete the existing key (cases 1-4). */
ret = btrfs_del_item(trans, root, path);
if (ret)
- goto out;
+ return ret;
/* Add a key for leftovers at the beginning (cases 3 and 4). */
if (start > found_start) {
@@ -782,7 +774,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
if (ret)
- goto out;
+ return ret;
new_extents++;
}
@@ -795,50 +787,58 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
if (ret)
- goto out;
+ return ret;
new_extents++;
}
btrfs_release_path(path);
- ret = update_free_space_extent_count(trans, block_group, path,
- new_extents);
-
-out:
- return ret;
+ return update_free_space_extent_count(trans, block_group, path, new_extents);
}
-EXPORT_FOR_TESTS
-int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 start, u64 size)
+static int using_bitmaps(struct btrfs_block_group *bg, struct btrfs_path *path)
{
struct btrfs_free_space_info *info;
u32 flags;
- int ret;
- if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
- ret = __add_block_group_free_space(trans, block_group, path);
- if (ret)
- return ret;
- }
+ if (bg->using_free_space_bitmaps_cached)
+ return bg->using_free_space_bitmaps;
- info = search_free_space_info(NULL, block_group, path, 0);
+ info = btrfs_search_free_space_info(NULL, bg, path, 0);
if (IS_ERR(info))
return PTR_ERR(info);
flags = btrfs_free_space_flags(path->nodes[0], info);
btrfs_release_path(path);
- if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+ bg->using_free_space_bitmaps = (flags & BTRFS_FREE_SPACE_USING_BITMAPS);
+ bg->using_free_space_bitmaps_cached = true;
+
+ return bg->using_free_space_bitmaps;
+}
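/*
 * Usage sketch: using_bitmaps() returns < 0 on error, otherwise a cached
 * boolean. The cache stays coherent because the two converters above set
 * using_free_space_bitmaps{,_cached} whenever they flip
 * BTRFS_FREE_SPACE_USING_BITMAPS on disk, so the free space info item has
 * to be searched at most once per block group.
 */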
+
+EXPORT_FOR_TESTS
+int __btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 start, u64 size)
+{
+ int ret;
+
+ ret = __add_block_group_free_space(trans, block_group, path);
+ if (ret)
+ return ret;
+
+ ret = using_bitmaps(block_group, path);
+ if (ret < 0)
+ return ret;
+
+ if (ret)
return modify_free_space_bitmap(trans, block_group, path,
- start, size, 1);
- } else {
- return remove_free_space_extent(trans, block_group, path,
- start, size);
- }
+ start, size, true);
+
+ return remove_free_space_extent(trans, block_group, path, start, size);
}
-int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- u64 start, u64 size)
+int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ u64 start, u64 size)
{
struct btrfs_block_group *block_group;
struct btrfs_path *path;
@@ -863,8 +863,7 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
}
mutex_lock(&block_group->free_space_lock);
- ret = __remove_from_free_space_tree(trans, block_group, path, start,
- size);
+ ret = __btrfs_remove_from_free_space_tree(trans, block_group, path, start, size);
mutex_unlock(&block_group->free_space_lock);
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -918,7 +917,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans,
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
if (ret)
- goto out;
+ return ret;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -941,7 +940,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans,
if (found_end == start) {
ret = btrfs_del_item(trans, root, path);
if (ret)
- goto out;
+ return ret;
new_key.objectid = found_start;
new_key.offset += key.offset;
new_extents--;
@@ -958,7 +957,7 @@ right:
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
if (ret)
- goto out;
+ return ret;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -982,7 +981,7 @@ right:
if (found_start == end) {
ret = btrfs_del_item(trans, root, path);
if (ret)
- goto out;
+ return ret;
new_key.offset += key.offset;
new_extents--;
}
@@ -992,48 +991,36 @@ insert:
/* Insert the new key (cases 1-4). */
ret = btrfs_insert_empty_item(trans, root, path, &new_key, 0);
if (ret)
- goto out;
+ return ret;
btrfs_release_path(path);
- ret = update_free_space_extent_count(trans, block_group, path,
- new_extents);
-
-out:
- return ret;
+ return update_free_space_extent_count(trans, block_group, path, new_extents);
}
EXPORT_FOR_TESTS
-int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 start, u64 size)
+int __btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 start, u64 size)
{
- struct btrfs_free_space_info *info;
- u32 flags;
int ret;
- if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
- ret = __add_block_group_free_space(trans, block_group, path);
- if (ret)
- return ret;
- }
+ ret = __add_block_group_free_space(trans, block_group, path);
+ if (ret)
+ return ret;
- info = search_free_space_info(NULL, block_group, path, 0);
- if (IS_ERR(info))
- return PTR_ERR(info);
- flags = btrfs_free_space_flags(path->nodes[0], info);
- btrfs_release_path(path);
+ ret = using_bitmaps(block_group, path);
+ if (ret < 0)
+ return ret;
- if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+ if (ret)
return modify_free_space_bitmap(trans, block_group, path,
- start, size, 0);
- } else {
- return add_free_space_extent(trans, block_group, path, start,
- size);
- }
+ start, size, false);
+
+ return add_free_space_extent(trans, block_group, path, start, size);
}
-int add_to_free_space_tree(struct btrfs_trans_handle *trans,
- u64 start, u64 size)
+int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ u64 start, u64 size)
{
struct btrfs_block_group *block_group;
struct btrfs_path *path;
@@ -1058,7 +1045,7 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans,
}
mutex_lock(&block_group->free_space_lock);
- ret = __add_to_free_space_tree(trans, block_group, path, start, size);
+ ret = __btrfs_add_to_free_space_tree(trans, block_group, path, start, size);
mutex_unlock(&block_group->free_space_lock);
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -1115,11 +1102,21 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0);
if (ret < 0)
goto out_locked;
- ASSERT(ret == 0);
+ /*
+ * If ret is 1 (no key found), this is an empty block group: no
+ * extents are allocated from it, and there is no block group item
+ * (key BTRFS_BLOCK_GROUP_ITEM_KEY) in the extent tree because the
+ * block group tree feature is enabled, which stores block group
+ * items in the block group tree instead. It also means no extents
+ * are allocated for block groups with a start offset beyond this
+ * block group's end offset (this is the last, highest block group).
+ */
+ if (!btrfs_fs_compat_ro(trans->fs_info, BLOCK_GROUP_TREE))
+ ASSERT(ret == 0);
start = block_group->start;
end = block_group->start + block_group->length;
- while (1) {
+ while (ret == 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.type == BTRFS_EXTENT_ITEM_KEY ||
@@ -1128,11 +1125,11 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
break;
if (start < key.objectid) {
- ret = __add_to_free_space_tree(trans,
- block_group,
- path2, start,
- key.objectid -
- start);
+ ret = __btrfs_add_to_free_space_tree(trans,
+ block_group,
+ path2, start,
+ key.objectid -
+ start);
if (ret)
goto out_locked;
}
@@ -1149,12 +1146,10 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
ret = btrfs_next_item(extent_root, path);
if (ret < 0)
goto out_locked;
- if (ret)
- break;
}
if (start < end) {
- ret = __add_to_free_space_tree(trans, block_group, path2,
- start, end - start);
+ ret = __btrfs_add_to_free_space_tree(trans, block_group, path2,
+ start, end - start);
if (ret)
goto out_locked;
}
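
The loop rewrite leans on the search/iterate return contract: 0 means positioned on an item, 1 means no further items, negative is an error. Folding the 1 into the loop condition lets the explicit break after btrfs_next_item() go away. Schematically, with hypothetical search(), process_item() and next_item():

	int ret;

	ret = search();			/* 0 = on an item, 1 = none, <0 = error */
	if (ret < 0)
		goto out;
	while (ret == 0) {
		process_item();
		ret = next_item();	/* same return contract as the search */
		if (ret < 0)
			goto out;
	}
	/* falls out cleanly when ret == 1 (iteration exhausted) */
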
@@ -1233,6 +1228,7 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans,
{
BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
+ struct rb_node *node;
int nr;
int ret;
@@ -1261,6 +1257,16 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
}
+ node = rb_first_cached(&trans->fs_info->block_group_cache_tree);
+ while (node) {
+ struct btrfs_block_group *bg;
+
+ bg = rb_entry(node, struct btrfs_block_group, cache_node);
+ clear_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &bg->runtime_flags);
+ node = rb_next(node);
+ cond_resched();
+ }
+
return 0;
}
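
This new loop, like the rebuild walk just below, traverses fs_info->block_group_cache_tree with the cached-rbtree iteration idiom, calling cond_resched() so a filesystem with many block groups does not hog the CPU. The bare pattern, as a kernel-style sketch with a hypothetical visit() standing in for the per-group work:

	/* Kernel-style fragment; visit() is a placeholder. */
	static void for_each_block_group(struct btrfs_fs_info *fs_info)
	{
		struct rb_node *node;

		for (node = rb_first_cached(&fs_info->block_group_cache_tree);
		     node; node = rb_next(node)) {
			struct btrfs_block_group *bg;

			bg = rb_entry(node, struct btrfs_block_group, cache_node);
			visit(bg);
			cond_resched();
		}
	}
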
@@ -1350,12 +1356,18 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
block_group = rb_entry(node, struct btrfs_block_group,
cache_node);
+
+ if (test_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED,
+ &block_group->runtime_flags))
+ goto next;
+
ret = populate_free_space_tree(trans, block_group);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
}
+next:
if (btrfs_should_end_transaction(trans)) {
btrfs_end_transaction(trans);
trans = btrfs_start_transaction(free_space_root, 1);
@@ -1378,51 +1390,79 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
+ bool own_path = false;
int ret;
- clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags);
+ if (!test_and_clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
+ &block_group->runtime_flags))
+ return 0;
+
+ /*
+ * While rebuilding the free space tree we may allocate new metadata
+ * block groups as a side effect of modifying the tree.
+ *
+ * Because during the rebuild (at btrfs_rebuild_free_space_tree()) we
+ * can use multiple transactions, every time btrfs_end_transaction() is
+ * called at btrfs_rebuild_free_space_tree() we finish the creation of
+ * new block groups by calling btrfs_create_pending_block_groups(), and
+ * that in turn calls us, through add_block_group_free_space(), to add
+ * a free space info item and a free space extent item for the block
+ * group.
+ *
+ * Then later btrfs_rebuild_free_space_tree() may find such new block
+ * groups and process them with populate_free_space_tree(), which can
+ * fail with EEXIST since there are already items for the block group in
+ * the free space tree. Notice that we say "may find" because a new
+ * block group may be added to the block groups rbtree at a node before
+ * or after the block group currently being processed by the rebuild
+ * process. So signal the rebuild process to skip such new block groups
+ * if it finds them.
+ */
+ set_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &block_group->runtime_flags);
+
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (!path) {
+ btrfs_abort_transaction(trans, -ENOMEM);
+ return -ENOMEM;
+ }
+ own_path = true;
+ }
ret = add_new_free_space_info(trans, block_group, path);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = __btrfs_add_to_free_space_tree(trans, block_group, path,
+ block_group->start, block_group->length);
if (ret)
- return ret;
+ btrfs_abort_transaction(trans, ret);
- return __add_to_free_space_tree(trans, block_group, path,
- block_group->start,
- block_group->length);
+out:
+ if (own_path)
+ btrfs_free_path(path);
+
+ return ret;
}
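
__add_block_group_free_space() now combines two idioms: test_and_clear_bit() makes the check and the clear of NEEDS_FREE_SPACE one atomic step, so only one caller can claim the work, and the path argument becomes optional, with the function allocating and freeing its own path when the caller passes NULL. A minimal sketch of the combined pattern, where my_object, MY_FLAG_NEEDS_WORK and do_the_work() are hypothetical names:

	/* Illustrative only; names are invented. */
	static int do_once_with_path(struct btrfs_trans_handle *trans,
				     struct my_object *obj, struct btrfs_path *path)
	{
		bool own_path = false;
		int ret;

		/* Atomically claim the one-time work; a racing caller sees 0. */
		if (!test_and_clear_bit(MY_FLAG_NEEDS_WORK, &obj->runtime_flags))
			return 0;

		if (!path) {
			path = btrfs_alloc_path();
			if (!path)
				return -ENOMEM;
			own_path = true;
		}

		ret = do_the_work(trans, obj, path);

		if (own_path)
			btrfs_free_path(path);
		return ret;
	}
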
-int add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group)
+int btrfs_add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_path *path = NULL;
- int ret = 0;
+ int ret;
- if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
return 0;
mutex_lock(&block_group->free_space_lock);
- if (!test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags))
- goto out;
-
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = __add_block_group_free_space(trans, block_group, path);
-
-out:
- btrfs_free_path(path);
+ ret = __add_block_group_free_space(trans, block_group, NULL);
mutex_unlock(&block_group->free_space_lock);
- if (ret)
- btrfs_abort_transaction(trans, ret);
return ret;
}
-int remove_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group)
+int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group)
{
struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_path *path;
@@ -1443,6 +1483,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1455,8 +1496,10 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
while (!done) {
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
leaf = path->nodes[0];
nr = 0;
@@ -1484,16 +1527,16 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
}
ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
btrfs_release_path(path);
}
ret = 0;
out:
btrfs_free_path(path);
- if (ret)
- btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1505,7 +1548,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
struct btrfs_fs_info *fs_info;
struct btrfs_root *root;
struct btrfs_key key;
- int prev_bit = 0, bit;
+ bool prev_bit_set = false;
/* Initialize to silence GCC. */
u64 extent_start = 0;
u64 end, offset;
@@ -1522,7 +1565,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
while (1) {
ret = btrfs_next_item(root, path);
if (ret < 0)
- goto out;
+ return ret;
if (ret)
break;
@@ -1536,10 +1579,12 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
offset = key.objectid;
while (offset < key.objectid + key.offset) {
- bit = free_space_test_bit(block_group, path, offset);
- if (prev_bit == 0 && bit == 1) {
+ bool bit_set;
+
+ bit_set = btrfs_free_space_test_bit(block_group, path, offset);
+ if (!prev_bit_set && bit_set) {
extent_start = offset;
- } else if (prev_bit == 1 && bit == 0) {
+ } else if (prev_bit_set && !bit_set) {
u64 space_added;
ret = btrfs_add_new_free_space(block_group,
@@ -1547,7 +1592,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
offset,
&space_added);
if (ret)
- goto out;
+ return ret;
total_found += space_added;
if (total_found > CACHING_CTL_WAKE_UP) {
total_found = 0;
@@ -1555,14 +1600,14 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
}
extent_count++;
}
- prev_bit = bit;
+ prev_bit_set = bit_set;
offset += fs_info->sectorsize;
}
}
- if (prev_bit == 1) {
+ if (prev_bit_set) {
ret = btrfs_add_new_free_space(block_group, extent_start, end, NULL);
if (ret)
- goto out;
+ return ret;
extent_count++;
}
@@ -1572,13 +1617,10 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
block_group->start, extent_count,
expected_extent_count);
DEBUG_WARN();
- ret = -EIO;
- goto out;
+ return -EIO;
}
- ret = 0;
-out:
- return ret;
+ return 0;
}
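
The bool conversion above makes the underlying algorithm easier to see: free extents are recovered from the bitmap by detecting 0-to-1 transitions (an extent starts) and 1-to-0 transitions (an extent ends), with a final flush when the bitmap ends inside a free run. The same scan in self-contained userspace C (bitmap contents and sector size invented for the demo):

	#include <stdio.h>
	#include <stdbool.h>
	#include <stdint.h>

	/* Print the free extents encoded by a bitmap where each set bit
	 * marks one free sector. */
	static void bitmap_to_extents(const uint8_t *bm, unsigned int nbits,
				      uint64_t base, uint64_t sectorsize)
	{
		bool prev_set = false;
		uint64_t extent_start = 0;

		for (unsigned int i = 0; i < nbits; i++) {
			bool set = bm[i / 8] & (1u << (i % 8));
			uint64_t off = base + (uint64_t)i * sectorsize;

			if (!prev_set && set)
				extent_start = off;	/* 0 -> 1: extent starts */
			else if (prev_set && !set)	/* 1 -> 0: extent ends */
				printf("free extent [%llu, %llu)\n",
				       (unsigned long long)extent_start,
				       (unsigned long long)off);
			prev_set = set;
		}
		if (prev_set)	/* bitmap ended inside a free run */
			printf("free extent [%llu, %llu)\n",
			       (unsigned long long)extent_start,
			       (unsigned long long)(base + (uint64_t)nbits * sectorsize));
	}

	int main(void)
	{
		const uint8_t bm[] = { 0x1e, 0xc0 };	/* bits 1-4 and 14-15 set */

		bitmap_to_extents(bm, 16, 0, 4096);
		return 0;
	}
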
static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
@@ -1605,7 +1647,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
ret = btrfs_next_item(root, path);
if (ret < 0)
- goto out;
+ return ret;
if (ret)
break;
@@ -1621,7 +1663,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
key.objectid + key.offset,
&space_added);
if (ret)
- goto out;
+ return ret;
total_found += space_added;
if (total_found > CACHING_CTL_WAKE_UP) {
total_found = 0;
@@ -1636,16 +1678,13 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
block_group->start, extent_count,
expected_extent_count);
DEBUG_WARN();
- ret = -EIO;
- goto out;
+ return -EIO;
}
- ret = 0;
-out:
- return ret;
+ return 0;
}
-int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
+int btrfs_load_free_space_tree(struct btrfs_caching_control *caching_ctl)
{
struct btrfs_block_group *block_group;
struct btrfs_free_space_info *info;
@@ -1666,7 +1705,7 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
path->search_commit_root = 1;
path->reada = READA_FORWARD;
- info = search_free_space_info(NULL, block_group, path, 0);
+ info = btrfs_search_free_space_info(NULL, block_group, path, 0);
if (IS_ERR(info))
return PTR_ERR(info);
diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
index e6c6d6f4f221..3d9a5d4477fc 100644
--- a/fs/btrfs/free-space-tree.h
+++ b/fs/btrfs/free-space-tree.h
@@ -22,39 +22,39 @@ struct btrfs_trans_handle;
#define BTRFS_FREE_SPACE_BITMAP_SIZE 256
#define BTRFS_FREE_SPACE_BITMAP_BITS (BTRFS_FREE_SPACE_BITMAP_SIZE * BITS_PER_BYTE)
-void set_free_space_tree_thresholds(struct btrfs_block_group *block_group);
+void btrfs_set_free_space_tree_thresholds(struct btrfs_block_group *block_group);
int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info);
int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info);
int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info);
-int load_free_space_tree(struct btrfs_caching_control *caching_ctl);
-int add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group);
-int remove_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group);
-int add_to_free_space_tree(struct btrfs_trans_handle *trans,
- u64 start, u64 size);
-int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- u64 start, u64 size);
+int btrfs_load_free_space_tree(struct btrfs_caching_control *caching_ctl);
+int btrfs_add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group);
+int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group);
+int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ u64 start, u64 size);
+int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ u64 start, u64 size);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_free_space_info *
-search_free_space_info(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path, int cow);
-int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
+btrfs_search_free_space_info(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 start, u64 size);
-int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 start, u64 size);
-int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path);
-int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path);
-int free_space_test_bit(struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 offset);
+ struct btrfs_path *path, int cow);
+int __btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 start, u64 size);
+int __btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 start, u64 size);
+int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path);
+int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path);
+bool btrfs_free_space_test_bit(struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 offset);
#endif
#endif
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 4394de12a767..8cc07cc70b12 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -420,6 +420,8 @@ struct btrfs_commit_stats {
u64 last_commit_dur;
/* The total commit duration in ns */
u64 total_commit_dur;
+ /* Start of the last critical section in ns. */
+ u64 critical_section_start_time;
};
struct btrfs_fs_info {
@@ -713,8 +715,6 @@ struct btrfs_fs_info {
u32 data_chunk_allocations;
u32 metadata_ratio;
- void *bdev_holder;
-
/* Private scrub information */
struct mutex scrub_lock;
atomic_t scrubs_running;
@@ -739,12 +739,6 @@ struct btrfs_fs_info {
spinlock_t qgroup_lock;
/*
- * Used to avoid frequently calling ulist_alloc()/ulist_free()
- * when doing qgroup accounting, it must be protected by qgroup_lock.
- */
- struct ulist *qgroup_ulist;
-
- /*
* Protect user change for quota operations. If a transaction is needed,
* it must be started before locking this lock.
*/
@@ -779,7 +773,7 @@ struct btrfs_fs_info {
struct btrfs_delayed_root *delayed_root;
- /* Entries are eb->start / sectorsize */
+ /* Entries are eb->start >> nodesize_bits */
struct xarray buffer_tree;
/* Next backup root to be overwritten */
@@ -811,6 +805,7 @@ struct btrfs_fs_info {
/* Cached block sizes */
u32 nodesize;
+ u32 nodesize_bits;
u32 sectorsize;
/* ilog2 of sectorsize, use to avoid 64bit division */
u32 sectorsize_bits;
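
Caching nodesize_bits alongside nodesize mirrors the existing sectorsize_bits: since the node size is a power of two, eb->start >> nodesize_bits equals eb->start / nodesize but avoids a 64-bit division, which is a libcall on 32-bit targets. A standalone illustration with made-up values:

	#include <stdio.h>
	#include <stdint.h>

	/* ilog2 of a power-of-two value, computed once as the kernel does
	 * at mount time. */
	static unsigned int ilog2_u32(uint32_t v)
	{
		unsigned int bits = 0;

		while (v > 1) {
			v >>= 1;
			bits++;
		}
		return bits;
	}

	int main(void)
	{
		const uint32_t nodesize = 16384;	/* a typical btrfs node size */
		const unsigned int nodesize_bits = ilog2_u32(nodesize);
		const uint64_t eb_start = 1065353216;	/* invented eb->start */

		/* For powers of two the shift equals the division. */
		printf("%llu == %llu\n",
		       (unsigned long long)(eb_start >> nodesize_bits),
		       (unsigned long long)(eb_start / nodesize));
		return 0;
	}
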
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index a61c3540d67b..f06cf701ae5a 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -78,13 +78,10 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
}
/* Returns NULL if no extref found */
-struct btrfs_inode_extref *
-btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const struct fscrypt_str *name,
- u64 inode_objectid, u64 ref_objectid, int ins_len,
- int cow)
+struct btrfs_inode_extref *btrfs_lookup_inode_extref(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct fscrypt_str *name,
+ u64 inode_objectid, u64 ref_objectid)
{
int ret;
struct btrfs_key key;
@@ -93,7 +90,7 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
key.type = BTRFS_INODE_EXTREF_KEY;
key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len);
- ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
return ERR_PTR(ret);
if (ret > 0)
@@ -720,13 +717,12 @@ delete:
}
out:
if (ret >= 0 && pending_del_nr) {
- int err;
+ int ret2;
- err = btrfs_del_items(trans, root, path, pending_del_slot,
- pending_del_nr);
- if (err) {
- btrfs_abort_transaction(trans, err);
- ret = err;
+ ret2 = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr);
+ if (ret2) {
+ btrfs_abort_transaction(trans, ret2);
+ ret = ret2;
}
}
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
index c11b97fdccc4..6d9f5ad20646 100644
--- a/fs/btrfs/inode-item.h
+++ b/fs/btrfs/inode-item.h
@@ -101,13 +101,10 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *location, int mod);
-struct btrfs_inode_extref *btrfs_lookup_inode_extref(
- struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const struct fscrypt_str *name,
- u64 inode_objectid, u64 ref_objectid, int ins_len,
- int cow);
+struct btrfs_inode_extref *btrfs_lookup_inode_extref(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct fscrypt_str *name,
+ u64 inode_objectid, u64 ref_objectid);
struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf,
int slot,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c0c778243bf1..b77dd22b8cdb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -308,7 +308,7 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
const u32 csum_size = root->fs_info->csum_size;
/* For data reloc tree, it's better to do a backref lookup instead. */
- if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
return print_data_reloc_error(inode, logical_start, csum,
csum_expected, mirror_num);
@@ -395,8 +395,8 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
u64 offset, u64 bytes)
{
- unsigned long index = offset >> PAGE_SHIFT;
- unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
+ pgoff_t index = offset >> PAGE_SHIFT;
+ const pgoff_t end_index = (offset + bytes - 1) >> PAGE_SHIFT;
struct folio *folio;
while (index <= end_index) {
@@ -423,18 +423,18 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode);
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
struct btrfs_new_inode_args *args)
{
- int err;
+ int ret;
if (args->default_acl) {
- err = __btrfs_set_acl(trans, args->inode, args->default_acl,
+ ret = __btrfs_set_acl(trans, args->inode, args->default_acl,
ACL_TYPE_DEFAULT);
- if (err)
- return err;
+ if (ret)
+ return ret;
}
if (args->acl) {
- err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
- if (err)
- return err;
+ ret = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
+ if (ret)
+ return ret;
}
if (!args->default_acl && !args->acl)
cache_no_acl(args->inode);
@@ -781,12 +781,15 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
return 0;
}
+ /* Defrag ioctl takes precedence over mount options and properties. */
+ if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS)
+ return 0;
+ if (BTRFS_COMPRESS_NONE < inode->defrag_compress &&
+ inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES)
+ return 1;
/* force compress */
if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
return 1;
- /* defrag ioctl */
- if (inode->defrag_compress)
- return 1;
/* bad compression ratios */
if (inode->flags & BTRFS_INODE_NOCOMPRESS)
return 0;
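
The reordered checks give the defrag ioctl the final say: an explicit BTRFS_DEFRAG_DONT_COMPRESS request disables compression outright, an explicit algorithm in the valid range forces it, and only then do the mount option and inode flags apply. A standalone sketch of that decision ladder (the enum values, and DONT_COMPRESS sitting outside the valid range, are assumptions mirroring how the sentinel is used above):

	#include <stdbool.h>

	/* Assumed stand-ins: the real enum lives in the btrfs headers and
	 * the exact value of the DONT_COMPRESS sentinel may differ. */
	enum compress_type { NONE = 0, ZLIB, LZO, ZSTD, NR_TYPES, DONT_COMPRESS };

	static bool need_compress(enum compress_type defrag, bool force_compress,
				  bool inode_nocompress)
	{
		if (defrag == DONT_COMPRESS)		/* defrag ioctl opt-out wins */
			return false;
		if (defrag > NONE && defrag < NR_TYPES)	/* explicit algorithm wins */
			return true;
		if (force_compress)			/* compress-force mount option */
			return true;
		if (inode_nocompress)			/* flagged for bad ratios */
			return false;
		return false;	/* placeholder for the remaining heuristics */
	}
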
@@ -808,12 +811,11 @@ static inline void inode_should_defrag(struct btrfs_inode *inode,
static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end)
{
- unsigned long end_index = end >> PAGE_SHIFT;
+ const pgoff_t end_index = end >> PAGE_SHIFT;
struct folio *folio;
int ret = 0;
- for (unsigned long index = start >> PAGE_SHIFT;
- index <= end_index; index++) {
+ for (pgoff_t index = start >> PAGE_SHIFT; index <= end_index; index++) {
folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
if (IS_ERR(folio)) {
if (!ret)
@@ -943,7 +945,7 @@ again:
goto cleanup_and_bail_uncompressed;
}
- if (inode->defrag_compress) {
+ if (0 < inode->defrag_compress && inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) {
compress_type = inode->defrag_compress;
compress_level = inode->defrag_compress_level;
} else if (inode->prop_compress) {
@@ -1755,7 +1757,8 @@ static int fallback_to_cow(struct btrfs_inode *inode,
spin_unlock(&sinfo->lock);
if (count > 0)
- btrfs_clear_extent_bits(io_tree, start, end, EXTENT_NORESERVE);
+ btrfs_clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
+ &cached_state);
}
btrfs_unlock_extent(io_tree, start, end, &cached_state);
@@ -2328,8 +2331,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol
* The range must cover part of the @locked_folio, or a return of 1
* can confuse the caller.
*/
- ASSERT(!(end <= folio_pos(locked_folio) ||
- start >= folio_pos(locked_folio) + folio_size(locked_folio)));
+ ASSERT(!(end <= folio_pos(locked_folio) || start >= folio_end(locked_folio)));
if (should_nocow(inode, start, end)) {
ret = run_delalloc_nocow(inode, locked_folio, start, end);
@@ -2737,7 +2739,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
struct btrfs_inode *inode = fixup->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 page_start = folio_pos(folio);
- u64 page_end = folio_pos(folio) + folio_size(folio) - 1;
+ u64 page_end = folio_end(folio) - 1;
int ret = 0;
bool free_delalloc_space = true;
@@ -2881,7 +2883,7 @@ int btrfs_writepage_cow_fixup(struct folio *folio)
DEBUG_WARN();
btrfs_err_rl(fs_info,
"root %lld ino %llu folio %llu is marked dirty without notifying the fs",
- BTRFS_I(inode)->root->root_key.objectid,
+ btrfs_root_id(BTRFS_I(inode)->root),
btrfs_ino(BTRFS_I(inode)),
folio_pos(folio));
return -EUCLEAN;
@@ -3375,8 +3377,8 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
btrfs_test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
NULL)) {
/* Skip the range without csum for data reloc inode */
- btrfs_clear_extent_bits(&inode->io_tree, file_offset, end,
- EXTENT_NODATASUM);
+ btrfs_clear_extent_bit(&inode->io_tree, file_offset, end,
+ EXTENT_NODATASUM, NULL);
return true;
}
@@ -3946,6 +3948,7 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path
btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
&inode->flags, &inode->ro_flags);
btrfs_update_inode_mapping_flags(inode);
+ btrfs_set_inode_mapping_order(inode);
cache_index:
/*
@@ -4078,45 +4081,35 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *item,
struct inode *inode)
{
- struct btrfs_map_token token;
u64 flags;
- btrfs_init_map_token(&token, leaf);
-
- btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
- btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
- btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
- btrfs_set_token_inode_mode(&token, item, inode->i_mode);
- btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
-
- btrfs_set_token_timespec_sec(&token, &item->atime,
- inode_get_atime_sec(inode));
- btrfs_set_token_timespec_nsec(&token, &item->atime,
- inode_get_atime_nsec(inode));
-
- btrfs_set_token_timespec_sec(&token, &item->mtime,
- inode_get_mtime_sec(inode));
- btrfs_set_token_timespec_nsec(&token, &item->mtime,
- inode_get_mtime_nsec(inode));
-
- btrfs_set_token_timespec_sec(&token, &item->ctime,
- inode_get_ctime_sec(inode));
- btrfs_set_token_timespec_nsec(&token, &item->ctime,
- inode_get_ctime_nsec(inode));
-
- btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec);
- btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec);
-
- btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
- btrfs_set_token_inode_generation(&token, item,
- BTRFS_I(inode)->generation);
- btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
- btrfs_set_token_inode_transid(&token, item, trans->transid);
- btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
+ btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
+ btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
+ btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
+ btrfs_set_inode_mode(leaf, item, inode->i_mode);
+ btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
+
+ btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode));
+
+ btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode));
+
+ btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode));
+
+ btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec);
+ btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec);
+
+ btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
+ btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
+ btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode));
+ btrfs_set_inode_transid(leaf, item, trans->transid);
+ btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
BTRFS_I(inode)->ro_flags);
- btrfs_set_token_inode_flags(&token, item, flags);
- btrfs_set_token_inode_block_group(&token, item, 0);
+ btrfs_set_inode_flags(leaf, item, flags);
+ btrfs_set_inode_block_group(leaf, item, 0);
}
/*
@@ -4215,20 +4208,22 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
u64 dir_ino = btrfs_ino(dir);
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1);
if (IS_ERR_OR_NULL(di)) {
- ret = di ? PTR_ERR(di) : -ENOENT;
- goto err;
+ btrfs_free_path(path);
+ return di ? PTR_ERR(di) : -ENOENT;
}
ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ /*
+ * Down the call chains below we'll also need to allocate a path, so no
+ * need to hold on to this one for longer than necessary.
+ */
+ btrfs_free_path(path);
if (ret)
- goto err;
- btrfs_release_path(path);
+ return ret;
/*
* If we don't have dir index, we have to get it by looking up
@@ -4250,11 +4245,11 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
if (ret) {
- btrfs_info(fs_info,
- "failed to delete reference to %.*s, inode %llu parent %llu",
- name->len, name->name, ino, dir_ino);
+ btrfs_crit(fs_info,
+ "failed to delete reference to %.*s, root %llu inode %llu parent %llu",
+ name->len, name->name, btrfs_root_id(root), ino, dir_ino);
btrfs_abort_transaction(trans, ret);
- goto err;
+ return ret;
}
skip_backref:
if (rename_ctx)
@@ -4263,7 +4258,7 @@ skip_backref:
ret = btrfs_delete_delayed_dir_index(trans, dir, index);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto err;
+ return ret;
}
/*
@@ -4287,19 +4282,14 @@ skip_backref:
* holding.
*/
btrfs_run_delayed_iput(fs_info, inode);
-err:
- btrfs_free_path(path);
- if (ret)
- goto out;
btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
inode_inc_iversion(&inode->vfs_inode);
inode_set_ctime_current(&inode->vfs_inode);
inode_inc_iversion(&dir->vfs_inode);
inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
- ret = btrfs_update_inode(trans, dir);
-out:
- return ret;
+
+ return btrfs_update_inode(trans, dir);
}
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
@@ -4704,68 +4694,68 @@ out_up_write:
return ret;
}
-static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
+static int btrfs_rmdir(struct inode *vfs_dir, struct dentry *dentry)
{
- struct inode *inode = d_inode(dentry);
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_inode *dir = BTRFS_I(vfs_dir);
+ struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret = 0;
struct btrfs_trans_handle *trans;
- u64 last_unlink_trans;
struct fscrypt_name fname;
- if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
+ if (inode->vfs_inode.i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
- if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
+ if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
btrfs_err(fs_info,
"extent tree v2 doesn't support snapshot deletion yet");
return -EOPNOTSUPP;
}
- return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
+ return btrfs_delete_subvolume(dir, dentry);
}
- ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
+ ret = fscrypt_setup_filename(vfs_dir, &dentry->d_name, 1, &fname);
if (ret)
return ret;
/* This needs to handle no-key deletions later on */
- trans = __unlink_start_trans(BTRFS_I(dir));
+ trans = __unlink_start_trans(dir);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out_notrans;
}
- if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
- ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
+ /*
+ * Propagate the last_unlink_trans value of the deleted dir to its
+ * parent directory. This is to prevent an unrecoverable log tree in the
+ * case we do something like this:
+ * 1) create dir foo
+ * 2) create snapshot under dir foo
+ * 3) delete the snapshot
+ * 4) rmdir foo
+ * 5) mkdir foo
+ * 6) fsync foo or some file inside foo
+ *
+ * This is because we can't unlink other roots when replaying the dir
+ * deletes for directory foo.
+ */
+ if (inode->last_unlink_trans >= trans->transid)
+ btrfs_record_snapshot_destroy(trans, dir);
+
+ if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+ ret = btrfs_unlink_subvol(trans, dir, dentry);
goto out;
}
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ ret = btrfs_orphan_add(trans, inode);
if (ret)
goto out;
- last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
-
/* now the directory is empty */
- ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
- &fname.disk_name);
- if (!ret) {
- btrfs_i_size_write(BTRFS_I(inode), 0);
- /*
- * Propagate the last_unlink_trans value of the deleted dir to
- * its parent directory. This is to prevent an unrecoverable
- * log tree in the case we do something like this:
- * 1) create dir foo
- * 2) create snapshot under dir foo
- * 3) delete the snapshot
- * 4) rmdir foo
- * 5) mkdir foo
- * 6) fsync foo or some file inside foo
- */
- if (last_unlink_trans >= trans->transid)
- BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
- }
+ ret = btrfs_unlink_inode(trans, dir, inode, &fname.disk_name);
+ if (!ret)
+ btrfs_i_size_write(inode, 0);
out:
btrfs_end_transaction(trans);
out_notrans:
@@ -4821,9 +4811,9 @@ again:
*/
zero_start = max_t(u64, folio_pos(folio), start);
- zero_end = folio_pos(folio) + folio_size(folio) - 1;
+ zero_end = folio_end(folio);
folio_zero_range(folio, zero_start - folio_pos(folio),
- zero_end - zero_start + 1);
+ zero_end - zero_start);
out_unlock:
folio_unlock(folio);
@@ -4861,7 +4851,6 @@ int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 e
pgoff_t index = (offset >> PAGE_SHIFT);
struct folio *folio;
gfp_t mask = btrfs_alloc_write_mask(mapping);
- size_t write_bytes = blocksize;
int ret = 0;
const bool in_head_block = is_inside_block(offset, round_down(start, blocksize),
blocksize);
@@ -4913,8 +4902,12 @@ int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 e
ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
blocksize, false);
if (ret < 0) {
+ size_t write_bytes = blocksize;
+
if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
- /* For nocow case, no need to reserve data space */
+ /* For nocow case, no need to reserve data space. */
+ ASSERT(write_bytes == blocksize, "write_bytes=%zu blocksize=%u",
+ write_bytes, blocksize);
only_release_metadata = true;
} else {
goto out;
@@ -5001,8 +4994,7 @@ again:
* not reach disk, it still affects our page caches.
*/
zero_start = max_t(u64, folio_pos(folio), start);
- zero_end = min_t(u64, folio_pos(folio) + folio_size(folio) - 1,
- end);
+ zero_end = min_t(u64, folio_end(folio) - 1, end);
} else {
zero_start = max_t(u64, block_start, start);
zero_end = min_t(u64, block_end, end);
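
folio_end() returns the first byte past the folio, so the conversions in this patch take two forms: the truncate-block hunk earlier switched zero_end to the exclusive convention (the length is simply zero_end - zero_start), while this hunk keeps an inclusive end by subtracting one. A tiny demonstration that the two conventions agree (numbers invented):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		const uint64_t pos = 8192, size = 4096;	/* invented folio */
		const uint64_t end_excl = pos + size;	/* folio_end() style */
		const uint64_t end_incl = end_excl - 1;	/* legacy inclusive end */
		const uint64_t zero_start = 9000;

		/* Exclusive end: the length needs no +1 adjustment. */
		printf("%llu\n", (unsigned long long)(end_excl - zero_start));
		/* Inclusive end: the +1 is easy to forget. */
		printf("%llu\n", (unsigned long long)(end_incl - zero_start + 1));
		return 0;
	}
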
@@ -5014,11 +5006,12 @@ again:
block_end + 1 - block_start);
btrfs_folio_set_dirty(fs_info, folio, block_start,
block_end + 1 - block_start);
- btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);
if (only_release_metadata)
btrfs_set_extent_bit(&inode->io_tree, block_start, block_end,
- EXTENT_NORESERVE, NULL);
+ EXTENT_NORESERVE, &cached_state);
+
+ btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);
out_unlock:
if (ret) {
@@ -5256,7 +5249,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize);
if (ret && inode->i_nlink) {
- int err;
+ int ret2;
/*
* Truncate failed, so fix up the in-memory size. We
@@ -5264,9 +5257,9 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
* wait for disk_i_size to be stable and then update the
* in-memory size to match.
*/
- err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
- if (err)
- return err;
+ ret2 = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
+ if (ret2)
+ return ret2;
i_size_write(inode, BTRFS_I(inode)->disk_i_size);
}
}
@@ -5279,31 +5272,31 @@ static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
{
struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
- int err;
+ int ret;
if (btrfs_root_readonly(root))
return -EROFS;
- err = setattr_prepare(idmap, dentry, attr);
- if (err)
- return err;
+ ret = setattr_prepare(idmap, dentry, attr);
+ if (ret)
+ return ret;
if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
- err = btrfs_setsize(inode, attr);
- if (err)
- return err;
+ ret = btrfs_setsize(inode, attr);
+ if (ret)
+ return ret;
}
if (attr->ia_valid) {
setattr_copy(idmap, inode, attr);
inode_inc_iversion(inode);
- err = btrfs_dirty_inode(BTRFS_I(inode));
+ ret = btrfs_dirty_inode(BTRFS_I(inode));
- if (!err && attr->ia_valid & ATTR_MODE)
- err = posix_acl_chmod(idmap, dentry, inode->i_mode);
+ if (!ret && attr->ia_valid & ATTR_MODE)
+ ret = posix_acl_chmod(idmap, dentry, inode->i_mode);
}
- return err;
+ return ret;
}
/*
@@ -5437,7 +5430,7 @@ void btrfs_evict_inode(struct inode *inode)
struct btrfs_fs_info *fs_info;
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_block_rsv *rsv = NULL;
+ struct btrfs_block_rsv rsv;
int ret;
trace_btrfs_inode_evict(inode);
@@ -5485,11 +5478,9 @@ void btrfs_evict_inode(struct inode *inode)
*/
btrfs_kill_delayed_inode_items(BTRFS_I(inode));
- rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
- if (!rsv)
- goto out;
- rsv->size = btrfs_calc_metadata_size(fs_info, 1);
- rsv->failfast = true;
+ btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
+ rsv.size = btrfs_calc_metadata_size(fs_info, 1);
+ rsv.failfast = true;
btrfs_i_size_write(BTRFS_I(inode), 0);
@@ -5501,11 +5492,11 @@ void btrfs_evict_inode(struct inode *inode)
.min_type = 0,
};
- trans = evict_refill_and_join(root, rsv);
+ trans = evict_refill_and_join(root, &rsv);
if (IS_ERR(trans))
- goto out;
+ goto out_release;
- trans->block_rsv = rsv;
+ trans->block_rsv = &rsv;
ret = btrfs_truncate_inode_items(trans, root, &control);
trans->block_rsv = &fs_info->trans_block_rsv;
@@ -5517,7 +5508,7 @@ void btrfs_evict_inode(struct inode *inode)
*/
btrfs_btree_balance_dirty_nodelay(fs_info);
if (ret && ret != -ENOSPC && ret != -EAGAIN)
- goto out;
+ goto out_release;
else if (!ret)
break;
}
@@ -5531,16 +5522,17 @@ void btrfs_evict_inode(struct inode *inode)
* If it turns out that we are dropping too many of these, we might want
* to add a mechanism for retrying these after a commit.
*/
- trans = evict_refill_and_join(root, rsv);
+ trans = evict_refill_and_join(root, &rsv);
if (!IS_ERR(trans)) {
- trans->block_rsv = rsv;
+ trans->block_rsv = &rsv;
btrfs_orphan_del(trans, BTRFS_I(inode));
trans->block_rsv = &fs_info->trans_block_rsv;
btrfs_end_transaction(trans);
}
+out_release:
+ btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL);
out:
- btrfs_free_block_rsv(fs_info, rsv);
/*
* If we didn't successfully delete, the orphan item will still be in
* the tree and we'll retry on the next mount. Again, we might also want
@@ -6173,8 +6165,7 @@ again:
if (ret)
goto nopos;
- ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
- if (ret)
+ if (btrfs_readdir_delayed_dir_index(ctx, &ins_list))
goto nopos;
/*
@@ -6467,6 +6458,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
BTRFS_INODE_NODATASUM;
btrfs_update_inode_mapping_flags(BTRFS_I(inode));
+ btrfs_set_inode_mapping_order(BTRFS_I(inode));
}
ret = btrfs_insert_inode_locked(inode);
@@ -6610,13 +6602,17 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (args->orphan) {
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
} else {
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
0, BTRFS_I(inode)->dir_index);
- }
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto discard;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
}
return 0;
@@ -6703,20 +6699,18 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
fail_dir_item:
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
u64 local_index;
- int err;
- err = btrfs_del_root_ref(trans, key.objectid,
- btrfs_root_id(root), parent_ino,
- &local_index, name);
- if (err)
- btrfs_abort_transaction(trans, err);
+ int ret2;
+
+ ret2 = btrfs_del_root_ref(trans, key.objectid, btrfs_root_id(root),
+ parent_ino, &local_index, name);
+ if (ret2)
+ btrfs_abort_transaction(trans, ret2);
} else if (add_backref) {
- u64 local_index;
- int err;
+ int ret2;
- err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino,
- &local_index);
- if (err)
- btrfs_abort_transaction(trans, err);
+ ret2 = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, NULL);
+ if (ret2)
+ btrfs_abort_transaction(trans, ret2);
}
/* Return the original error code */
@@ -6735,20 +6729,20 @@ static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
};
unsigned int trans_num_items;
struct btrfs_trans_handle *trans;
- int err;
+ int ret;
- err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
- if (err)
+ ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
+ if (ret)
goto out_inode;
trans = btrfs_start_transaction(root, trans_num_items);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_new_inode_args;
}
- err = btrfs_create_new_inode(trans, &new_inode_args);
- if (!err)
+ ret = btrfs_create_new_inode(trans, &new_inode_args);
+ if (!ret)
d_instantiate_new(dentry, inode);
btrfs_end_transaction(trans);
@@ -6756,9 +6750,9 @@ static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
out_new_inode_args:
btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
- if (err)
+ if (ret)
iput(inode);
- return err;
+ return ret;
}
static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
@@ -6799,7 +6793,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct fscrypt_name fname;
u64 index;
- int err;
+ int ret;
int drop_inode = 0;
/* do not allow sys_link's with other subvols of the same device */
@@ -6809,12 +6803,12 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
if (inode->i_nlink >= BTRFS_LINK_MAX)
return -EMLINK;
- err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
- if (err)
+ ret = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
+ if (ret)
goto fail;
- err = btrfs_set_inode_index(BTRFS_I(dir), &index);
- if (err)
+ ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
+ if (ret)
goto fail;
/*
@@ -6825,7 +6819,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
*/
trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
trans = NULL;
goto fail;
}
@@ -6838,24 +6832,24 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
ihold(inode);
set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
- err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
+ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
&fname.disk_name, 1, index);
- if (err) {
+ if (ret) {
drop_inode = 1;
} else {
struct dentry *parent = dentry->d_parent;
- err = btrfs_update_inode(trans, BTRFS_I(inode));
- if (err)
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ if (ret)
goto fail;
if (inode->i_nlink == 1) {
/*
* If new hard link count is 1, it's a file created
* with open(2) O_TMPFILE flag.
*/
- err = btrfs_orphan_del(trans, BTRFS_I(inode));
- if (err)
+ ret = btrfs_orphan_del(trans, BTRFS_I(inode));
+ if (ret)
goto fail;
}
d_instantiate(dentry, inode);
@@ -6871,7 +6865,7 @@ fail:
iput(inode);
}
btrfs_btree_balance_dirty(fs_info);
- return err;
+ return ret;
}
static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
@@ -7364,13 +7358,13 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
static void wait_subpage_spinlock(struct folio *folio)
{
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
if (!btrfs_is_subpage(fs_info, folio))
return;
ASSERT(folio_test_private(folio) && folio_get_private(folio));
- subpage = folio_get_private(folio);
+ bfs = folio_get_private(folio);
/*
* This may look insane as we just acquire the spinlock and release it,
@@ -7383,8 +7377,8 @@ static void wait_subpage_spinlock(struct folio *folio)
* Here we just acquire the spinlock so that all existing callers
* should exit and we're safe to release/invalidate the page.
*/
- spin_lock_irq(&subpage->lock);
- spin_unlock_irq(&subpage->lock);
+ spin_lock_irq(&bfs->lock);
+ spin_unlock_irq(&bfs->lock);
}
static int btrfs_launder_folio(struct folio *folio)
@@ -7607,7 +7601,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
};
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_block_rsv *rsv;
+ struct btrfs_block_rsv rsv;
int ret;
struct btrfs_trans_handle *trans;
u64 mask = fs_info->sectorsize - 1;
@@ -7649,11 +7643,9 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
* 2) fs_info->trans_block_rsv - this will have 1 items worth left for
* updating the inode.
*/
- rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
- if (!rsv)
- return -ENOMEM;
- rsv->size = min_size;
- rsv->failfast = true;
+ btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
+ rsv.size = min_size;
+ rsv.failfast = true;
/*
* 1 for the truncate slack space
@@ -7666,7 +7658,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
}
/* Migrate the slack space for the truncate to our reserve */
- ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
+ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv,
min_size, false);
/*
* We have reserved 2 metadata units when we started the transaction and
@@ -7678,7 +7670,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
goto out;
}
- trans->block_rsv = rsv;
+ trans->block_rsv = &rsv;
while (1) {
struct extent_state *cached_state = NULL;
@@ -7721,9 +7713,9 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
break;
}
- btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
+ btrfs_block_rsv_release(fs_info, &rsv, -1, NULL);
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
- rsv, min_size, false);
+ &rsv, min_size, false);
/*
* We have reserved 2 metadata units when we started the
* transaction and min_size matches 1 unit, so this should never
@@ -7732,7 +7724,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
if (WARN_ON(ret))
break;
- trans->block_rsv = rsv;
+ trans->block_rsv = &rsv;
}
/*
@@ -7771,7 +7763,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
btrfs_btree_balance_dirty(fs_info);
}
out:
- btrfs_free_block_rsv(fs_info, rsv);
+ btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL);
/*
* So if we truncate and then write and fsync we normally would just
* write the extents that changed, which is a problem if we need to
@@ -8026,7 +8018,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap,
generic_fillattr(idmap, request_mask, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
- stat->subvol = BTRFS_I(inode)->root->root_key.objectid;
+ stat->subvol = btrfs_root_id(BTRFS_I(inode)->root);
stat->result_mask |= STATX_SUBVOL;
spin_lock(&BTRFS_I(inode)->lock);
@@ -8059,6 +8051,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
int ret;
int ret2;
bool need_abort = false;
+ bool logs_pinned = false;
struct fscrypt_name old_fname, new_fname;
struct fscrypt_str *old_name, *new_name;
@@ -8182,6 +8175,31 @@ static int btrfs_rename_exchange(struct inode *old_dir,
inode_inc_iversion(new_inode);
simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID &&
+ new_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * If we are renaming in the same directory (and it's not for
+ * root entries) pin the log early to prevent any concurrent
+ * task from logging the directory after we remove the old
+ * entries and before we add the new ones, otherwise that
+ * task can sync a log without any entry for the inodes we are
+ * renaming and therefore replaying that log, if a power failure
+ * happens after syncing the log, would result in deleting the
+ * inodes.
+ *
+ * If the rename affects two different directories, we want to
+ * make sure that there's no log commit that contains
+ * updates for only one of the directories but not for the
+ * other.
+ *
+ * If we are renaming an entry for a root, we don't care about
+ * log updates since we called btrfs_set_log_full_commit().
+ */
+ btrfs_pin_log_trans(root);
+ btrfs_pin_log_trans(dest);
+ logs_pinned = true;
+ }
+
if (old_dentry->d_parent != new_dentry->d_parent) {
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
BTRFS_I(old_inode), true);
@@ -8253,30 +8271,23 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode)->dir_index = new_idx;
/*
- * Now pin the logs of the roots. We do it to ensure that no other task
- * can sync the logs while we are in progress with the rename, because
- * that could result in an inconsistency in case any of the inodes that
- * are part of this rename operation were logged before.
+ * Do the log updates for all inodes.
+ *
+ * If either entry is for a root we don't need to update the logs since
+ * we've called btrfs_set_log_full_commit() before.
*/
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
- btrfs_pin_log_trans(root);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
- btrfs_pin_log_trans(dest);
-
- /* Do the log updates for all inodes. */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ if (logs_pinned) {
btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
old_rename_ctx.index, new_dentry->d_parent);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
new_rename_ctx.index, old_dentry->d_parent);
+ }
- /* Now unpin the logs. */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+out_fail:
+ if (logs_pinned) {
btrfs_end_log_trans(root);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
btrfs_end_log_trans(dest);
-out_fail:
+ }
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
@@ -8326,6 +8337,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
int ret2;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
struct fscrypt_name old_fname, new_fname;
+ bool logs_pinned = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -8460,6 +8472,29 @@ static int btrfs_rename(struct mnt_idmap *idmap,
inode_inc_iversion(old_inode);
simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * If we are renaming in the same directory (and it's not a
+ * root entry) pin the log to prevent any concurrent task from
+ * logging the directory after we remove the old entry and
+ * before we add the new one, otherwise that task can sync
+ * a log without any entry for the inode we are renaming and
+ * therefore replaying that log, if a power failure happens
+ * after syncing the log, would result in deleting the inode.
+ *
+ * If the rename affects two different directories, we want to
+ * make sure that there's no log commit that contains
+ * updates for only one of the directories but not for the
+ * other.
+ *
+ * If we are renaming an entry for a root, we don't care about
+ * log updates since we called btrfs_set_log_full_commit().
+ */
+ btrfs_pin_log_trans(root);
+ btrfs_pin_log_trans(dest);
+ logs_pinned = true;
+ }
+
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
BTRFS_I(old_inode), true);
@@ -8524,7 +8559,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (old_inode->i_nlink == 1)
BTRFS_I(old_inode)->dir_index = index;
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ if (logs_pinned)
btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
rename_ctx.index, new_dentry->d_parent);
@@ -8540,6 +8575,10 @@ static int btrfs_rename(struct mnt_idmap *idmap,
}
}
out_fail:
+ if (logs_pinned) {
+ btrfs_end_log_trans(root);
+ btrfs_end_log_trans(dest);
+ }
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
@@ -8776,7 +8815,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
.dentry = dentry,
};
unsigned int trans_num_items;
- int err;
+ int ret;
int name_len;
int datasize;
unsigned long ptr;
@@ -8803,26 +8842,26 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
inode_set_bytes(inode, name_len);
new_inode_args.inode = inode;
- err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
- if (err)
+ ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
+ if (ret)
goto out_inode;
/* 1 additional item for the inline extent */
trans_num_items++;
trans = btrfs_start_transaction(root, trans_num_items);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_new_inode_args;
}
- err = btrfs_create_new_inode(trans, &new_inode_args);
- if (err)
+ ret = btrfs_create_new_inode(trans, &new_inode_args);
+ if (ret)
goto out;
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
- btrfs_abort_transaction(trans, err);
+ ret = -ENOMEM;
+ btrfs_abort_transaction(trans, ret);
discard_new_inode(inode);
inode = NULL;
goto out;
@@ -8831,10 +8870,9 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = 0;
datasize = btrfs_file_extent_calc_inline_size(name_len);
- err = btrfs_insert_empty_item(trans, root, path, &key,
- datasize);
- if (err) {
- btrfs_abort_transaction(trans, err);
+ ret = btrfs_insert_empty_item(trans, root, path, &key, datasize);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
btrfs_free_path(path);
discard_new_inode(inode);
inode = NULL;
@@ -8856,16 +8894,16 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
btrfs_free_path(path);
d_instantiate_new(dentry, inode);
- err = 0;
+ ret = 0;
out:
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
out_new_inode_args:
btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
- if (err)
+ if (ret)
iput(inode);
- return err;
+ return ret;
}
static struct btrfs_trans_handle *insert_prealloc_file_extent(
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 913acef3f0a9..7e13de2bdcbf 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -245,7 +245,7 @@ static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_
* Set flags/xflags from the internal inode flags. The remaining items of
* fsxattr are zeroed.
*/
-int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int btrfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
const struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
@@ -254,7 +254,7 @@ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
int btrfs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
struct btrfs_root *root = inode->root;
@@ -666,14 +666,14 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
goto out;
}
+ btrfs_record_new_subvolume(trans, BTRFS_I(dir));
+
ret = btrfs_create_new_inode(trans, &new_inode_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- btrfs_record_new_subvolume(trans, BTRFS_I(dir));
-
d_instantiate_new(dentry, new_inode_args.inode);
new_inode_args.inode = NULL;
@@ -841,7 +841,7 @@ free_pending:
static int btrfs_may_delete(struct mnt_idmap *idmap,
struct inode *dir, struct dentry *victim, int isdir)
{
- int error;
+ int ret;
if (d_really_is_negative(victim))
return -ENOENT;
@@ -851,9 +851,9 @@ static int btrfs_may_delete(struct mnt_idmap *idmap,
return -EINVAL;
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
- error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
- if (error)
- return error;
+ ret = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
+ if (ret)
+ return ret;
if (IS_APPEND(dir))
return -EPERM;
if (check_sticky(idmap, dir, d_inode(victim)) ||
@@ -892,39 +892,37 @@ static inline int btrfs_may_create(struct mnt_idmap *idmap,
* sys_mkdirat and vfs_mkdir, but we only do a single component lookup
* inside this filesystem so it's quite a bit simpler.
*/
-static noinline int btrfs_mksubvol(const struct path *parent,
+static noinline int btrfs_mksubvol(struct dentry *parent,
struct mnt_idmap *idmap,
- const char *name, int namelen,
- struct btrfs_root *snap_src,
+ struct qstr *qname, struct btrfs_root *snap_src,
bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
- struct inode *dir = d_inode(parent->dentry);
+ struct inode *dir = d_inode(parent);
struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct dentry *dentry;
- struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen);
- int error;
+ struct fscrypt_str name_str = FSTR_INIT((char *)qname->name, qname->len);
+ int ret;
- error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
- if (error == -EINTR)
- return error;
+ ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
+ if (ret == -EINTR)
+ return ret;
- dentry = lookup_one(idmap, &QSTR_LEN(name, namelen), parent->dentry);
- error = PTR_ERR(dentry);
+ dentry = lookup_one(idmap, qname, parent);
+ ret = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_unlock;
- error = btrfs_may_create(idmap, dir, dentry);
- if (error)
+ ret = btrfs_may_create(idmap, dir, dentry);
+ if (ret)
goto out_dput;
/*
* even if this name doesn't exist, we may get hash collisions.
* check for them now when we can safely fail
*/
- error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
- dir->i_ino, &name_str);
- if (error)
+ ret = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, dir->i_ino, &name_str);
+ if (ret)
goto out_dput;
down_read(&fs_info->subvol_sem);
@@ -933,11 +931,11 @@ static noinline int btrfs_mksubvol(const struct path *parent,
goto out_up_read;
if (snap_src)
- error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
+ ret = create_snapshot(snap_src, dir, dentry, readonly, inherit);
else
- error = create_subvol(idmap, dir, dentry, inherit);
+ ret = create_subvol(idmap, dir, dentry, inherit);
- if (!error)
+ if (!ret)
fsnotify_mkdir(dir, dentry);
out_up_read:
up_read(&fs_info->subvol_sem);
@@ -945,12 +943,12 @@ out_dput:
dput(dentry);
out_unlock:
btrfs_inode_unlock(BTRFS_I(dir), 0);
- return error;
+ return ret;
}
-static noinline int btrfs_mksnapshot(const struct path *parent,
+static noinline int btrfs_mksnapshot(struct dentry *parent,
struct mnt_idmap *idmap,
- const char *name, int namelen,
+ struct qstr *qname,
struct btrfs_root *root,
bool readonly,
struct btrfs_qgroup_inherit *inherit)
@@ -977,8 +975,8 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
btrfs_wait_ordered_extents(root, U64_MAX, NULL);
- ret = btrfs_mksubvol(parent, idmap, name, namelen,
- root, readonly, inherit);
+ ret = btrfs_mksubvol(parent, idmap, qname, root, readonly, inherit);
+
atomic_dec(&root->snapshot_force_cow);
out:
btrfs_drew_read_unlock(&root->snapshot_lock);
@@ -1169,7 +1167,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
} /* equal, nothing needs to be done */
if (ret == 0 && new_size != old_size)
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"resize device %s (devid %llu) from %llu to %llu",
btrfs_dev_name(device), device->devid,
old_size, new_size);
@@ -1184,12 +1182,12 @@ out_drop:
static noinline int __btrfs_ioctl_snap_create(struct file *file,
struct mnt_idmap *idmap,
- const char *name, unsigned long fd, int subvol,
+ const char *name, unsigned long fd, bool subvol,
bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
- int namelen;
int ret = 0;
+ struct qstr qname = QSTR_INIT(name, strlen(name));
if (!S_ISDIR(file_inode(file)->i_mode))
return -ENOTDIR;
@@ -1198,21 +1196,20 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
if (ret)
goto out;
- namelen = strlen(name);
if (strchr(name, '/')) {
ret = -EINVAL;
goto out_drop_write;
}
- if (name[0] == '.' &&
- (namelen == 1 || (name[1] == '.' && namelen == 2))) {
+ if (qname.name[0] == '.' &&
+ (qname.len == 1 || (qname.name[1] == '.' && qname.len == 2))) {
ret = -EEXIST;
goto out_drop_write;
}
if (subvol) {
- ret = btrfs_mksubvol(&file->f_path, idmap, name,
- namelen, NULL, readonly, inherit);
+ ret = btrfs_mksubvol(file_dentry(file), idmap, &qname, NULL,
+ readonly, inherit);
} else {
CLASS(fd, src)(fd);
struct inode *src_inode;
@@ -1242,8 +1239,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
*/
ret = -EINVAL;
} else {
- ret = btrfs_mksnapshot(&file->f_path, idmap,
- name, namelen,
+ ret = btrfs_mksnapshot(file_dentry(file), idmap, &qname,
BTRFS_I(src_inode)->root,
readonly, inherit);
}
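/*
 * Illustrative sketch, not part of the patch: the name/namelen pair the
 * ioctl helpers used to pass around collapses into a single struct qstr.
 * QSTR_INIT() records the string and its length together, so callees
 * such as lookup_one() no longer need a separate length argument. The
 * helper below is hypothetical and assumes the caller already holds the
 * parent directory's i_rwsem, as lookup_one() requires.
 */
#include <linux/dcache.h>
#include <linux/namei.h>

static int example_lookup_child(struct mnt_idmap *idmap,
				struct dentry *parent, const char *name)
{
	struct qstr qname = QSTR_INIT(name, strlen(name));
	struct dentry *dentry = lookup_one(idmap, &qname, parent);

	if (IS_ERR(dentry))
		return PTR_ERR(dentry);
	dput(dentry);
	return 0;
}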
@@ -1280,7 +1276,7 @@ out:
}
static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
- void __user *arg, int subvol)
+ void __user *arg, bool subvol)
{
struct btrfs_ioctl_vol_args_v2 *vol_args;
int ret;
@@ -2558,8 +2554,14 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
ret = -EOPNOTSUPP;
goto out;
}
- /* compression requires us to start the IO */
- if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS) &&
+ (range.flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ /* Compression or no-compression requires us to start the IO. */
+ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS) ||
+ (range.flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS)) {
range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
range.extent_thresh = (u32)-1;
}
@@ -2700,7 +2702,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
err_drop:
mnt_drop_write_file(file);
if (bdev_file)
- fput(bdev_file);
+ bdev_fput(bdev_file);
out:
btrfs_put_dev_args_from_path(&args);
kfree(vol_args);
@@ -2751,7 +2753,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
mnt_drop_write_file(file);
if (bdev_file)
- fput(bdev_file);
+ bdev_fput(bdev_file);
out:
btrfs_put_dev_args_from_path(&args);
out_free:
@@ -2890,7 +2892,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
ret = PTR_ERR(new_root);
goto out;
}
- if (!is_fstree(btrfs_root_id(new_root))) {
+ if (!btrfs_is_fstree(btrfs_root_id(new_root))) {
ret = -ENOENT;
goto out_free;
}
@@ -3139,7 +3141,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
return -EPERM;
if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
- btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet");
+ btrfs_err(fs_info, "scrub: extent tree v2 not yet supported");
return -EINVAL;
}
@@ -3357,7 +3359,6 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
int size;
struct btrfs_ioctl_logical_ino_args *loi;
struct btrfs_data_container *inodes = NULL;
- struct btrfs_path *path = NULL;
bool ignore_offset;
if (!capable(CAP_SYS_ADMIN))
@@ -3391,14 +3392,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
goto out_loi;
}
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
- ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
- inodes, ignore_offset);
- btrfs_free_path(path);
+ ret = iterate_inodes_from_logical(loi->logical, fs_info, inodes, ignore_offset);
if (ret == -EINVAL)
ret = -ENOENT;
if (ret < 0)
@@ -3715,22 +3709,6 @@ drop_write:
return ret;
}
-/*
- * Quick check for ioctl handlers if quotas are enabled. Proper locking must be
- * done before any operations.
- */
-static bool qgroup_enabled(struct btrfs_fs_info *fs_info)
-{
- bool ret = true;
-
- mutex_lock(&fs_info->qgroup_ioctl_lock);
- if (!fs_info->quota_root)
- ret = false;
- mutex_unlock(&fs_info->qgroup_ioctl_lock);
-
- return ret;
-}
-
static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
@@ -3745,7 +3723,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!qgroup_enabled(root->fs_info))
+ if (!btrfs_qgroup_enabled(fs_info))
return -ENOTCONN;
ret = mnt_want_write_file(file);
@@ -3815,7 +3793,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!qgroup_enabled(root->fs_info))
+ if (!btrfs_qgroup_enabled(root->fs_info))
return -ENOTCONN;
ret = mnt_want_write_file(file);
@@ -3833,7 +3811,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
goto out;
}
- if (sa->create && is_fstree(sa->qgroupid)) {
+ if (sa->create && btrfs_is_fstree(sa->qgroupid)) {
ret = -EINVAL;
goto out;
}
@@ -3874,7 +3852,7 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!qgroup_enabled(root->fs_info))
+ if (!btrfs_qgroup_enabled(root->fs_info))
return -ENOTCONN;
ret = mnt_want_write_file(file);
@@ -3922,7 +3900,7 @@ static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!qgroup_enabled(fs_info))
+ if (!btrfs_qgroup_enabled(fs_info))
return -ENOTCONN;
ret = mnt_want_write_file(file);
@@ -4200,7 +4178,7 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
}
spin_lock(&fs_info->super_lock);
- strcpy(super_block->label, label);
+ strscpy(super_block->label, label);
spin_unlock(&fs_info->super_lock);
ret = btrfs_commit_transaction(trans);
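/*
 * Illustrative sketch, not part of the patch: unlike strcpy(), the
 * two-argument form of strscpy() infers the destination size from the
 * array type, guarantees NUL-termination, and truncates if the source
 * is too long. The structure and sizes below are hypothetical.
 */
#include <linux/string.h>

struct example_super {
	char label[256];
};

static void example_set_label(struct example_super *sb, const char *label)
{
	/* Copies at most sizeof(sb->label) - 1 bytes, always NUL-terminated. */
	strscpy(sb->label, label);
}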
@@ -4629,6 +4607,13 @@ out_acct:
return ret;
}
+struct btrfs_uring_encoded_data {
+ struct btrfs_ioctl_encoded_io_args args;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov;
+ struct iov_iter iter;
+};
+
/*
* Context that's attached to an encoded read io_uring command, in cmd->pdu. It
* contains the fields in btrfs_uring_read_extent that are necessary to finish
@@ -4650,6 +4635,7 @@ struct btrfs_uring_priv {
};
struct io_btrfs_cmd {
+ struct btrfs_uring_encoded_data *data;
struct btrfs_uring_priv *priv;
};
@@ -4659,7 +4645,7 @@ static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int iss
struct btrfs_uring_priv *priv = bc->priv;
struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp));
struct extent_io_tree *io_tree = &inode->io_tree;
- unsigned long index;
+ pgoff_t index;
u64 cur;
size_t page_offset;
ssize_t ret;
@@ -4708,6 +4694,7 @@ out:
kfree(priv->pages);
kfree(priv->iov);
kfree(priv);
+ kfree(bc->data);
}
void btrfs_uring_read_extent_endio(void *ctx, int err)
@@ -4791,13 +4778,6 @@ out_fail:
return ret;
}
-struct btrfs_uring_encoded_data {
- struct btrfs_ioctl_encoded_io_args args;
- struct iovec iovstack[UIO_FASTIOV];
- struct iovec *iov;
- struct iov_iter iter;
-};
-
static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags);
@@ -4813,7 +4793,11 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
struct extent_state *cached_state = NULL;
u64 start, lockend;
void __user *sqe_addr;
- struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data;
+ struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
+ struct btrfs_uring_encoded_data *data = NULL;
+
+ if (cmd->flags & IORING_URING_CMD_REISSUE)
+ data = bc->data;
if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
@@ -4829,7 +4813,8 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags);
#else
- return -ENOTTY;
+ ret = -ENOTTY;
+ goto out_acct;
#endif
} else {
copy_end = copy_end_kernel;
@@ -4842,7 +4827,7 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
goto out_acct;
}
- io_uring_cmd_get_async_data(cmd)->op_data = data;
+ bc->data = data;
if (issue_flags & IO_URING_F_COMPAT) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
@@ -4940,6 +4925,9 @@ out_acct:
add_rchar(current, ret);
inc_syscr(current);
+ if (ret != -EIOCBQUEUED && ret != -EAGAIN)
+ kfree(data);
+
return ret;
}
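/*
 * Illustrative sketch, not part of the patch, of the allocation pattern
 * above: per-command state is stashed in the io_uring command pdu on the
 * first issue, picked up again when the command is reissued
 * (IORING_URING_CMD_REISSUE), and only freed once the command cannot
 * come back, i.e. the return value is neither -EIOCBQUEUED nor -EAGAIN.
 * The worker function and its types are hypothetical simplifications.
 */
static int example_encoded_op(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
	struct btrfs_uring_encoded_data *data = NULL;
	int ret;

	if (cmd->flags & IORING_URING_CMD_REISSUE)
		data = bc->data;		/* resume with earlier state */
	if (!data) {
		data = kzalloc(sizeof(*data), GFP_KERNEL);
		if (!data)
			return -ENOMEM;
		bc->data = data;		/* survives a later reissue */
	}

	ret = example_do_encoded_io(data);	/* hypothetical worker */
	if (ret != -EIOCBQUEUED && ret != -EAGAIN)
		kfree(data);			/* command is finished */
	return ret;
}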
@@ -4950,7 +4938,11 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu
struct file *file;
ssize_t ret;
void __user *sqe_addr;
- struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data;
+ struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
+ struct btrfs_uring_encoded_data *data = NULL;
+
+ if (cmd->flags & IORING_URING_CMD_REISSUE)
+ data = bc->data;
if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
@@ -4972,7 +4964,7 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu
goto out_acct;
}
- io_uring_cmd_get_async_data(cmd)->op_data = data;
+ bc->data = data;
if (issue_flags & IO_URING_F_COMPAT) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
@@ -5062,6 +5054,9 @@ out_acct:
if (ret > 0)
add_wchar(current, ret);
inc_syscw(current);
+
+ if (ret != -EAGAIN)
+ kfree(data);
return ret;
}
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index e08ea446cf48..ccf6bed9cc24 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -8,7 +8,7 @@
struct file;
struct dentry;
struct mnt_idmap;
-struct fileattr;
+struct file_kattr;
struct io_uring_cmd;
struct btrfs_inode;
struct btrfs_fs_info;
@@ -16,9 +16,9 @@ struct btrfs_ioctl_balance_args;
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
-int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int btrfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
int btrfs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
int btrfs_ioctl_get_supported_features(void __user *arg);
void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
index 6abf81bb00c2..022ebc89af85 100644
--- a/fs/btrfs/messages.h
+++ b/fs/btrfs/messages.h
@@ -37,106 +37,46 @@ void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
btrfs_no_printk(fs_info, fmt, ##args)
#endif
-#define btrfs_emerg(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_EMERG fmt, ##args)
-#define btrfs_alert(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_ALERT fmt, ##args)
-#define btrfs_crit(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_CRIT fmt, ##args)
-#define btrfs_err(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_ERR fmt, ##args)
-#define btrfs_warn(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_WARNING fmt, ##args)
-#define btrfs_notice(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_NOTICE fmt, ##args)
-#define btrfs_info(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_INFO fmt, ##args)
-
/*
- * Wrappers that use printk_in_rcu
+ * Print a message with filesystem info, enclosed in RCU protection.
*/
-#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args)
-#define btrfs_alert_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args)
-#define btrfs_crit_in_rcu(fs_info, fmt, args...) \
+#define btrfs_crit(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args)
-#define btrfs_err_in_rcu(fs_info, fmt, args...) \
+#define btrfs_err(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args)
-#define btrfs_warn_in_rcu(fs_info, fmt, args...) \
+#define btrfs_warn(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args)
-#define btrfs_notice_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
-#define btrfs_info_in_rcu(fs_info, fmt, args...) \
+#define btrfs_info(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args)
/*
- * Wrappers that use a ratelimited printk_in_rcu
- */
-#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args)
-#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args)
-#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args)
-#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args)
-#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args)
-#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
-#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
-
-/*
* Wrappers that use a ratelimited printk
*/
-#define btrfs_emerg_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args)
-#define btrfs_alert_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args)
#define btrfs_crit_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args)
+ btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args)
#define btrfs_err_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args)
+ btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args)
#define btrfs_warn_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args)
-#define btrfs_notice_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args)
+ btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args)
#define btrfs_info_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args)
+ btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
#if defined(CONFIG_DYNAMIC_DEBUG)
#define btrfs_debug(fs_info, fmt, args...) \
- _dynamic_func_call_no_desc(fmt, btrfs_printk, \
- fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
_dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \
fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
- _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \
- fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl(fs_info, fmt, args...) \
- _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \
fs_info, KERN_DEBUG fmt, ##args)
#elif defined(DEBUG)
#define btrfs_debug(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args)
+ btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
#else
-#define btrfs_debug(fs_info, fmt, args...) \
- btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
- btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl(fs_info, fmt, args...) \
- btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
+/* When printk() is no_printk(), expand to a no-op. */
+#define btrfs_debug(fs_info, fmt, args...) do { (void)(fs_info); } while (0)
+#define btrfs_debug_rl(fs_info, fmt, args...) do { (void)(fs_info); } while (0)
#endif
#define btrfs_printk_in_rcu(fs_info, fmt, args...) \
@@ -146,26 +86,15 @@ do { \
rcu_read_unlock(); \
} while (0)
-#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \
-do { \
- rcu_read_lock(); \
- btrfs_no_printk(fs_info, fmt, ##args); \
- rcu_read_unlock(); \
-} while (0)
-
-#define btrfs_printk_ratelimited(fs_info, fmt, args...) \
+#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \
do { \
static DEFINE_RATELIMIT_STATE(_rs, \
DEFAULT_RATELIMIT_INTERVAL, \
DEFAULT_RATELIMIT_BURST); \
+ \
+ rcu_read_lock(); \
if (__ratelimit(&_rs)) \
btrfs_printk(fs_info, fmt, ##args); \
-} while (0)
-
-#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \
-do { \
- rcu_read_lock(); \
- btrfs_printk_ratelimited(fs_info, fmt, ##args); \
rcu_read_unlock(); \
} while (0)
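/*
 * Illustrative sketch, not part of the patch: the shape of the combined
 * macro above. Each call site gets its own static ratelimit state, the
 * RCU read lock is held across the whole print so device names obtained
 * via rcu_dereference() stay valid, and a suppressed message costs only
 * the __ratelimit() check. The macro name is hypothetical.
 */
#include <linux/ratelimit.h>
#include <linux/rcupdate.h>

#define example_warn_rl_in_rcu(fmt, args...)				\
do {									\
	static DEFINE_RATELIMIT_STATE(_rs,				\
				      DEFAULT_RATELIMIT_INTERVAL,	\
				      DEFAULT_RATELIMIT_BURST);		\
									\
	rcu_read_lock();						\
	if (__ratelimit(&_rs))						\
		pr_warn(fmt, ##args);					\
	rcu_read_unlock();						\
} while (0)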
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
index 0d599fd847c9..ff5eac84d819 100644
--- a/fs/btrfs/misc.h
+++ b/fs/btrfs/misc.h
@@ -7,6 +7,8 @@
#include <linux/bitmap.h>
#include <linux/sched.h>
#include <linux/wait.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
#include <linux/math64.h>
#include <linux/rbtree.h>
@@ -119,28 +121,23 @@ static inline struct rb_node *rb_simple_search_first(const struct rb_root *root,
return ret;
}
-static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr,
- struct rb_node *node)
+static int rb_simple_node_bytenr_cmp(struct rb_node *new, const struct rb_node *existing)
{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct rb_simple_node *entry;
+ struct rb_simple_node *new_entry = rb_entry(new, struct rb_simple_node, rb_node);
+ struct rb_simple_node *existing_entry = rb_entry(existing, struct rb_simple_node, rb_node);
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct rb_simple_node, rb_node);
+ if (new_entry->bytenr < existing_entry->bytenr)
+ return -1;
+ else if (new_entry->bytenr > existing_entry->bytenr)
+ return 1;
- if (bytenr < entry->bytenr)
- p = &(*p)->rb_left;
- else if (bytenr > entry->bytenr)
- p = &(*p)->rb_right;
- else
- return parent;
- }
+ return 0;
+}
- rb_link_node(node, parent, p);
- rb_insert_color(node, root);
- return NULL;
+static inline struct rb_node *rb_simple_insert(struct rb_root *root,
+ struct rb_simple_node *simple_node)
+{
+ return rb_find_add(&simple_node->rb_node, root, rb_simple_node_bytenr_cmp);
}
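/*
 * Illustrative sketch, not part of the patch: the rb_find_add() pattern
 * that the open-coded rb-tree walks in this series are converted to.
 * rb_find_add() links and rebalances the new node and returns NULL, or
 * returns the already-present colliding node without inserting; the
 * comparator orders the new node against an existing one. Names below
 * are hypothetical.
 */
#include <linux/rbtree.h>

struct example_node {
	struct rb_node rb;
	u64 key;
};

static int example_cmp(struct rb_node *new, const struct rb_node *existing)
{
	struct example_node *n = rb_entry(new, struct example_node, rb);
	struct example_node *e = rb_entry(existing, struct example_node, rb);

	if (n->key < e->key)
		return -1;
	if (n->key > e->key)
		return 1;
	return 0;
}

static struct example_node *example_insert(struct rb_root *root,
					   struct example_node *node)
{
	/* NULL means inserted; otherwise the duplicate already in the tree. */
	return rb_entry_safe(rb_find_add(&node->rb, root, example_cmp),
			     struct example_node, rb);
}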
static inline bool bitmap_test_range_all_set(const unsigned long *addr,
@@ -163,4 +160,9 @@ static inline bool bitmap_test_range_all_zero(const unsigned long *addr,
return (found_set == start + nbits);
}
+static inline u64 folio_end(struct folio *folio)
+{
+ return folio_pos(folio) + folio_size(folio);
+}
+
#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 9212ce110cde..2829f20d7bb5 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -359,7 +359,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
if (folio) {
ASSERT(folio->mapping);
ASSERT(folio_pos(folio) <= file_offset);
- ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio));
+ ASSERT(file_offset + len <= folio_end(folio));
/*
* Ordered flag indicates whether we still have
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index fc821aa446f0..74e38da9bd39 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -190,7 +190,7 @@ static void print_uuid_item(const struct extent_buffer *l, unsigned long offset,
u32 item_size)
{
if (!IS_ALIGNED(item_size, sizeof(u64))) {
- pr_warn("BTRFS: uuid item with illegal size %lu!\n",
+ btrfs_warn(l->fs_info, "uuid item with illegal size %lu",
(unsigned long)item_size);
return;
}
@@ -223,7 +223,7 @@ static void print_eb_refs_lock(const struct extent_buffer *eb)
{
#ifdef CONFIG_BTRFS_DEBUG
btrfs_info(eb->fs_info, "refs %u lock_owner %u current %u",
- atomic_read(&eb->refs), eb->lock_owner, current->pid);
+ refcount_read(&eb->refs), eb->lock_owner, current->pid);
#endif
}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index b3176edbde82..1a5972178b3a 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -160,23 +160,34 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
int init_flags);
static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
+static int btrfs_qgroup_qgroupid_key_cmp(const void *key, const struct rb_node *node)
+{
+ const u64 *qgroupid = key;
+ const struct btrfs_qgroup *qgroup = rb_entry(node, struct btrfs_qgroup, node);
+
+ if (qgroup->qgroupid < *qgroupid)
+ return -1;
+ else if (qgroup->qgroupid > *qgroupid)
+ return 1;
+
+ return 0;
+}
+
/* must be called with qgroup_ioctl_lock held */
static struct btrfs_qgroup *find_qgroup_rb(const struct btrfs_fs_info *fs_info,
u64 qgroupid)
{
- struct rb_node *n = fs_info->qgroup_tree.rb_node;
- struct btrfs_qgroup *qgroup;
+ struct rb_node *node;
- while (n) {
- qgroup = rb_entry(n, struct btrfs_qgroup, node);
- if (qgroup->qgroupid < qgroupid)
- n = n->rb_left;
- else if (qgroup->qgroupid > qgroupid)
- n = n->rb_right;
- else
- return qgroup;
- }
- return NULL;
+ node = rb_find(&qgroupid, &fs_info->qgroup_tree, btrfs_qgroup_qgroupid_key_cmp);
+ return rb_entry_safe(node, struct btrfs_qgroup, node);
+}
+
+static int btrfs_qgroup_qgroupid_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ const struct btrfs_qgroup *new_qgroup = rb_entry(new, struct btrfs_qgroup, node);
+
+ return btrfs_qgroup_qgroupid_key_cmp(&new_qgroup->qgroupid, existing);
}
/*
@@ -191,39 +202,25 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *prealloc,
u64 qgroupid)
{
- struct rb_node **p = &fs_info->qgroup_tree.rb_node;
- struct rb_node *parent = NULL;
- struct btrfs_qgroup *qgroup;
+ struct rb_node *node;
/* Caller must have pre-allocated @prealloc. */
ASSERT(prealloc);
- while (*p) {
- parent = *p;
- qgroup = rb_entry(parent, struct btrfs_qgroup, node);
-
- if (qgroup->qgroupid < qgroupid) {
- p = &(*p)->rb_left;
- } else if (qgroup->qgroupid > qgroupid) {
- p = &(*p)->rb_right;
- } else {
- kfree(prealloc);
- return qgroup;
- }
+ prealloc->qgroupid = qgroupid;
+ node = rb_find_add(&prealloc->node, &fs_info->qgroup_tree, btrfs_qgroup_qgroupid_cmp);
+ if (node) {
+ kfree(prealloc);
+ return rb_entry(node, struct btrfs_qgroup, node);
}
- qgroup = prealloc;
- qgroup->qgroupid = qgroupid;
- INIT_LIST_HEAD(&qgroup->groups);
- INIT_LIST_HEAD(&qgroup->members);
- INIT_LIST_HEAD(&qgroup->dirty);
- INIT_LIST_HEAD(&qgroup->iterator);
- INIT_LIST_HEAD(&qgroup->nested_iterator);
+ INIT_LIST_HEAD(&prealloc->groups);
+ INIT_LIST_HEAD(&prealloc->members);
+ INIT_LIST_HEAD(&prealloc->dirty);
+ INIT_LIST_HEAD(&prealloc->iterator);
+ INIT_LIST_HEAD(&prealloc->nested_iterator);
- rb_link_node(&qgroup->node, parent, p);
- rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
-
- return qgroup;
+ return prealloc;
}
static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
@@ -349,13 +346,27 @@ int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid
}
#endif
-static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info)
+__printf(2, 3)
+static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
+ const u64 old_flags = fs_info->qgroup_flags;
+
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
return;
fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT |
BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
+ if (!(old_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_warn_rl(fs_info, "qgroup marked inconsistent, %pV", &vaf);
+ va_end(args);
+ }
}
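/*
 * Illustrative sketch, not part of the patch: forwarding a printf-style
 * argument list through %pV with struct va_format, as the reworked
 * qgroup_mark_inconsistent() above does. __printf(1, 2) lets the
 * compiler type-check the callers' format arguments. The function name
 * is hypothetical.
 */
#include <linux/printk.h>

__printf(1, 2)
static void example_warn_reason(const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	pr_warn("marked inconsistent, %pV\n", &vaf);
	va_end(args);
}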
static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info,
@@ -386,12 +397,6 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
if (!fs_info->quota_root)
return 0;
- fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
- if (!fs_info->qgroup_ulist) {
- ret = -ENOMEM;
- goto out;
- }
-
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
@@ -434,13 +439,10 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
goto out;
}
fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr);
- if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) {
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE)
qgroup_read_enable_gen(fs_info, l, slot, ptr);
- } else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) {
- qgroup_mark_inconsistent(fs_info);
- btrfs_err(fs_info,
- "qgroup generation mismatch, marked as inconsistent");
- }
+ else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation)
+ qgroup_mark_inconsistent(fs_info, "qgroup generation mismatch");
rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
goto next1;
}
@@ -451,10 +453,8 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
qgroup = find_qgroup_rb(fs_info, found_key.offset);
if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
- (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
- btrfs_err(fs_info, "inconsistent qgroup config");
- qgroup_mark_inconsistent(fs_info);
- }
+ (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY))
+ qgroup_mark_inconsistent(fs_info, "inconsistent qgroup config");
if (!qgroup) {
struct btrfs_qgroup *prealloc;
struct btrfs_root *tree_root = fs_info->tree_root;
@@ -476,7 +476,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
* during mount before we start doing things like creating
* subvolumes.
*/
- if (is_fstree(qgroup->qgroupid) &&
+ if (btrfs_is_fstree(qgroup->qgroupid) &&
qgroup->qgroupid > tree_root->free_objectid)
/*
* Don't need to check against BTRFS_LAST_FREE_OBJECTID,
@@ -581,8 +581,6 @@ out:
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
} else {
- ulist_free(fs_info->qgroup_ulist);
- fs_info->qgroup_ulist = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
btrfs_sysfs_del_qgroups(fs_info);
}
@@ -630,29 +628,30 @@ bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info)
/*
* This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
- * first two are in single-threaded paths.And for the third one, we have set
- * quota_root to be null with qgroup_lock held before, so it is safe to clean
- * up the in-memory structures without qgroup_lock held.
+ * the first two are in single-threaded paths.
*/
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
{
struct rb_node *n;
struct btrfs_qgroup *qgroup;
+ /*
+ * btrfs_quota_disable() can be called concurrently with
+ * btrfs_qgroup_rescan() -> qgroup_rescan_zero_tracking(), so take the
+ * lock.
+ */
+ spin_lock(&fs_info->qgroup_lock);
while ((n = rb_first(&fs_info->qgroup_tree))) {
qgroup = rb_entry(n, struct btrfs_qgroup, node);
rb_erase(n, &fs_info->qgroup_tree);
__del_qgroup_rb(qgroup);
+ spin_unlock(&fs_info->qgroup_lock);
btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
kfree(qgroup);
+ spin_lock(&fs_info->qgroup_lock);
}
- /*
- * We call btrfs_free_qgroup_config() when unmounting
- * filesystem and disabling quota, so we set qgroup_ulist
- * to be null here to avoid double free.
- */
- ulist_free(fs_info->qgroup_ulist);
- fs_info->qgroup_ulist = NULL;
+ spin_unlock(&fs_info->qgroup_lock);
+
btrfs_sysfs_del_qgroups(fs_info);
}
@@ -998,7 +997,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup = NULL;
struct btrfs_qgroup *prealloc = NULL;
struct btrfs_trans_handle *trans = NULL;
- struct ulist *ulist = NULL;
const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA);
int ret = 0;
int slot;
@@ -1021,12 +1019,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
if (fs_info->quota_root)
goto out;
- ulist = ulist_alloc(GFP_KERNEL);
- if (!ulist) {
- ret = -ENOMEM;
- goto out;
- }
-
ret = btrfs_sysfs_add_qgroups(fs_info);
if (ret < 0)
goto out;
@@ -1066,9 +1058,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
if (fs_info->quota_root)
goto out;
- fs_info->qgroup_ulist = ulist;
- ulist = NULL;
-
/*
* initially create the quota tree
*/
@@ -1155,11 +1144,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
prealloc = NULL;
- if (IS_ERR(qgroup)) {
- ret = PTR_ERR(qgroup);
- btrfs_abort_transaction(trans, ret);
- goto out_free_path;
- }
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
@@ -1272,17 +1256,13 @@ out_free_root:
if (ret)
btrfs_put_root(quota_root);
out:
- if (ret) {
- ulist_free(fs_info->qgroup_ulist);
- fs_info->qgroup_ulist = NULL;
+ if (ret)
btrfs_sysfs_del_qgroups(fs_info);
- }
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (ret && trans)
btrfs_end_transaction(trans);
else if (trans)
ret = btrfs_end_transaction(trans);
- ulist_free(ulist);
kfree(prealloc);
return ret;
}
@@ -1354,11 +1334,14 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
/*
* We have nothing held here and no trans handle, just return the error
- * if there is one.
+ * if there is one and set back the quota enabled bit since we didn't
+ * actually disable quotas.
*/
ret = flush_reservations(fs_info);
- if (ret)
+ if (ret) {
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
return ret;
+ }
/*
* 1 For the root item
@@ -1679,9 +1662,6 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
struct btrfs_qgroup *prealloc = NULL;
int ret = 0;
- if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
- return 0;
-
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root) {
ret = -ENOTCONN;
@@ -1844,13 +1824,12 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
if (qgroup->rfer || qgroup->excl ||
qgroup->rfer_cmpr || qgroup->excl_cmpr) {
DEBUG_WARN();
- btrfs_warn_rl(fs_info,
-"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
- btrfs_qgroup_level(qgroup->qgroupid),
- btrfs_qgroup_subvolid(qgroup->qgroupid),
- qgroup->rfer, qgroup->rfer_cmpr,
- qgroup->excl, qgroup->excl_cmpr);
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info,
+ "to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
+ btrfs_qgroup_level(qgroup->qgroupid),
+ btrfs_qgroup_subvolid(qgroup->qgroupid),
+ qgroup->rfer, qgroup->rfer_cmpr,
+ qgroup->excl, qgroup->excl_cmpr);
}
}
del_qgroup_rb(fs_info, qgroupid);
@@ -1873,7 +1852,8 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su
struct btrfs_trans_handle *trans;
int ret;
- if (!is_fstree(subvolid) || !btrfs_qgroup_enabled(fs_info) || !fs_info->quota_root)
+ if (!btrfs_is_fstree(subvolid) || !btrfs_qgroup_enabled(fs_info) ||
+ !fs_info->quota_root)
return 0;
/*
@@ -1968,11 +1948,8 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
spin_unlock(&fs_info->qgroup_lock);
ret = update_qgroup_limit_item(trans, qgroup);
- if (ret) {
- qgroup_mark_inconsistent(fs_info);
- btrfs_info(fs_info, "unable to update quota limit for %llu",
- qgroupid);
- }
+ if (ret)
+ qgroup_mark_inconsistent(fs_info, "qgroup limit item update error %d", ret);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
@@ -2027,7 +2004,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
ret = __xa_store(&delayed_refs->dirty_extents, index, record, GFP_ATOMIC);
xa_unlock(&delayed_refs->dirty_extents);
if (xa_is_err(ret)) {
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info, "xarray insert error: %d", xa_err(ret));
return xa_err(ret);
}
@@ -2094,10 +2071,8 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
ret = btrfs_find_all_roots(&ctx, true);
if (ret < 0) {
- qgroup_mark_inconsistent(fs_info);
- btrfs_warn(fs_info,
-"error accounting new delayed refs extent (err code: %d), quota inconsistent",
- ret);
+ qgroup_mark_inconsistent(fs_info,
+ "error accounting new delayed refs extent: %d", ret);
return 0;
}
@@ -2341,7 +2316,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
/* For src_path */
- atomic_inc(&src_eb->refs);
+ refcount_inc(&src_eb->refs);
src_path->nodes[root_level] = src_eb;
src_path->slots[root_level] = dst_path->slots[root_level];
src_path->locks[root_level] = 0;
@@ -2574,7 +2549,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
goto out;
}
/* For dst_path */
- atomic_inc(&dst_eb->refs);
+ refcount_inc(&dst_eb->refs);
dst_path->nodes[level] = dst_eb;
dst_path->slots[level] = 0;
dst_path->locks[level] = 0;
@@ -2589,7 +2564,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
out:
btrfs_free_path(dst_path);
if (ret < 0)
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info, "%s error: %d", __func__, ret);
return ret;
}
@@ -2633,7 +2608,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
* mark qgroup inconsistent.
*/
if (root_level >= drop_subptree_thres) {
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info, "subtree level reached threshold");
return 0;
}
@@ -2666,7 +2641,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
* walk back up the tree (adjusting slot pointers as we go)
* and restart the search process.
*/
- atomic_inc(&root_eb->refs); /* For path */
+ refcount_inc(&root_eb->refs); /* For path */
path->nodes[root_level] = root_eb;
path->slots[root_level] = 0;
path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
@@ -2932,7 +2907,7 @@ static int maybe_fs_roots(struct ulist *roots)
* trees.
* If it contains a non-fs tree, it won't be shared with fs/subvol trees.
*/
- return is_fstree(unode->val);
+ return btrfs_is_fstree(unode->val);
}
int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
@@ -3133,10 +3108,12 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
spin_unlock(&fs_info->qgroup_lock);
ret = update_qgroup_info_item(trans, qgroup);
if (ret)
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info,
+ "qgroup info item update error %d", ret);
ret = update_qgroup_limit_item(trans, qgroup);
if (ret)
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info,
+ "qgroup limit item update error %d", ret);
spin_lock(&fs_info->qgroup_lock);
}
if (btrfs_qgroup_enabled(fs_info))
@@ -3147,7 +3124,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
ret = update_qgroup_status_item(trans);
if (ret)
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info,
+ "qgroup status item update error %d", ret);
return ret;
}
@@ -3329,6 +3307,9 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
u32 level_size = 0;
u64 nums;
+ if (!btrfs_qgroup_enabled(fs_info))
+ return 0;
+
prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
if (!prealloc)
return -ENOMEM;
@@ -3352,8 +3333,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
if (!committing)
mutex_lock(&fs_info->qgroup_ioctl_lock);
- if (!btrfs_qgroup_enabled(fs_info))
- goto out;
quota_root = fs_info->quota_root;
if (!quota_root) {
@@ -3554,7 +3533,7 @@ out:
if (!committing)
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (need_rescan)
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info, "qgroup inherit needs a rescan");
if (qlist_prealloc) {
for (int i = 0; i < inherit->num_qgroups; i++)
kfree(qlist_prealloc[i]);
@@ -3588,7 +3567,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
int ret = 0;
LIST_HEAD(qgroup_list);
- if (!is_fstree(ref_root))
+ if (!btrfs_is_fstree(ref_root))
return 0;
if (num_bytes == 0)
@@ -3648,7 +3627,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup;
LIST_HEAD(qgroup_list);
- if (!is_fstree(ref_root))
+ if (!btrfs_is_fstree(ref_root))
return;
if (num_bytes == 0)
@@ -4036,12 +4015,21 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
qgroup_rescan_zero_tracking(fs_info);
mutex_lock(&fs_info->qgroup_rescan_lock);
- fs_info->qgroup_rescan_running = true;
- btrfs_queue_work(fs_info->qgroup_rescan_workers,
- &fs_info->qgroup_rescan_work);
+ /*
+ * The rescan worker is only for full accounting qgroups; check if it's
+ * enabled, as it is pointless to queue it otherwise. A concurrent quota
+ * disable may also have just cleared BTRFS_FS_QUOTA_ENABLED.
+ */
+ if (btrfs_qgroup_full_accounting(fs_info)) {
+ fs_info->qgroup_rescan_running = true;
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
+ } else {
+ ret = -ENOTCONN;
+ }
mutex_unlock(&fs_info->qgroup_rescan_lock);
- return 0;
+ return ret;
}
int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
@@ -4128,8 +4116,8 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode,
* Now the entry is in [start, start + len), revert the
* EXTENT_QGROUP_RESERVED bit.
*/
- clear_ret = btrfs_clear_extent_bits(&inode->io_tree, entry_start,
- entry_end, EXTENT_QGROUP_RESERVED);
+ clear_ret = btrfs_clear_extent_bit(&inode->io_tree, entry_start, entry_end,
+ EXTENT_QGROUP_RESERVED, NULL);
if (!ret && clear_ret < 0)
ret = clear_ret;
@@ -4216,7 +4204,7 @@ static int qgroup_reserve_data(struct btrfs_inode *inode,
int ret;
if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(btrfs_root_id(root)) || len == 0)
+ !btrfs_is_fstree(btrfs_root_id(root)) || len == 0)
return 0;
/* @reserved parameter is mandatory for qgroup */
@@ -4469,7 +4457,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
int ret;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(btrfs_root_id(root)) || num_bytes == 0)
+ !btrfs_is_fstree(btrfs_root_id(root)) || num_bytes == 0)
return 0;
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
@@ -4514,7 +4502,7 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
struct btrfs_fs_info *fs_info = root->fs_info;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(btrfs_root_id(root)))
+ !btrfs_is_fstree(btrfs_root_id(root)))
return;
/* TODO: Update trace point to handle such free */
@@ -4530,7 +4518,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
struct btrfs_fs_info *fs_info = root->fs_info;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(btrfs_root_id(root)))
+ !btrfs_is_fstree(btrfs_root_id(root)))
return;
/*
@@ -4589,7 +4577,7 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
struct btrfs_fs_info *fs_info = root->fs_info;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(btrfs_root_id(root)))
+ !btrfs_is_fstree(btrfs_root_id(root)))
return;
/* Same as btrfs_qgroup_free_meta_prealloc() */
num_bytes = sub_root_meta_rsv(root, num_bytes,
@@ -4673,6 +4661,28 @@ out:
spin_unlock(&swapped_blocks->lock);
}
+static int qgroup_swapped_block_bytenr_key_cmp(const void *key, const struct rb_node *node)
+{
+ const u64 *bytenr = key;
+ const struct btrfs_qgroup_swapped_block *block = rb_entry(node,
+ struct btrfs_qgroup_swapped_block, node);
+
+ if (block->subvol_bytenr < *bytenr)
+ return -1;
+ else if (block->subvol_bytenr > *bytenr)
+ return 1;
+
+ return 0;
+}
+
+static int qgroup_swapped_block_bytenr_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ const struct btrfs_qgroup_swapped_block *new_block = rb_entry(new,
+ struct btrfs_qgroup_swapped_block, node);
+
+ return qgroup_swapped_block_bytenr_key_cmp(&new_block->subvol_bytenr, existing);
+}
+
/*
* Add subtree roots record into @subvol_root.
*
@@ -4692,8 +4702,7 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root,
struct btrfs_fs_info *fs_info = subvol_root->fs_info;
struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
struct btrfs_qgroup_swapped_block *block;
- struct rb_node **cur;
- struct rb_node *parent = NULL;
+ struct rb_node *node;
int level = btrfs_header_level(subvol_parent) - 1;
int ret = 0;
@@ -4742,46 +4751,32 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root,
/* Insert @block into @blocks */
spin_lock(&blocks->lock);
- cur = &blocks->blocks[level].rb_node;
- while (*cur) {
+ node = rb_find_add(&block->node, &blocks->blocks[level], qgroup_swapped_block_bytenr_cmp);
+ if (node) {
struct btrfs_qgroup_swapped_block *entry;
- parent = *cur;
- entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
- node);
+ entry = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
- if (entry->subvol_bytenr < block->subvol_bytenr) {
- cur = &(*cur)->rb_left;
- } else if (entry->subvol_bytenr > block->subvol_bytenr) {
- cur = &(*cur)->rb_right;
- } else {
- if (entry->subvol_generation !=
- block->subvol_generation ||
- entry->reloc_bytenr != block->reloc_bytenr ||
- entry->reloc_generation !=
- block->reloc_generation) {
- /*
- * Duplicated but mismatch entry found.
- * Shouldn't happen.
- *
- * Marking qgroup inconsistent should be enough
- * for end users.
- */
- DEBUG_WARN("duplicated but mismatched entry found");
- ret = -EEXIST;
- }
- kfree(block);
- goto out_unlock;
+ if (entry->subvol_generation != block->subvol_generation ||
+ entry->reloc_bytenr != block->reloc_bytenr ||
+ entry->reloc_generation != block->reloc_generation) {
+ /*
+ * Duplicated but mismatched entry found. Shouldn't happen.
+ * Marking qgroup inconsistent should be enough for end
+ * users.
+ */
+ DEBUG_WARN("duplicated but mismatched entry found");
+ ret = -EEXIST;
}
+ kfree(block);
+ goto out_unlock;
}
- rb_link_node(&block->node, parent, cur);
- rb_insert_color(&block->node, &blocks->blocks[level]);
blocks->swapped = true;
out_unlock:
spin_unlock(&blocks->lock);
out:
if (ret < 0)
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info, "%s error: %d", __func__, ret);
return ret;
}
@@ -4801,7 +4796,6 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_swapped_block *block;
struct extent_buffer *reloc_eb = NULL;
struct rb_node *node;
- bool found = false;
bool swapped = false;
int level = btrfs_header_level(subvol_eb);
int ret = 0;
@@ -4809,7 +4803,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
- if (!is_fstree(btrfs_root_id(root)) || !root->reloc_root)
+ if (!btrfs_is_fstree(btrfs_root_id(root)) || !root->reloc_root)
return 0;
spin_lock(&blocks->lock);
@@ -4817,23 +4811,14 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
spin_unlock(&blocks->lock);
return 0;
}
- node = blocks->blocks[level].rb_node;
-
- while (node) {
- block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
- if (block->subvol_bytenr < subvol_eb->start) {
- node = node->rb_left;
- } else if (block->subvol_bytenr > subvol_eb->start) {
- node = node->rb_right;
- } else {
- found = true;
- break;
- }
- }
- if (!found) {
+ node = rb_find(&subvol_eb->start, &blocks->blocks[level],
+ qgroup_swapped_block_bytenr_key_cmp);
+ if (!node) {
spin_unlock(&blocks->lock);
goto out;
}
+ block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
+
/* Found one, remove it from @blocks first and update blocks->swapped */
rb_erase(&block->node, &blocks->blocks[level]);
for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
@@ -4869,10 +4854,9 @@ free_out:
free_extent_buffer(reloc_eb);
out:
if (ret < 0) {
- btrfs_err_rl(fs_info,
- "failed to account subtree at bytenr %llu: %d",
- subvol_eb->start, ret);
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info,
+ "failed to account subtree at bytenr %llu: %d",
+ subvol_eb->start, ret);
}
return ret;
}
@@ -4903,7 +4887,7 @@ int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE)
return 0;
- if (!is_fstree(root))
+ if (!btrfs_is_fstree(root))
return 0;
/* If the extent predates enabling quotas, don't count it. */
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 1834011ccc49..cab0b291088c 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -329,11 +329,14 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
item_size);
- if (ret == -EEXIST)
+ if (ret == -EEXIST) {
ret = update_raid_extent_item(trans, &stripe_key, stripe_extent,
item_size);
- if (ret)
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ } else if (ret) {
btrfs_abort_transaction(trans, ret);
+ }
kfree(stripe_extent);
diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h
deleted file mode 100644
index 1c2d7cb1fe6f..000000000000
--- a/fs/btrfs/rcu-string.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2012 Red Hat. All rights reserved.
- */
-
-#ifndef BTRFS_RCU_STRING_H
-#define BTRFS_RCU_STRING_H
-
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/rcupdate.h>
-#include <linux/printk.h>
-
-struct rcu_string {
- struct rcu_head rcu;
- char str[];
-};
-
-static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask)
-{
- size_t len = strlen(src) + 1;
- struct rcu_string *ret = kzalloc(sizeof(struct rcu_string) +
- (len * sizeof(char)), mask);
- if (!ret)
- return ret;
- /* Warn if the source got unexpectedly truncated. */
- if (WARN_ON(strscpy(ret->str, src, len) < 0)) {
- kfree(ret);
- return NULL;
- }
- return ret;
-}
-
-static inline void rcu_string_free(struct rcu_string *str)
-{
- if (str)
- kfree_rcu(str, rcu);
-}
-
-#define printk_in_rcu(fmt, ...) do { \
- rcu_read_lock(); \
- printk(fmt, __VA_ARGS__); \
- rcu_read_unlock(); \
-} while (0)
-
-#define printk_ratelimited_in_rcu(fmt, ...) do { \
- rcu_read_lock(); \
- printk_ratelimited(fmt, __VA_ARGS__); \
- rcu_read_unlock(); \
-} while (0)
-
-#define rcu_str_deref(rcu_str) ({ \
- struct rcu_string *__str = rcu_dereference(rcu_str); \
- __str->str; \
-})
-
-#endif
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 2928abf7eb82..3871c3a6c743 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -75,69 +75,70 @@ struct block_entry {
struct list_head actions;
};
+static int block_entry_bytenr_key_cmp(const void *key, const struct rb_node *node)
+{
+ const u64 *bytenr = key;
+ const struct block_entry *entry = rb_entry(node, struct block_entry, node);
+
+ if (entry->bytenr < *bytenr)
+ return 1;
+ else if (entry->bytenr > *bytenr)
+ return -1;
+
+ return 0;
+}
+
+static int block_entry_bytenr_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ const struct block_entry *new_entry = rb_entry(new, struct block_entry, node);
+
+ return block_entry_bytenr_key_cmp(&new_entry->bytenr, existing);
+}
+
static struct block_entry *insert_block_entry(struct rb_root *root,
struct block_entry *be)
{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent_node = NULL;
- struct block_entry *entry;
-
- while (*p) {
- parent_node = *p;
- entry = rb_entry(parent_node, struct block_entry, node);
- if (entry->bytenr > be->bytenr)
- p = &(*p)->rb_left;
- else if (entry->bytenr < be->bytenr)
- p = &(*p)->rb_right;
- else
- return entry;
- }
+ struct rb_node *node;
- rb_link_node(&be->node, parent_node, p);
- rb_insert_color(&be->node, root);
- return NULL;
+ node = rb_find_add(&be->node, root, block_entry_bytenr_cmp);
+ return rb_entry_safe(node, struct block_entry, node);
}
static struct block_entry *lookup_block_entry(struct rb_root *root, u64 bytenr)
{
- struct rb_node *n;
- struct block_entry *entry = NULL;
+ struct rb_node *node;
- n = root->rb_node;
- while (n) {
- entry = rb_entry(n, struct block_entry, node);
- if (entry->bytenr < bytenr)
- n = n->rb_right;
- else if (entry->bytenr > bytenr)
- n = n->rb_left;
- else
- return entry;
- }
- return NULL;
+ node = rb_find(&bytenr, root, block_entry_bytenr_key_cmp);
+ return rb_entry_safe(node, struct block_entry, node);
+}
+
+static int root_entry_root_objectid_key_cmp(const void *key, const struct rb_node *node)
+{
+ const u64 *objectid = key;
+ const struct root_entry *entry = rb_entry(node, struct root_entry, node);
+
+ if (entry->root_objectid < *objectid)
+ return 1;
+ else if (entry->root_objectid > *objectid)
+ return -1;
+
+ return 0;
+}
+
+static int root_entry_root_objectid_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ const struct root_entry *new_entry = rb_entry(new, struct root_entry, node);
+
+ return root_entry_root_objectid_key_cmp(&new_entry->root_objectid, existing);
}
static struct root_entry *insert_root_entry(struct rb_root *root,
struct root_entry *re)
{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent_node = NULL;
- struct root_entry *entry;
-
- while (*p) {
- parent_node = *p;
- entry = rb_entry(parent_node, struct root_entry, node);
- if (entry->root_objectid > re->root_objectid)
- p = &(*p)->rb_left;
- else if (entry->root_objectid < re->root_objectid)
- p = &(*p)->rb_right;
- else
- return entry;
- }
-
- rb_link_node(&re->node, parent_node, p);
- rb_insert_color(&re->node, root);
- return NULL;
+ struct rb_node *node;
+ node = rb_find_add(&re->node, root, root_entry_root_objectid_cmp);
+ return rb_entry_safe(node, struct root_entry, node);
}
static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2)
@@ -161,48 +162,29 @@ static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2)
return 0;
}
+static int ref_entry_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ struct ref_entry *new_entry = rb_entry(new, struct ref_entry, node);
+ struct ref_entry *existing_entry = rb_entry(existing, struct ref_entry, node);
+
+ return comp_refs(new_entry, existing_entry);
+}
+
static struct ref_entry *insert_ref_entry(struct rb_root *root,
struct ref_entry *ref)
{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent_node = NULL;
- struct ref_entry *entry;
- int cmp;
-
- while (*p) {
- parent_node = *p;
- entry = rb_entry(parent_node, struct ref_entry, node);
- cmp = comp_refs(entry, ref);
- if (cmp > 0)
- p = &(*p)->rb_left;
- else if (cmp < 0)
- p = &(*p)->rb_right;
- else
- return entry;
- }
-
- rb_link_node(&ref->node, parent_node, p);
- rb_insert_color(&ref->node, root);
- return NULL;
+ struct rb_node *node;
+ node = rb_find_add(&ref->node, root, ref_entry_cmp);
+ return rb_entry_safe(node, struct ref_entry, node);
}
static struct root_entry *lookup_root_entry(struct rb_root *root, u64 objectid)
{
- struct rb_node *n;
- struct root_entry *entry = NULL;
+ struct rb_node *node;
- n = root->rb_node;
- while (n) {
- entry = rb_entry(n, struct root_entry, node);
- if (entry->root_objectid < objectid)
- n = n->rb_right;
- else if (entry->root_objectid > objectid)
- n = n->rb_left;
- else
- return entry;
- }
- return NULL;
+ node = rb_find(&objectid, root, root_entry_root_objectid_key_cmp);
+ return rb_entry_safe(node, struct root_entry, node);
}
#ifdef CONFIG_STACKTRACE
@@ -668,7 +650,7 @@ static void dump_block_entry(struct btrfs_fs_info *fs_info,
* our sanity checks pass as they are no longer needed.
*/
int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
- struct btrfs_ref *generic_ref)
+ const struct btrfs_ref *generic_ref)
{
struct ref_entry *ref = NULL, *exist;
struct ref_action *ra = NULL;
diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h
index 3511e1a5c96b..559bd25a2b7a 100644
--- a/fs/btrfs/ref-verify.h
+++ b/fs/btrfs/ref-verify.h
@@ -19,7 +19,7 @@ struct btrfs_ref;
int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info);
void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info);
int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
- struct btrfs_ref *generic_ref);
+ const struct btrfs_ref *generic_ref);
void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start,
u64 len);
@@ -39,7 +39,7 @@ static inline void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info)
}
static inline int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
- struct btrfs_ref *generic_ref)
+ const struct btrfs_ref *generic_ref)
{
return 0;
}
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 62161beca559..ce25ab7f0e99 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -46,11 +46,9 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
- goto out;
+ return ret;
}
- ret = btrfs_end_transaction(trans);
-out:
- return ret;
+ return btrfs_end_transaction(trans);
}
static int copy_inline_to_page(struct btrfs_inode *inode,
@@ -95,8 +93,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
if (ret < 0)
goto out_unlock;
- btrfs_clear_extent_bits(&inode->io_tree, file_offset, range_end,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG);
+ btrfs_clear_extent_bit(&inode->io_tree, file_offset, range_end,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, NULL);
ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
if (ret)
goto out_unlock;
@@ -270,11 +268,15 @@ copy_inline_extent:
drop_args.end = aligned_end;
drop_args.drop_cache = true;
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
write_extent_buffer(path->nodes[0], inline_data,
btrfs_item_ptr_offset(path->nodes[0],
@@ -283,6 +285,8 @@ copy_inline_extent:
btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found);
btrfs_set_inode_full_sync(inode);
ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
out:
if (!ret && !trans) {
/*
@@ -297,10 +301,8 @@ out:
trans = NULL;
}
}
- if (ret && trans) {
- btrfs_abort_transaction(trans, ret);
+ if (ret && trans)
btrfs_end_transaction(trans);
- }
if (!ret)
*trans_out = trans;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 02086191630d..e58151933844 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -90,10 +90,15 @@
* map address of tree root to tree
*/
struct mapping_node {
- struct {
- struct rb_node rb_node;
- u64 bytenr;
- }; /* Use rb_simle_node for search/insert */
+ union {
+ /* Use rb_simple_node for search/insert */
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ };
+
+ struct rb_simple_node simple_node;
+ };
void *data;
};
@@ -106,10 +111,15 @@ struct mapping_tree {
* present a tree block to process
*/
struct tree_block {
- struct {
- struct rb_node rb_node;
- u64 bytenr;
- }; /* Use rb_simple_node for search/insert */
+ union {
+ /* Use rb_simple_node for search/insert */
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ };
+
+ struct rb_simple_node simple_node;
+ };
u64 owner;
struct btrfs_key key;
u8 level;
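/*
 * Illustrative sketch, not part of the patch: why the anonymous struct
 * is wrapped in a union with struct rb_simple_node (defined in btrfs's
 * misc.h). Both views share the same layout, an rb_node followed by a
 * u64 bytenr, so the embedding struct can keep addressing ->rb_node and
 * ->bytenr directly while handing ->simple_node to the rb_simple_*
 * helpers. The struct below is hypothetical.
 */
struct example_block {
	union {
		struct {
			struct rb_node rb_node;
			u64 bytenr;
		};
		struct rb_simple_node simple_node;
	};
	void *payload;
};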
@@ -480,8 +490,7 @@ static int __add_reloc_root(struct btrfs_root *root)
node->data = root;
spin_lock(&rc->reloc_root_tree.lock);
- rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
- node->bytenr, &node->rb_node);
+ rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, &node->simple_node);
spin_unlock(&rc->reloc_root_tree.lock);
if (rb_node) {
btrfs_err(fs_info,
@@ -564,8 +573,7 @@ static int __update_reloc_root(struct btrfs_root *root)
spin_lock(&rc->reloc_root_tree.lock);
node->bytenr = root->node->start;
- rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
- node->bytenr, &node->rb_node);
+ rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, &node->simple_node);
spin_unlock(&rc->reloc_root_tree.lock);
if (rb_node)
btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
@@ -1516,7 +1524,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_root_level(root_item);
- atomic_inc(&reloc_root->node->refs);
+ refcount_inc(&reloc_root->node->refs);
path->nodes[level] = reloc_root->node;
path->slots[level] = 0;
} else {
@@ -2617,7 +2625,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
* tree.
*/
if (block->owner &&
- (!is_fstree(block->owner) ||
+ (!btrfs_is_fstree(block->owner) ||
block->owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) {
ret = relocate_cowonly_block(trans, rc, block, path);
if (ret)
@@ -2658,66 +2666,24 @@ static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control
u64 num_bytes;
int nr;
int ret = 0;
- u64 i_size = i_size_read(&inode->vfs_inode);
u64 prealloc_start = cluster->start - offset;
u64 prealloc_end = cluster->end - offset;
u64 cur_offset = prealloc_start;
/*
- * For subpage case, previous i_size may not be aligned to PAGE_SIZE.
- * This means the range [i_size, PAGE_END + 1) is filled with zeros by
- * btrfs_do_readpage() call of previously relocated file cluster.
+ * For the blocksize < folio size case (either bs < page size or
+ * large folios), all blocks beyond i_size are filled with zeros.
*
- * If the current cluster starts in the above range, btrfs_do_readpage()
+ * If the current cluster covers the above range, btrfs_do_readpage()
* will skip the read, and relocate_one_folio() will later writeback
* the padding zeros as new data, causing data corruption.
*
- * Here we have to manually invalidate the range (i_size, PAGE_END + 1).
+ * Here we have to invalidate the page cache covering our cluster.
*/
- if (!PAGE_ALIGNED(i_size)) {
- struct address_space *mapping = inode->vfs_inode.i_mapping;
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- const u32 sectorsize = fs_info->sectorsize;
- struct folio *folio;
-
- ASSERT(sectorsize < PAGE_SIZE);
- ASSERT(IS_ALIGNED(i_size, sectorsize));
-
- /*
- * Subpage can't handle page with DIRTY but without UPTODATE
- * bit as it can lead to the following deadlock:
- *
- * btrfs_read_folio()
- * | Page already *locked*
- * |- btrfs_lock_and_flush_ordered_range()
- * |- btrfs_start_ordered_extent()
- * |- extent_write_cache_pages()
- * |- lock_page()
- * We try to lock the page we already hold.
- *
- * Here we just writeback the whole data reloc inode, so that
- * we will be ensured to have no dirty range in the page, and
- * are safe to clear the uptodate bits.
- *
- * This shouldn't cause too much overhead, as we need to write
- * the data back anyway.
- */
- ret = filemap_write_and_wait(mapping);
- if (ret < 0)
- return ret;
-
- folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT);
- /*
- * If page is freed we don't need to do anything then, as we
- * will re-read the whole page anyway.
- */
- if (!IS_ERR(folio)) {
- btrfs_subpage_clear_uptodate(fs_info, folio, i_size,
- round_up(i_size, PAGE_SIZE) - i_size);
- folio_unlock(folio);
- folio_put(folio);
- }
- }
+ ret = filemap_invalidate_inode(&inode->vfs_inode, true, prealloc_start,
+ prealloc_end);
+ if (ret < 0)
+ return ret;
BUG_ON(cluster->start != cluster->boundary[0]);
ret = btrfs_alloc_data_chunk_ondemand(inode,
@@ -2806,13 +2772,15 @@ static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster,
static int relocate_one_folio(struct reloc_control *rc,
struct file_ra_state *ra,
- int *cluster_nr, unsigned long index)
+ int *cluster_nr, u64 *file_offset_ret)
{
const struct file_extent_cluster *cluster = &rc->cluster;
struct inode *inode = rc->data_inode;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ const u64 orig_file_offset = *file_offset_ret;
u64 offset = BTRFS_I(inode)->reloc_block_group_start;
- const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT;
+ const pgoff_t last_index = (cluster->end - offset) >> PAGE_SHIFT;
+ const pgoff_t index = orig_file_offset >> PAGE_SHIFT;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
struct folio *folio;
u64 folio_start;
@@ -2845,8 +2813,6 @@ again:
return PTR_ERR(folio);
}
- WARN_ON(folio_order(folio));
-
if (folio_test_readahead(folio) && !use_rst)
page_cache_async_readahead(inode->i_mapping, ra, NULL,
folio, last_index + 1 - index);
@@ -2875,7 +2841,7 @@ again:
goto release_folio;
folio_start = folio_pos(folio);
- folio_end = folio_start + PAGE_SIZE - 1;
+ folio_end = folio_start + folio_size(folio) - 1;
/*
* Start from the cluster, as for subpage case, the cluster can start
@@ -2923,7 +2889,8 @@ again:
* EXTENT_BOUNDARY bit prevents current extent from being merged
* with previous extent.
*/
- if (in_range(cluster->boundary[*cluster_nr] - offset, folio_start, PAGE_SIZE)) {
+ if (in_range(cluster->boundary[*cluster_nr] - offset,
+ folio_start, folio_size(folio))) {
u64 boundary_start = cluster->boundary[*cluster_nr] -
offset;
u64 boundary_end = boundary_start +
@@ -2953,6 +2920,7 @@ again:
btrfs_throttle(fs_info);
if (btrfs_should_cancel_balance(fs_info))
ret = -ECANCELED;
+ *file_offset_ret = folio_end + 1;
return ret;
release_folio:
@@ -2966,8 +2934,7 @@ static int relocate_file_extent_cluster(struct reloc_control *rc)
struct inode *inode = rc->data_inode;
const struct file_extent_cluster *cluster = &rc->cluster;
u64 offset = BTRFS_I(inode)->reloc_block_group_start;
- unsigned long index;
- unsigned long last_index;
+ u64 cur_file_offset = cluster->start - offset;
struct file_ra_state *ra;
int cluster_nr = 0;
int ret = 0;
@@ -2989,10 +2956,11 @@ static int relocate_file_extent_cluster(struct reloc_control *rc)
if (ret)
goto out;
- last_index = (cluster->end - offset) >> PAGE_SHIFT;
- for (index = (cluster->start - offset) >> PAGE_SHIFT;
- index <= last_index && !ret; index++)
- ret = relocate_one_folio(rc, ra, &cluster_nr, index);
+ while (cur_file_offset < cluster->end - offset) {
+ ret = relocate_one_folio(rc, ra, &cluster_nr, &cur_file_offset);
+ if (ret)
+ break;
+ }
if (ret == 0)
WARN_ON(cluster_nr != cluster->nr);
out:
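
With large folios the cluster can no longer be walked in fixed PAGE_SIZE steps, so relocate_one_folio() now reports where the folio it processed ends and the caller resumes from there. The iteration contract, sketched with a hypothetical helper:

	u64 cur = start;

	while (cur < end) {
		/* helper advances cur to folio_end + 1, whatever the folio size */
		int ret = process_one_folio(inode, &cur);

		if (ret)
			break;
	}
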
@@ -3155,7 +3123,7 @@ static int add_tree_block(struct reloc_control *rc,
block->key_ready = false;
block->owner = owner;
- rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
+ rb_node = rb_simple_insert(blocks, &block->simple_node);
if (rb_node)
btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr,
-EEXIST);
@@ -3643,7 +3611,7 @@ restart:
}
btrfs_release_path(path);
- btrfs_clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
+ btrfs_clear_extent_bit(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, NULL);
if (trans) {
btrfs_end_transaction_throttle(trans);
@@ -3880,7 +3848,7 @@ static void free_reloc_control(struct reloc_control *rc)
*/
static void describe_relocation(struct btrfs_block_group *block_group)
{
- char buf[128] = {'\0'};
+ char buf[128] = "NONE";
btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
@@ -3900,7 +3868,8 @@ static const char *stage_to_string(enum reloc_stage stage)
/*
* function to relocate all extents in a block group.
*/
-int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
+int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
+ bool verbose)
{
struct btrfs_block_group *bg;
struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start);
@@ -3992,7 +3961,8 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
goto out;
}
- describe_relocation(rc->block_group);
+ if (verbose)
+ describe_relocation(rc->block_group);
btrfs_wait_block_group_reservations(rc->block_group);
btrfs_wait_nocow_writers(rc->block_group);
@@ -4036,8 +4006,10 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
if (rc->extents_found == 0)
break;
- btrfs_info(fs_info, "found %llu extents, stage: %s",
- rc->extents_found, stage_to_string(finishes_stage));
+ if (verbose)
+ btrfs_info(fs_info, "found %llu extents, stage: %s",
+ rc->extents_found,
+ stage_to_string(finishes_stage));
}
WARN_ON(rc->block_group->pinned > 0);
@@ -4339,7 +4311,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
}
btrfs_backref_drop_node_buffer(node);
- atomic_inc(&cow->refs);
+ refcount_inc(&cow->refs);
node->eb = cow;
node->new_bytenr = cow->start;
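
The atomic_inc() calls on extent buffer refs become refcount_inc(), which presumes eb->refs has been converted to refcount_t elsewhere in the series; unlike a bare atomic_t, refcount_t WARNs on increment-from-zero and saturates instead of overflowing. The relevant API from <linux/refcount.h>, in a minimal sketch:

	#include <linux/refcount.h>

	struct obj {
		refcount_t refs;
	};

	static void obj_init(struct obj *o)
	{
		refcount_set(&o->refs, 1);	/* the initial reference */
	}

	static void obj_get(struct obj *o)
	{
		refcount_inc(&o->refs);		/* WARNs if refs was already 0 */
	}

	static bool obj_put(struct obj *o)
	{
		/* true when the last reference is dropped */
		return refcount_dec_and_test(&o->refs);
	}
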
diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
index 788c86d8633a..5c36b3f84b57 100644
--- a/fs/btrfs/relocation.h
+++ b/fs/btrfs/relocation.h
@@ -12,7 +12,8 @@ struct btrfs_trans_handle;
struct btrfs_ordered_extent;
struct btrfs_pending_snapshot;
-int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start);
+int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
+ bool verbose);
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root);
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ce36fafc771e..6776e6ab8d10 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -556,8 +556,8 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
* hold all of the paths here
*/
for (i = 0; i < ipath->fspath->elem_cnt; ++i)
- btrfs_warn_in_rcu(fs_info,
-"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
+ btrfs_warn(fs_info,
+"scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu length %u links %u (path: %s)",
swarn->errstr, swarn->logical,
btrfs_dev_name(swarn->dev),
swarn->physical,
@@ -570,8 +570,8 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
return 0;
err:
- btrfs_warn_in_rcu(fs_info,
- "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
+ btrfs_warn(fs_info,
+ "scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu: path resolving failed with ret=%d",
swarn->errstr, swarn->logical,
btrfs_dev_name(swarn->dev),
swarn->physical,
@@ -596,7 +596,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
/* Super block error, no need to search extent tree. */
if (is_super) {
- btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
+ btrfs_warn(fs_info, "scrub: %s on device %s, physical %llu",
errstr, btrfs_dev_name(dev), physical);
return;
}
@@ -631,14 +631,14 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
&ref_level);
if (ret < 0) {
btrfs_warn(fs_info,
- "failed to resolve tree backref for logical %llu: %d",
- swarn.logical, ret);
+ "scrub: failed to resolve tree backref for logical %llu: %d",
+ swarn.logical, ret);
break;
}
if (ret > 0)
break;
- btrfs_warn_in_rcu(fs_info,
-"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
+ btrfs_warn(fs_info,
+"scrub: %s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
errstr, swarn.logical, btrfs_dev_name(dev),
swarn.physical, (ref_level ? "node" : "leaf"),
ref_level, ref_root);
@@ -718,7 +718,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad bytenr, has %llu want %llu",
+ "scrub: tree block %llu mirror %u has bad bytenr, has %llu want %llu",
logical, stripe->mirror_num,
btrfs_stack_header_bytenr(header), logical);
return;
@@ -728,7 +728,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad fsid, has %pU want %pU",
+ "scrub: tree block %llu mirror %u has bad fsid, has %pU want %pU",
logical, stripe->mirror_num,
header->fsid, fs_info->fs_devices->fsid);
return;
@@ -738,7 +738,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
+ "scrub: tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
logical, stripe->mirror_num,
header->chunk_tree_uuid, fs_info->chunk_tree_uuid);
return;
@@ -760,7 +760,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
+"scrub: tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
logical, stripe->mirror_num,
CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
@@ -771,7 +771,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
scrub_bitmap_set_meta_gen_error(stripe, sector_nr, sectors_per_tree);
scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad generation, has %llu want %llu",
+ "scrub: tree block %llu mirror %u has bad generation, has %llu want %llu",
logical, stripe->mirror_num,
btrfs_stack_header_generation(header),
stripe->sectors[sector_nr].generation);
@@ -814,7 +814,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
*/
if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) {
btrfs_warn_rl(fs_info,
- "tree block at %llu crosses stripe boundary %llu",
+ "scrub: tree block at %llu crosses stripe boundary %llu",
stripe->logical +
(sector_nr << fs_info->sectorsize_bits),
stripe->logical);
@@ -1045,13 +1045,13 @@ skip:
*/
if (repaired) {
if (dev) {
- btrfs_err_rl_in_rcu(fs_info,
- "fixed up error at logical %llu on dev %s physical %llu",
+ btrfs_err_rl(fs_info,
+ "scrub: fixed up error at logical %llu on dev %s physical %llu",
stripe->logical, btrfs_dev_name(dev),
physical);
} else {
- btrfs_err_rl_in_rcu(fs_info,
- "fixed up error at logical %llu on mirror %u",
+ btrfs_err_rl(fs_info,
+ "scrub: fixed up error at logical %llu on mirror %u",
stripe->logical, stripe->mirror_num);
}
continue;
@@ -1059,13 +1059,13 @@ skip:
/* The remaining are all for unrepaired. */
if (dev) {
- btrfs_err_rl_in_rcu(fs_info,
- "unable to fixup (regular) error at logical %llu on dev %s physical %llu",
+ btrfs_err_rl(fs_info,
+"scrub: unable to fixup (regular) error at logical %llu on dev %s physical %llu",
stripe->logical, btrfs_dev_name(dev),
physical);
} else {
- btrfs_err_rl_in_rcu(fs_info,
- "unable to fixup (regular) error at logical %llu on mirror %u",
+ btrfs_err_rl(fs_info,
+ "scrub: unable to fixup (regular) error at logical %llu on mirror %u",
stripe->logical, stripe->mirror_num);
}
@@ -1593,8 +1593,7 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
physical,
sctx->write_pointer);
if (ret)
- btrfs_err(fs_info,
- "zoned: failed to recover write pointer");
+ btrfs_err(fs_info, "scrub: zoned: failed to recover write pointer");
}
mutex_unlock(&sctx->wr_lock);
btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
@@ -1658,7 +1657,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
int ret;
if (unlikely(!extent_root || !csum_root)) {
- btrfs_err(fs_info, "no valid extent or csum root for scrub");
+ btrfs_err(fs_info, "scrub: no valid extent or csum root found");
return -EUCLEAN;
}
memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) *
@@ -1807,7 +1806,7 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe)
struct btrfs_io_context *bioc = NULL;
const u64 logical = stripe->logical +
(i << fs_info->sectorsize_bits);
- int err;
+ int ret;
io_stripe.rst_search_commit_root = true;
stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits;
@@ -1815,11 +1814,11 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe)
* For RST cases, we need to manually split the bbio to
* follow the RST boundary.
*/
- err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
&stripe_len, &bioc, &io_stripe, &mirror);
btrfs_put_bioc(bioc);
- if (err < 0) {
- if (err != -ENODATA) {
+ if (ret < 0) {
+ if (ret != -ENODATA) {
/*
* Earlier btrfs_get_raid_extent_offset()
* returned -ENODATA, which means there's
@@ -1907,7 +1906,7 @@ static bool stripe_has_metadata_error(struct scrub_stripe *stripe)
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
btrfs_err(fs_info,
- "stripe %llu has unrepaired metadata sector at %llu",
+ "scrub: stripe %llu has unrepaired metadata sector at logical %llu",
stripe->logical,
stripe->logical + (i << fs_info->sectorsize_bits));
return true;
@@ -2167,7 +2166,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
bitmap_and(&error, &error, &has_extent, stripe->nr_sectors);
if (!bitmap_empty(&error, stripe->nr_sectors)) {
btrfs_err(fs_info,
-"unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
+"scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
full_stripe_start, i, stripe->nr_sectors,
&error);
ret = -EIO;
@@ -2789,14 +2788,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
ro_set = 0;
} else if (ret == -ETXTBSY) {
btrfs_warn(fs_info,
- "skipping scrub of block group %llu due to active swapfile",
+ "scrub: skipping scrub of block group %llu due to active swapfile",
cache->start);
scrub_pause_off(fs_info);
ret = 0;
goto skip_unfreeze;
} else {
- btrfs_warn(fs_info,
- "failed setting block group ro: %d", ret);
+ btrfs_warn(fs_info, "scrub: failed setting block group ro: %d",
+ ret);
btrfs_unfreeze_block_group(cache);
btrfs_put_block_group(cache);
scrub_pause_off(fs_info);
@@ -2892,13 +2891,13 @@ static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
ret = btrfs_check_super_csum(fs_info, sb);
if (ret != 0) {
btrfs_err_rl(fs_info,
- "super block at physical %llu devid %llu has bad csum",
+ "scrub: super block at physical %llu devid %llu has bad csum",
physical, dev->devid);
return -EIO;
}
if (btrfs_super_generation(sb) != generation) {
btrfs_err_rl(fs_info,
-"super block at physical %llu devid %llu has bad generation %llu expect %llu",
+"scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu",
physical, dev->devid,
btrfs_super_generation(sb), generation);
return -EUCLEAN;
@@ -3058,8 +3057,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (!is_dev_replace && !readonly &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- btrfs_err_in_rcu(fs_info,
- "scrub on devid %llu: filesystem on %s is not writable",
+ btrfs_err(fs_info,
+ "scrub: devid %llu: filesystem on %s is not writable",
devid, btrfs_dev_name(dev));
ret = -EROFS;
goto out;
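
The scrub.c hunks drop the _in_rcu message helpers and hard-code a "scrub: " prefix into every format string, making the subsystem trivially greppable in the log. For plain printk users the usual way to get such a fixed prefix is the pr_fmt() idiom shown below; btrfs messages go through btrfs_warn()/btrfs_err() (which embed the filesystem identifier), so the prefix is spelled out per string instead. Illustrative only:

	/* must be defined before any include that pulls in printk.h */
	#define pr_fmt(fmt) "scrub: " fmt

	#include <linux/printk.h>
	#include <linux/types.h>

	static void report_bad_block(u64 logical)
	{
		/* emits "scrub: tree block ... has a bad checksum" */
		pr_warn("tree block %llu has a bad checksum\n", logical);
	}
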
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 2891ec4056c6..7664025a5af4 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4,6 +4,7 @@
*/
#include <linux/bsearch.h>
+#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/sort.h>
@@ -758,7 +759,7 @@ static int send_header(struct send_ctx *sctx)
{
struct btrfs_stream_header hdr;
- strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
+ strscpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
hdr.version = cpu_to_le32(sctx->proto);
return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
&sctx->send_off);
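
strscpy() is the bounded replacement for strcpy(): it always NUL-terminates and returns the copied length, or -E2BIG when the source had to be truncated. When the destination is a real array, recent kernels let the size argument be omitted and infer it with sizeof(), which is the two-argument form used above. A sketch, assuming the magic string fits:

	char magic[sizeof(BTRFS_SEND_STREAM_MAGIC)];
	ssize_t n;

	n = strscpy(magic, BTRFS_SEND_STREAM_MAGIC);	/* size inferred from the array */
	if (n < 0)
		return n;	/* -E2BIG: truncated, but still NUL-terminated */
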
@@ -1804,7 +1805,7 @@ static int gen_unique_name(struct send_ctx *sctx,
ino, gen, idx);
ASSERT(len < sizeof(tmp));
tmp_name.name = tmp;
- tmp_name.len = strlen(tmp);
+ tmp_name.len = len;
di = btrfs_lookup_dir_item(NULL, sctx->send_root,
path, BTRFS_FIRST_FREE_OBJECTID,
@@ -1843,7 +1844,7 @@ static int gen_unique_name(struct send_ctx *sctx,
break;
}
- ret = fs_path_add(dest, tmp, strlen(tmp));
+ ret = fs_path_add(dest, tmp, len);
out:
btrfs_free_path(path);
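
gen_unique_name() already knows the generated name's length: snprintf() returns the number of characters it would have written, excluding the trailing NUL, and the ASSERT guarantees no truncation happened, so len equals strlen(tmp) and the extra strlen() calls were redundant. The pattern, with ino/gen/idx taken from the surrounding function:

	char tmp[64];
	int len;

	len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", ino, gen, idx);
	ASSERT(len < sizeof(tmp));	/* no truncation, so len == strlen(tmp) */
	tmp_name.len = len;		/* instead of strlen(tmp) */
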
@@ -4628,7 +4629,6 @@ static int rbtree_ref_comp(const void *k, const struct rb_node *node)
{
const struct recorded_ref *data = k;
const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node);
- int result;
if (data->dir > ref->dir)
return 1;
@@ -4642,12 +4642,7 @@ static int rbtree_ref_comp(const void *k, const struct rb_node *node)
return 1;
if (data->name_len < ref->name_len)
return -1;
- result = strcmp(data->name, ref->name);
- if (result > 0)
- return 1;
- if (result < 0)
- return -1;
- return 0;
+ return strcmp(data->name, ref->name);
}
static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent)
@@ -5411,6 +5406,30 @@ tlv_put_failure:
return ret;
}
+static int send_fallocate(struct send_ctx *sctx, u32 mode, u64 offset, u64 len)
+{
+ struct fs_path *path;
+ int ret;
+
+ path = get_cur_inode_path(sctx);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_FALLOCATE);
+ if (ret < 0)
+ return ret;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+ TLV_PUT_U32(sctx, BTRFS_SEND_A_FALLOCATE_MODE, mode);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+ return ret;
+}
+
static int send_hole(struct send_ctx *sctx, u64 end)
{
struct fs_path *p = NULL;
@@ -5419,6 +5438,14 @@ static int send_hole(struct send_ctx *sctx, u64 end)
int ret = 0;
/*
+ * Starting with send stream v2 we have fallocate and can use it to
+ * punch holes instead of sending writes full of zeroes.
+ */
+ if (proto_cmd_ok(sctx, BTRFS_SEND_C_FALLOCATE))
+ return send_fallocate(sctx, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ offset, end - offset);
+
+ /*
* A hole that starts at EOF or beyond it. Since we do not yet support
* fallocate (for extent preallocation and hole punching), sending a
* write of zeroes starting at EOF or beyond would later require issuing
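
On the receiving side, BTRFS_SEND_C_FALLOCATE maps onto fallocate(2): FALLOC_FL_PUNCH_HOLE deallocates the byte range while FALLOC_FL_KEEP_SIZE preserves i_size, which is exactly what a hole is, and far cheaper than streaming zero-filled writes. A userspace illustration of those semantics (not the actual receiver code):

	#define _GNU_SOURCE
	#include <fcntl.h>	/* fallocate(), FALLOC_FL_* */

	/* Deallocate [offset, offset + len) without changing the file size. */
	static int punch_hole(int fd, off_t offset, off_t len)
	{
		return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				 offset, len);
	}
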
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index d9087aa81b21..0481c693ac2e 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -615,7 +615,7 @@ static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info,
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info, u64 bytes,
- int dump_block_groups)
+ bool dump_block_groups)
{
struct btrfs_block_group *cache;
u64 total_avail = 0;
@@ -1887,7 +1887,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
space_info->flags, orig_bytes, 1);
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
- btrfs_dump_space_info(fs_info, space_info, orig_bytes, 0);
+ btrfs_dump_space_info(fs_info, space_info, orig_bytes, false);
}
return ret;
}
@@ -1918,7 +1918,7 @@ int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes,
trace_btrfs_space_reservation(fs_info, "space_info:enospc",
space_info->flags, bytes, 1);
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
- btrfs_dump_space_info(fs_info, space_info, bytes, 0);
+ btrfs_dump_space_info(fs_info, space_info, bytes, false);
}
return ret;
}
@@ -1973,13 +1973,13 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
static u64 calc_pct_ratio(u64 x, u64 y)
{
- int err;
+ int ret;
if (!y)
return 0;
again:
- err = check_mul_overflow(100, x, &x);
- if (err)
+ ret = check_mul_overflow(100, x, &x);
+ if (ret)
goto lose_precision;
return div64_u64(x, y);
lose_precision:
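
check_mul_overflow() from <linux/overflow.h> evaluates the product, stores the (possibly wrapped) result through its last argument, and returns true when it overflowed; calc_pct_ratio() uses that as the trigger to retry with reduced precision. A minimal sketch of the contract, with the precision fallback elided:

	#include <linux/math64.h>
	#include <linux/overflow.h>

	static u64 pct_sketch(u64 x, u64 y)
	{
		u64 prod;

		if (!y)
			return 0;
		if (check_mul_overflow(100, x, &prod))
			return 0;	/* the real code retries with reduced precision */
		return div64_u64(prod, y);	/* x * 100 / y */
	}
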
@@ -2139,7 +2139,7 @@ void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool
}
}
-bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info)
+static bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info)
{
bool ret;
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 92b7f5e2b850..679f22efb407 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -278,7 +278,7 @@ u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info,
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info, u64 bytes,
- int dump_block_groups);
+ bool dump_block_groups);
int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
u64 orig_bytes,
@@ -306,7 +306,6 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes);
void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready);
-bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info);
int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info);
void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info);
void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len);
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index d4f019233493..c9b3821957f7 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -49,7 +49,7 @@
* Implementation:
*
* - Common
- * Both metadata and data will use a new structure, btrfs_subpage, to
+ * Both metadata and data will use a new structure, btrfs_folio_state, to
* record the status of each sector inside a page. This provides the extra
* granularity needed.
*
@@ -63,10 +63,10 @@
* This means a slightly higher tree locking latency.
*/
-int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
- struct folio *folio, enum btrfs_subpage_type type)
+int btrfs_attach_folio_state(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, enum btrfs_folio_type type)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
/* For metadata we don't support large folio yet. */
if (type == BTRFS_SUBPAGE_METADATA)
@@ -87,18 +87,18 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio))
return 0;
- subpage = btrfs_alloc_subpage(fs_info, folio_size(folio), type);
- if (IS_ERR(subpage))
- return PTR_ERR(subpage);
+ bfs = btrfs_alloc_folio_state(fs_info, folio_size(folio), type);
+ if (IS_ERR(bfs))
+ return PTR_ERR(bfs);
- folio_attach_private(folio, subpage);
+ folio_attach_private(folio, bfs);
return 0;
}
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio,
- enum btrfs_subpage_type type)
+void btrfs_detach_folio_state(const struct btrfs_fs_info *fs_info, struct folio *folio,
+ enum btrfs_folio_type type)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
/* Either not subpage, or the folio already has private attached. */
if (!folio_test_private(folio))
@@ -108,15 +108,15 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *fol
if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio))
return;
- subpage = folio_detach_private(folio);
- ASSERT(subpage);
- btrfs_free_subpage(subpage);
+ bfs = folio_detach_private(folio);
+ ASSERT(bfs);
+ btrfs_free_folio_state(bfs);
}
-struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
- size_t fsize, enum btrfs_subpage_type type)
+struct btrfs_folio_state *btrfs_alloc_folio_state(const struct btrfs_fs_info *fs_info,
+ size_t fsize, enum btrfs_folio_type type)
{
- struct btrfs_subpage *ret;
+ struct btrfs_folio_state *ret;
unsigned int real_size;
ASSERT(fs_info->sectorsize < fsize);
@@ -136,11 +136,6 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
return ret;
}
-void btrfs_free_subpage(struct btrfs_subpage *subpage)
-{
- kfree(subpage);
-}
-
/*
* Increase the eb_refs of current subpage.
*
@@ -152,7 +147,7 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage)
*/
void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
if (!btrfs_meta_is_subpage(fs_info))
return;
@@ -160,13 +155,13 @@ void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *
ASSERT(folio_test_private(folio) && folio->mapping);
lockdep_assert_held(&folio->mapping->i_private_lock);
- subpage = folio_get_private(folio);
- atomic_inc(&subpage->eb_refs);
+ bfs = folio_get_private(folio);
+ atomic_inc(&bfs->eb_refs);
}
void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
if (!btrfs_meta_is_subpage(fs_info))
return;
@@ -174,9 +169,9 @@ void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *
ASSERT(folio_test_private(folio) && folio->mapping);
lockdep_assert_held(&folio->mapping->i_private_lock);
- subpage = folio_get_private(folio);
- ASSERT(atomic_read(&subpage->eb_refs));
- atomic_dec(&subpage->eb_refs);
+ bfs = folio_get_private(folio);
+ ASSERT(atomic_read(&bfs->eb_refs));
+ atomic_dec(&bfs->eb_refs);
}
static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
@@ -191,8 +186,9 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
* unmapped page like dummy extent buffer pages.
*/
if (folio->mapping)
- ASSERT(folio_pos(folio) <= start &&
- start + len <= folio_pos(folio) + folio_size(folio));
+ ASSERT(folio_pos(folio) <= start && start + len <= folio_end(folio),
+ "start=%llu len=%u folio_pos=%llu folio_size=%zu",
+ start, len, folio_pos(folio), folio_size(folio));
}
#define subpage_calc_start_bit(fs_info, folio, name, start, len) \
@@ -221,14 +217,13 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
if (folio_pos(folio) >= orig_start + orig_len)
*len = 0;
else
- *len = min_t(u64, folio_pos(folio) + folio_size(folio),
- orig_start + orig_len) - *start;
+ *len = min_t(u64, folio_end(folio), orig_start + orig_len) - *start;
}
static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
const int nbits = (len >> fs_info->sectorsize_bits);
unsigned long flags;
@@ -238,7 +233,7 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
btrfs_subpage_assert(fs_info, folio, start, len);
- spin_lock_irqsave(&subpage->lock, flags);
+ spin_lock_irqsave(&bfs->lock, flags);
/*
* We have call sites passing @lock_page into
* extent_clear_unlock_delalloc() for compression path.
@@ -246,18 +241,18 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
* This @locked_page is locked by plain lock_page(), thus its
* subpage::locked is 0. Handle them in a special way.
*/
- if (atomic_read(&subpage->nr_locked) == 0) {
- spin_unlock_irqrestore(&subpage->lock, flags);
+ if (atomic_read(&bfs->nr_locked) == 0) {
+ spin_unlock_irqrestore(&bfs->lock, flags);
return true;
}
- for_each_set_bit_from(bit, subpage->bitmaps, start_bit + nbits) {
- clear_bit(bit, subpage->bitmaps);
+ for_each_set_bit_from(bit, bfs->bitmaps, start_bit + nbits) {
+ clear_bit(bit, bfs->bitmaps);
cleared++;
}
- ASSERT(atomic_read(&subpage->nr_locked) >= cleared);
- last = atomic_sub_and_test(cleared, &subpage->nr_locked);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ ASSERT(atomic_read(&bfs->nr_locked) >= cleared);
+ last = atomic_sub_and_test(cleared, &bfs->nr_locked);
+ spin_unlock_irqrestore(&bfs->lock, flags);
return last;
}
@@ -280,7 +275,7 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
ASSERT(folio_test_locked(folio));
@@ -296,7 +291,7 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
* Since we own the page lock, no one else could touch subpage::locked
* and we are safe to do several atomic operations without spinlock.
*/
- if (atomic_read(&subpage->nr_locked) == 0) {
+ if (atomic_read(&bfs->nr_locked) == 0) {
/* No subpage lock, locked by plain lock_page(). */
folio_unlock(folio);
return;
@@ -310,7 +305,7 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
struct folio *folio, unsigned long bitmap)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
const int start_bit = blocks_per_folio * btrfs_bitmap_nr_locked;
unsigned long flags;
@@ -323,42 +318,42 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
return;
}
- if (atomic_read(&subpage->nr_locked) == 0) {
+ if (atomic_read(&bfs->nr_locked) == 0) {
/* No subpage lock, locked by plain lock_page(). */
folio_unlock(folio);
return;
}
- spin_lock_irqsave(&subpage->lock, flags);
+ spin_lock_irqsave(&bfs->lock, flags);
for_each_set_bit(bit, &bitmap, blocks_per_folio) {
- if (test_and_clear_bit(bit + start_bit, subpage->bitmaps))
+ if (test_and_clear_bit(bit + start_bit, bfs->bitmaps))
cleared++;
}
- ASSERT(atomic_read(&subpage->nr_locked) >= cleared);
- last = atomic_sub_and_test(cleared, &subpage->nr_locked);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ ASSERT(atomic_read(&bfs->nr_locked) >= cleared);
+ last = atomic_sub_and_test(cleared, &bfs->nr_locked);
+ spin_unlock_irqrestore(&bfs->lock, flags);
if (last)
folio_unlock(folio);
}
#define subpage_test_bitmap_all_set(fs_info, folio, name) \
({ \
- struct btrfs_subpage *subpage = folio_get_private(folio); \
+ struct btrfs_folio_state *bfs = folio_get_private(folio); \
const unsigned int blocks_per_folio = \
btrfs_blocks_per_folio(fs_info, folio); \
\
- bitmap_test_range_all_set(subpage->bitmaps, \
+ bitmap_test_range_all_set(bfs->bitmaps, \
blocks_per_folio * btrfs_bitmap_nr_##name, \
blocks_per_folio); \
})
#define subpage_test_bitmap_all_zero(fs_info, folio, name) \
({ \
- struct btrfs_subpage *subpage = folio_get_private(folio); \
+ struct btrfs_folio_state *bfs = folio_get_private(folio); \
const unsigned int blocks_per_folio = \
btrfs_blocks_per_folio(fs_info, folio); \
\
- bitmap_test_range_all_zero(subpage->bitmaps, \
+ bitmap_test_range_all_zero(bfs->bitmaps, \
blocks_per_folio * btrfs_bitmap_nr_##name, \
blocks_per_folio); \
})
@@ -366,43 +361,43 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
uptodate, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_set(fs_info, folio, uptodate))
folio_mark_uptodate(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
uptodate, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
folio_clear_uptodate(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
dirty, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_unlock_irqrestore(&bfs->lock, flags);
folio_mark_dirty(folio);
}
@@ -419,17 +414,17 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
dirty, start, len);
unsigned long flags;
bool last = false;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_zero(fs_info, folio, dirty))
last = true;
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
return last;
}
@@ -446,91 +441,91 @@ void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
writeback, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (!folio_test_writeback(folio))
folio_start_writeback(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
writeback, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_zero(fs_info, folio, writeback)) {
ASSERT(folio_test_writeback(folio));
folio_end_writeback(folio);
}
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
ordered, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
folio_set_ordered(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
ordered, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_zero(fs_info, folio, ordered))
folio_clear_ordered(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
checked, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_set(fs_info, folio, checked))
folio_set_checked(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
checked, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
folio_clear_checked(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
/*
@@ -541,16 +536,16 @@ void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
- struct btrfs_subpage *subpage = folio_get_private(folio); \
+ struct btrfs_folio_state *bfs = folio_get_private(folio); \
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, \
name, start, len); \
unsigned long flags; \
bool ret; \
\
- spin_lock_irqsave(&subpage->lock, flags); \
- ret = bitmap_test_range_all_set(subpage->bitmaps, start_bit, \
+ spin_lock_irqsave(&bfs->lock, flags); \
+ ret = bitmap_test_range_all_set(bfs->bitmaps, start_bit, \
len >> fs_info->sectorsize_bits); \
- spin_unlock_irqrestore(&subpage->lock, flags); \
+ spin_unlock_irqrestore(&bfs->lock, flags); \
return ret; \
}
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
@@ -662,10 +657,10 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
{ \
const unsigned int blocks_per_folio = \
btrfs_blocks_per_folio(fs_info, folio); \
- const struct btrfs_subpage *subpage = folio_get_private(folio); \
+ const struct btrfs_folio_state *bfs = folio_get_private(folio); \
\
ASSERT(blocks_per_folio <= BITS_PER_LONG); \
- *dst = bitmap_read(subpage->bitmaps, \
+ *dst = bitmap_read(bfs->bitmaps, \
blocks_per_folio * btrfs_bitmap_nr_##name, \
blocks_per_folio); \
}
@@ -690,7 +685,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
unsigned int start_bit;
unsigned int nbits;
unsigned long flags;
@@ -705,15 +700,15 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len);
nbits = len >> fs_info->sectorsize_bits;
- subpage = folio_get_private(folio);
- ASSERT(subpage);
- spin_lock_irqsave(&subpage->lock, flags);
- if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) {
+ bfs = folio_get_private(folio);
+ ASSERT(bfs);
+ spin_lock_irqsave(&bfs->lock, flags);
+ if (unlikely(!bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits))) {
SUBPAGE_DUMP_BITMAP(fs_info, folio, dirty, start, len);
- ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
+ ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits));
}
- ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
- spin_unlock_irqrestore(&subpage->lock, flags);
+ ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits));
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
/*
@@ -726,7 +721,7 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
unsigned long flags;
unsigned int start_bit;
unsigned int nbits;
@@ -736,19 +731,19 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio))
return;
- subpage = folio_get_private(folio);
+ bfs = folio_get_private(folio);
start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
nbits = len >> fs_info->sectorsize_bits;
- spin_lock_irqsave(&subpage->lock, flags);
+ spin_lock_irqsave(&bfs->lock, flags);
/* Target range should not yet be locked. */
- if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) {
+ if (unlikely(!bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits))) {
SUBPAGE_DUMP_BITMAP(fs_info, folio, locked, start, len);
- ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
+ ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits));
}
- bitmap_set(subpage->bitmaps, start_bit, nbits);
- ret = atomic_add_return(nbits, &subpage->nr_locked);
+ bitmap_set(bfs->bitmaps, start_bit, nbits);
+ ret = atomic_add_return(nbits, &bfs->nr_locked);
ASSERT(ret <= btrfs_blocks_per_folio(fs_info, folio));
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
/*
@@ -776,7 +771,7 @@ bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct ext
void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
unsigned long uptodate_bitmap;
unsigned long dirty_bitmap;
@@ -788,18 +783,18 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
ASSERT(folio_test_private(folio) && folio_get_private(folio));
ASSERT(blocks_per_folio > 1);
- subpage = folio_get_private(folio);
+ bfs = folio_get_private(folio);
- spin_lock_irqsave(&subpage->lock, flags);
+ spin_lock_irqsave(&bfs->lock, flags);
GET_SUBPAGE_BITMAP(fs_info, folio, uptodate, &uptodate_bitmap);
GET_SUBPAGE_BITMAP(fs_info, folio, dirty, &dirty_bitmap);
GET_SUBPAGE_BITMAP(fs_info, folio, writeback, &writeback_bitmap);
GET_SUBPAGE_BITMAP(fs_info, folio, ordered, &ordered_bitmap);
GET_SUBPAGE_BITMAP(fs_info, folio, checked, &checked_bitmap);
GET_SUBPAGE_BITMAP(fs_info, folio, locked, &locked_bitmap);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
- dump_page(folio_page(folio, 0), "btrfs subpage dump");
+ dump_page(folio_page(folio, 0), "btrfs folio state dump");
btrfs_warn(fs_info,
"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
start, len, folio_pos(folio),
@@ -815,14 +810,14 @@ void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
struct folio *folio,
unsigned long *ret_bitmap)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
unsigned long flags;
ASSERT(folio_test_private(folio) && folio_get_private(folio));
ASSERT(btrfs_blocks_per_folio(fs_info, folio) > 1);
- subpage = folio_get_private(folio);
+ bfs = folio_get_private(folio);
- spin_lock_irqsave(&subpage->lock, flags);
+ spin_lock_irqsave(&bfs->lock, flags);
GET_SUBPAGE_BITMAP(fs_info, folio, dirty, ret_bitmap);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
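
All of the helpers above share one addressing scheme: btrfs_folio_state::bitmaps packs one window per state (uptodate, dirty, writeback, ...), each blocks_per_folio bits wide, and a byte range is mapped into a window via its block index inside the folio. A sketch of the arithmetic behind subpage_calc_start_bit(), with hypothetical names:

	struct geom {
		unsigned int sectorsize_bits;	/* log2 of the block size */
		unsigned int blocks_per_folio;
	};

	static unsigned int calc_start_bit(const struct geom *g, unsigned int bitmap_nr,
					   u64 folio_pos, u64 start)
	{
		/* which block inside the folio the range starts at */
		unsigned int block = (start - folio_pos) >> g->sectorsize_bits;

		/* each state owns one contiguous blocks_per_folio-bit window */
		return bitmap_nr * g->blocks_per_folio + block;
	}
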
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 3042c5ea840a..ee0710eb13fd 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -32,9 +32,31 @@ struct folio;
enum {
btrfs_bitmap_nr_uptodate = 0,
btrfs_bitmap_nr_dirty,
+
+ /*
+ * This can be changed to atomic eventually, but that change will
+ * rely on the async delalloc range rework for the locked bitmap,
+ * as async delalloc can unlock its range and mark blocks writeback
+ * at random times.
+ */
btrfs_bitmap_nr_writeback,
+
+ /*
+ * The ordered and checked flags are for COW fixup, already marked
+ * deprecated, and will be removed eventually.
+ */
btrfs_bitmap_nr_ordered,
btrfs_bitmap_nr_checked,
+
+ /*
+ * The locked bit is for the async delalloc range (compression):
+ * currently an async extent is queued with the range locked until
+ * the compression is done, so an async extent can unlock the range
+ * at any time.
+ *
+ * Deprecating this flag will require a rework of the async extent
+ * lifespan (mark writeback and do compression).
+ */
btrfs_bitmap_nr_locked,
btrfs_bitmap_nr_max
};
@@ -43,7 +65,7 @@ enum {
* Structure to trace status of each sector inside a page, attached to
* page::private for both data and metadata inodes.
*/
-struct btrfs_subpage {
+struct btrfs_folio_state {
/* Common members for both data and metadata pages */
spinlock_t lock;
union {
@@ -51,7 +73,7 @@ struct btrfs_subpage {
* Structures only used by metadata
*
* @eb_refs should only be operated under private_lock, as it
- * manages whether the subpage can be detached.
+ * manages whether the btrfs_folio_state can be detached.
*/
atomic_t eb_refs;
@@ -65,12 +87,11 @@ struct btrfs_subpage {
unsigned long bitmaps[];
};
-enum btrfs_subpage_type {
+enum btrfs_folio_type {
BTRFS_SUBPAGE_METADATA,
BTRFS_SUBPAGE_DATA,
};
-#if PAGE_SIZE > BTRFS_MIN_BLOCKSIZE
/*
* Subpage support for metadata is more complex, as we can have dummy extent
* buffers, where folios have no mapping to determine the owning inode.
@@ -91,29 +112,19 @@ static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info,
ASSERT(is_data_inode(BTRFS_I(folio->mapping->host)));
return fs_info->sectorsize < folio_size(folio);
}
-#else
-static inline bool btrfs_meta_is_subpage(const struct btrfs_fs_info *fs_info)
-{
- return false;
-}
-static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info,
- struct folio *folio)
-{
- if (folio->mapping && folio->mapping->host)
- ASSERT(is_data_inode(BTRFS_I(folio->mapping->host)));
- return false;
-}
-#endif
-int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
- struct folio *folio, enum btrfs_subpage_type type);
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio,
- enum btrfs_subpage_type type);
+int btrfs_attach_folio_state(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, enum btrfs_folio_type type);
+void btrfs_detach_folio_state(const struct btrfs_fs_info *fs_info, struct folio *folio,
+ enum btrfs_folio_type type);
/* Allocate additional data where page represents more than one sector */
-struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
- size_t fsize, enum btrfs_subpage_type type);
-void btrfs_free_subpage(struct btrfs_subpage *subpage);
+struct btrfs_folio_state *btrfs_alloc_folio_state(const struct btrfs_fs_info *fs_info,
+ size_t fsize, enum btrfs_folio_type type);
+static inline void btrfs_free_folio_state(struct btrfs_folio_state *bfs)
+{
+ kfree(bfs);
+}
void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
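
Dropping the #if PAGE_SIZE > BTRFS_MIN_BLOCKSIZE stubs reflects that, with large folios, needing per-block state is a property of each folio rather than of the build: a folio can exceed the block size even when PAGE_SIZE equals it. The whole decision reduces to one runtime comparison, sketched here:

	/* per-block tracking is needed whenever a folio spans more than
	 * one filesystem block */
	static inline bool needs_folio_state(u32 blocksize, size_t folio_size)
	{
		return blocksize < folio_size;
	}
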
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a0c65adce1ab..68e35a3700ff 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -261,10 +261,65 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = {
{}
};
-/* No support for restricting writes to btrfs devices yet... */
-static inline blk_mode_t btrfs_open_mode(struct fs_context *fc)
+static bool btrfs_match_compress_type(const char *string, const char *type, bool may_have_level)
{
- return sb_open_mode(fc->sb_flags) & ~BLK_OPEN_RESTRICT_WRITES;
+ const int len = strlen(type);
+
+ return (strncmp(string, type, len) == 0) &&
+ ((may_have_level && string[len] == ':') || string[len] == '\0');
+}
+
+static int btrfs_parse_compress(struct btrfs_fs_context *ctx,
+ const struct fs_parameter *param, int opt)
+{
+ const char *string = param->string;
+
+ /*
+ * Provide the same semantics as older kernels that don't use fs
+ * context, specifying the "compress" option clears "force-compress"
+ * without the need to pass "compress-force=[no|none]" before
+ * specifying "compress".
+ */
+ if (opt != Opt_compress_force && opt != Opt_compress_force_type)
+ btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
+
+ if (opt == Opt_compress || opt == Opt_compress_force) {
+ ctx->compress_type = BTRFS_COMPRESS_ZLIB;
+ ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (btrfs_match_compress_type(string, "zlib", true)) {
+ ctx->compress_type = BTRFS_COMPRESS_ZLIB;
+ ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB,
+ string + 4);
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (btrfs_match_compress_type(string, "lzo", false)) {
+ ctx->compress_type = BTRFS_COMPRESS_LZO;
+ ctx->compress_level = 0;
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (btrfs_match_compress_type(string, "zstd", true)) {
+ ctx->compress_type = BTRFS_COMPRESS_ZSTD;
+ ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD,
+ string + 4);
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (btrfs_match_compress_type(string, "no", false) ||
+ btrfs_match_compress_type(string, "none", false)) {
+ ctx->compress_level = 0;
+ ctx->compress_type = 0;
+ btrfs_clear_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
+ } else {
+ btrfs_err(NULL, "unrecognized compression value %s", string);
+ return -EINVAL;
+ }
+ return 0;
}
static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
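
btrfs_match_compress_type() tightens the old prefix-only strncmp() checks, which accepted any string beginning with the type name (e.g. "zlibberish" parsed as zlib). A match now requires either end-of-string right after the name or, when the algorithm takes a level, a ':' separator. Expected results, worked out from the helper above:

	/*
	 * btrfs_match_compress_type("zstd",    "zstd", true)  -> true
	 * btrfs_match_compress_type("zstd:3",  "zstd", true)  -> true  (level after ':')
	 * btrfs_match_compress_type("zstdfoo", "zstd", true)  -> false (trailing junk)
	 * btrfs_match_compress_type("lzo:1",   "lzo",  false) -> false (lzo takes no level)
	 * btrfs_match_compress_type("none",    "no",   false) -> false (exact match only)
	 */
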
@@ -303,10 +358,9 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_device: {
struct btrfs_device *device;
- blk_mode_t mode = btrfs_open_mode(fc);
mutex_lock(&uuid_mutex);
- device = btrfs_scan_one_device(param->string, mode, false);
+ device = btrfs_scan_one_device(param->string, false);
mutex_unlock(&uuid_mutex);
if (IS_ERR(device))
return PTR_ERR(device);
@@ -336,53 +390,8 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
fallthrough;
case Opt_compress:
case Opt_compress_type:
- /*
- * Provide the same semantics as older kernels that don't use fs
- * context, specifying the "compress" option clears
- * "force-compress" without the need to pass
- * "compress-force=[no|none]" before specifying "compress".
- */
- if (opt != Opt_compress_force && opt != Opt_compress_force_type)
- btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
-
- if (opt == Opt_compress || opt == Opt_compress_force) {
- ctx->compress_type = BTRFS_COMPRESS_ZLIB;
- ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
- btrfs_set_opt(ctx->mount_opt, COMPRESS);
- btrfs_clear_opt(ctx->mount_opt, NODATACOW);
- btrfs_clear_opt(ctx->mount_opt, NODATASUM);
- } else if (strncmp(param->string, "zlib", 4) == 0) {
- ctx->compress_type = BTRFS_COMPRESS_ZLIB;
- ctx->compress_level =
- btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB,
- param->string + 4);
- btrfs_set_opt(ctx->mount_opt, COMPRESS);
- btrfs_clear_opt(ctx->mount_opt, NODATACOW);
- btrfs_clear_opt(ctx->mount_opt, NODATASUM);
- } else if (strncmp(param->string, "lzo", 3) == 0) {
- ctx->compress_type = BTRFS_COMPRESS_LZO;
- ctx->compress_level = 0;
- btrfs_set_opt(ctx->mount_opt, COMPRESS);
- btrfs_clear_opt(ctx->mount_opt, NODATACOW);
- btrfs_clear_opt(ctx->mount_opt, NODATASUM);
- } else if (strncmp(param->string, "zstd", 4) == 0) {
- ctx->compress_type = BTRFS_COMPRESS_ZSTD;
- ctx->compress_level =
- btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD,
- param->string + 4);
- btrfs_set_opt(ctx->mount_opt, COMPRESS);
- btrfs_clear_opt(ctx->mount_opt, NODATACOW);
- btrfs_clear_opt(ctx->mount_opt, NODATASUM);
- } else if (strncmp(param->string, "no", 2) == 0) {
- ctx->compress_level = 0;
- ctx->compress_type = 0;
- btrfs_clear_opt(ctx->mount_opt, COMPRESS);
- btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
- } else {
- btrfs_err(NULL, "unrecognized compression value %s",
- param->string);
+ if (btrfs_parse_compress(ctx, param, opt))
return -EINVAL;
- }
break;
case Opt_ssd:
if (result.negated) {
@@ -945,12 +954,12 @@ static int btrfs_fill_super(struct super_block *sb,
{
struct btrfs_inode *inode;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- int err;
+ int ret;
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_magic = BTRFS_SUPER_MAGIC;
sb->s_op = &btrfs_super_ops;
- sb->s_d_op = &btrfs_dentry_operations;
+ set_default_d_op(sb, &btrfs_dentry_operations);
sb->s_export_op = &btrfs_export_ops;
#ifdef CONFIG_FS_VERITY
sb->s_vop = &btrfs_verityops;
@@ -959,28 +968,28 @@ static int btrfs_fill_super(struct super_block *sb,
sb->s_time_gran = 1;
sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM;
- err = super_setup_bdi(sb);
- if (err) {
+ ret = super_setup_bdi(sb);
+ if (ret) {
btrfs_err(fs_info, "super_setup_bdi failed");
- return err;
+ return ret;
}
- err = open_ctree(sb, fs_devices);
- if (err) {
- btrfs_err(fs_info, "open_ctree failed: %d", err);
- return err;
+ ret = open_ctree(sb, fs_devices);
+ if (ret) {
+ btrfs_err(fs_info, "open_ctree failed: %d", ret);
+ return ret;
}
inode = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- btrfs_handle_fs_error(fs_info, err, NULL);
+ ret = PTR_ERR(inode);
+ btrfs_handle_fs_error(fs_info, ret, NULL);
goto fail_close;
}
sb->s_root = d_make_root(&inode->vfs_inode);
if (!sb->s_root) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto fail_close;
}
@@ -989,7 +998,7 @@ static int btrfs_fill_super(struct super_block *sb,
fail_close:
close_ctree(fs_info);
- return err;
+ return ret;
}
int btrfs_sync_fs(struct super_block *sb, int wait)
@@ -1826,10 +1835,9 @@ static int btrfs_get_tree_super(struct fs_context *fc)
struct btrfs_fs_info *fs_info = fc->s_fs_info;
struct btrfs_fs_context *ctx = fc->fs_private;
struct btrfs_fs_devices *fs_devices = NULL;
- struct block_device *bdev;
struct btrfs_device *device;
struct super_block *sb;
- blk_mode_t mode = btrfs_open_mode(fc);
+ blk_mode_t mode = sb_open_mode(fc->sb_flags);
int ret;
btrfs_ctx_to_info(fs_info, ctx);
@@ -1839,47 +1847,60 @@ static int btrfs_get_tree_super(struct fs_context *fc)
* With 'true' passed to btrfs_scan_one_device() (mount time) we expect
* either a valid device or an error.
*/
- device = btrfs_scan_one_device(fc->source, mode, true);
+ device = btrfs_scan_one_device(fc->source, true);
ASSERT(device != NULL);
if (IS_ERR(device)) {
mutex_unlock(&uuid_mutex);
return PTR_ERR(device);
}
-
fs_devices = device->fs_devices;
+ /*
+ * We cannot hold uuid_mutex while calling sget_fc(); it would lead
+ * to a lock order reversal with s_umount.
+ *
+ * So increase the holding count of fs_devices here; this ensures the
+ * fs_devices itself won't be freed.
+ */
+ btrfs_fs_devices_inc_holding(fs_devices);
fs_info->fs_devices = fs_devices;
-
- ret = btrfs_open_devices(fs_devices, mode, &btrfs_fs_type);
mutex_unlock(&uuid_mutex);
- if (ret)
- return ret;
-
- if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
- ret = -EACCES;
- goto error;
- }
- bdev = fs_devices->latest_dev->bdev;
- /*
- * From now on the error handling is not straightforward.
- *
- * If successful, this will transfer the fs_info into the super block,
- * and fc->s_fs_info will be NULL. However if there's an existing
- * super, we'll still have fc->s_fs_info populated. If we error
- * completely out it'll be cleaned up when we drop the fs_context,
- * otherwise it's tied to the lifetime of the super_block.
- */
sb = sget_fc(fc, btrfs_fc_test_super, set_anon_super_fc);
if (IS_ERR(sb)) {
- ret = PTR_ERR(sb);
- goto error;
+ mutex_lock(&uuid_mutex);
+ btrfs_fs_devices_dec_holding(fs_devices);
+ /*
+ * Since the fs_devices is not opened, it can be freed at any
+ * time after unlocking uuid_mutex. We need to avoid a double
+ * free through put_fs_context()->btrfs_free_fs_info().
+ * So here we reset fs_info->fs_devices to NULL and let the
+ * regular fs_devices reclaim path handle it.
+ *
+ * This applies to all later branches where no fs_devices was
+ * opened.
+ */
+ fs_info->fs_devices = NULL;
+ mutex_unlock(&uuid_mutex);
+ return PTR_ERR(sb);
}
set_device_specific_options(fs_info);
if (sb->s_root) {
- btrfs_close_devices(fs_devices);
+ /*
+ * Not the first mount of the fs, thus we got an existing super block.
+ * Reuse the returned super block, fs_info and fs_devices.
+ *
+ * fc->s_fs_info is not touched and will later be freed by
+ * put_fs_context() through btrfs_free_fs_context().
+ */
+ ASSERT(fc->s_fs_info == fs_info);
+
+ mutex_lock(&uuid_mutex);
+ btrfs_fs_devices_dec_holding(fs_devices);
+ fs_info->fs_devices = NULL;
+ mutex_unlock(&uuid_mutex);
/*
* At this stage we may have RO flag mismatch between
* fc->sb_flags and sb->s_flags. Caller should detect such
@@ -1887,9 +1908,32 @@ static int btrfs_get_tree_super(struct fs_context *fc)
* needed.
*/
} else {
+ struct block_device *bdev;
+
+ /*
+ * The first mount of the fs, thus a new superblock: fc->s_fs_info
+ * must be NULL, and the ownership of our fs_info and fs_devices is
+ * transferred to the super block.
+ */
+ ASSERT(fc->s_fs_info == NULL);
+
+ mutex_lock(&uuid_mutex);
+ btrfs_fs_devices_dec_holding(fs_devices);
+ ret = btrfs_open_devices(fs_devices, mode, sb);
+ if (ret < 0)
+ fs_info->fs_devices = NULL;
+ mutex_unlock(&uuid_mutex);
+ if (ret < 0) {
+ deactivate_locked_super(sb);
+ return ret;
+ }
+ if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
+ deactivate_locked_super(sb);
+ return -EACCES;
+ }
+ bdev = fs_devices->latest_dev->bdev;
snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id);
- btrfs_sb(sb)->bdev_holder = &btrfs_fs_type;
ret = btrfs_fill_super(sb, fs_devices);
if (ret) {
deactivate_locked_super(sb);
@@ -1901,10 +1945,6 @@ static int btrfs_get_tree_super(struct fs_context *fc)
fc->root = dget(sb->s_root);
return 0;
-
-error:
- btrfs_close_devices(fs_devices);
- return ret;
}
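The hold/drop dance around sget_fc() above is a plain refcount guard: take a hold under the mutex, drop the mutex for the blocking call, then re-take the mutex to drop the hold. A generic userspace sketch of that shape follows; the demo_* names are hypothetical and the counter merely stands in for the fs_devices holding count.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t demo_uuid_mutex = PTHREAD_MUTEX_INITIALIZER;
static int demo_holders;	/* stands in for the fs_devices holding count */

static void demo_hold(void) { demo_holders++; }
static void demo_drop(void) { demo_holders--; }

int main(void)
{
	pthread_mutex_lock(&demo_uuid_mutex);
	demo_hold();			/* object can't be freed while held */
	pthread_mutex_unlock(&demo_uuid_mutex);

	/* ... blocking call (sget_fc() in the kernel) runs unlocked ... */

	pthread_mutex_lock(&demo_uuid_mutex);
	demo_drop();
	pthread_mutex_unlock(&demo_uuid_mutex);

	printf("holders=%d\n", demo_holders);	/* 0 */
	return 0;
}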
/*
@@ -1980,17 +2020,13 @@ error:
 * btrfs or not, setting the whole super block RO. To make per-subvolume mounting
 * with different options work, we need to keep backward compatibility.
*/
-static int btrfs_reconfigure_for_mount(struct fs_context *fc, struct vfsmount *mnt)
+static int btrfs_reconfigure_for_mount(struct fs_context *fc)
{
int ret = 0;
- if (fc->sb_flags & SB_RDONLY)
- return ret;
-
- down_write(&mnt->mnt_sb->s_umount);
- if (!(fc->sb_flags & SB_RDONLY) && (mnt->mnt_sb->s_flags & SB_RDONLY))
+ if (!(fc->sb_flags & SB_RDONLY) && (fc->root->d_sb->s_flags & SB_RDONLY))
ret = btrfs_reconfigure(fc);
- up_write(&mnt->mnt_sb->s_umount);
+
return ret;
}
@@ -2035,25 +2071,18 @@ static int btrfs_get_tree_subvol(struct fs_context *fc)
*/
dup_fc->s_fs_info = fs_info;
- /*
- * We'll do the security settings in our btrfs_get_tree_super() mount
- * loop, they were duplicated into dup_fc, we can drop the originals
- * here.
- */
- security_free_mnt_opts(&fc->security);
- fc->security = NULL;
+ ret = btrfs_get_tree_super(dup_fc);
+ if (ret)
+ goto error;
- mnt = fc_mount(dup_fc);
- if (IS_ERR(mnt)) {
- put_fs_context(dup_fc);
- return PTR_ERR(mnt);
- }
- ret = btrfs_reconfigure_for_mount(dup_fc, mnt);
+ ret = btrfs_reconfigure_for_mount(dup_fc);
+ up_write(&dup_fc->root->d_sb->s_umount);
+ if (ret)
+ goto error;
+ mnt = vfs_create_mount(dup_fc);
put_fs_context(dup_fc);
- if (ret) {
- mntput(mnt);
- return ret;
- }
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
/*
 * This frees ->subvol_name, because if it isn't set we have to
@@ -2067,25 +2096,15 @@ static int btrfs_get_tree_subvol(struct fs_context *fc)
fc->root = dentry;
return 0;
+error:
+ put_fs_context(dup_fc);
+ return ret;
}
static int btrfs_get_tree(struct fs_context *fc)
{
- /*
- * Since we use mount_subtree to mount the default/specified subvol, we
- * have to do mounts in two steps.
- *
- * First pass through we call btrfs_get_tree_subvol(), this is just a
- * wrapper around fc_mount() to call back into here again, and this time
- * we'll call btrfs_get_tree_super(). This will do the open_ctree() and
- * everything to open the devices and file system. Then we return back
- * with a fully constructed vfsmount in btrfs_get_tree_subvol(), and
- * from there we can do our mount_subvol() call, which will lookup
- * whichever subvol we're mounting and setup this fc with the
- * appropriate dentry for the subvol.
- */
- if (fc->s_fs_info)
- return btrfs_get_tree_super(fc);
+ ASSERT(fc->s_fs_info == NULL);
+
return btrfs_get_tree_subvol(fc);
}
@@ -2217,7 +2236,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 * Scanning outside of mount can return NULL, which would turn
 * into a 0 error code.
*/
- device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
+ device = btrfs_scan_one_device(vol->name, false);
ret = PTR_ERR_OR_ZERO(device);
mutex_unlock(&uuid_mutex);
break;
@@ -2235,7 +2254,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 * Scanning outside of mount can return NULL, which would turn
 * into a 0 error code.
*/
- device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
+ device = btrfs_scan_one_device(vol->name, false);
if (IS_ERR_OR_NULL(device)) {
mutex_unlock(&uuid_mutex);
if (IS_ERR(device))
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 5d93d9dd2c12..9d398f7a36ad 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -160,8 +160,7 @@ static int can_modify_feature(struct btrfs_feature_attr *fa)
clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR;
break;
default:
- pr_warn("btrfs: sysfs: unknown feature set %d\n",
- fa->feature_set);
+ btrfs_warn(NULL, "sysfs: unknown feature set %d", fa->feature_set);
return 0;
}
@@ -1138,13 +1137,21 @@ static ssize_t btrfs_commit_stats_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ u64 now = ktime_get_ns();
+ u64 start_time = fs_info->commit_stats.critical_section_start_time;
+ u64 pending = 0;
+
+ if (start_time)
+ pending = now - start_time;
return sysfs_emit(buf,
"commits %llu\n"
+ "cur_commit_ms %llu\n"
"last_commit_ms %llu\n"
"max_commit_ms %llu\n"
"total_commit_ms %llu\n",
fs_info->commit_stats.commit_count,
+ div_u64(pending, NSEC_PER_MSEC),
div_u64(fs_info->commit_stats.last_commit_dur, NSEC_PER_MSEC),
div_u64(fs_info->commit_stats.max_commit_dur, NSEC_PER_MSEC),
div_u64(fs_info->commit_stats.total_commit_dur, NSEC_PER_MSEC));
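With the new cur_commit_ms field derived from critical_section_start_time, the file can be polled while a commit is in flight. A hedged userspace sketch of reading it follows; the /sys path shape assumes the usual per-fsid sysfs layout, and the UUID below is a placeholder to substitute with the mounted filesystem's fsid.

#include <stdio.h>

int main(void)
{
	/* Placeholder fsid; substitute the mounted filesystem's UUID. */
	const char *path =
		"/sys/fs/btrfs/00000000-0000-0000-0000-000000000000/commit_stats";
	char line[128];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Expect: commits, cur_commit_ms, last_commit_ms, max_commit_ms, total_commit_ms. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}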
@@ -1202,7 +1209,7 @@ static ssize_t quota_override_store(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
unsigned long knob;
- int err;
+ int ret;
if (!fs_info)
return -EPERM;
@@ -1210,9 +1217,9 @@ static ssize_t quota_override_store(struct kobject *kobj,
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- err = kstrtoul(buf, 10, &knob);
- if (err)
- return err;
+ ret = kstrtoul(buf, 10, &knob);
+ if (ret)
+ return ret;
if (knob > 1)
return -EINVAL;
@@ -2239,7 +2246,7 @@ void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action)
ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
if (ret)
- pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
+ btrfs_warn(NULL, "sending event %d to kobject: '%s' (%p): failed",
action, kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
&disk_to_dev(bdev->bd_disk)->kobj);
}
@@ -2282,15 +2289,15 @@ static struct kset *btrfs_kset;
*/
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs)
{
- int error;
+ int ret;
init_completion(&fs_devs->kobj_unregister);
fs_devs->fsid_kobj.kset = btrfs_kset;
- error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL,
- "%pU", fs_devs->fsid);
- if (error) {
+ ret = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL,
+ "%pU", fs_devs->fsid);
+ if (ret) {
kobject_put(&fs_devs->fsid_kobj);
- return error;
+ return ret;
}
fs_devs->devices_kobj = kobject_create_and_add("devices",
@@ -2316,71 +2323,70 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs)
int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
{
- int error;
+ int ret;
struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
struct kobject *fsid_kobj = &fs_devs->fsid_kobj;
- error = btrfs_sysfs_add_fs_devices(fs_devs);
- if (error)
- return error;
+ ret = btrfs_sysfs_add_fs_devices(fs_devs);
+ if (ret)
+ return ret;
- error = sysfs_create_files(fsid_kobj, btrfs_attrs);
- if (error) {
+ ret = sysfs_create_files(fsid_kobj, btrfs_attrs);
+ if (ret) {
btrfs_sysfs_remove_fs_devices(fs_devs);
- return error;
+ return ret;
}
- error = sysfs_create_group(fsid_kobj,
- &btrfs_feature_attr_group);
- if (error)
+ ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group);
+ if (ret)
goto failure;
#ifdef CONFIG_BTRFS_DEBUG
fs_info->debug_kobj = kobject_create_and_add("debug", fsid_kobj);
if (!fs_info->debug_kobj) {
- error = -ENOMEM;
+ ret = -ENOMEM;
goto failure;
}
- error = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
- if (error)
+ ret = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
+ if (ret)
goto failure;
#endif
/* Discard directory */
fs_info->discard_kobj = kobject_create_and_add("discard", fsid_kobj);
if (!fs_info->discard_kobj) {
- error = -ENOMEM;
+ ret = -ENOMEM;
goto failure;
}
- error = sysfs_create_files(fs_info->discard_kobj, discard_attrs);
- if (error)
+ ret = sysfs_create_files(fs_info->discard_kobj, discard_attrs);
+ if (ret)
goto failure;
- error = addrm_unknown_feature_attrs(fs_info, true);
- if (error)
+ ret = addrm_unknown_feature_attrs(fs_info, true);
+ if (ret)
goto failure;
- error = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi");
- if (error)
+ ret = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi");
+ if (ret)
goto failure;
fs_info->space_info_kobj = kobject_create_and_add("allocation",
fsid_kobj);
if (!fs_info->space_info_kobj) {
- error = -ENOMEM;
+ ret = -ENOMEM;
goto failure;
}
- error = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs);
- if (error)
+ ret = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs);
+ if (ret)
goto failure;
return 0;
failure:
btrfs_sysfs_remove_mounted(fs_info);
- return error;
+ return ret;
}
static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj,
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 00da54f0164c..b19328d077d3 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -23,8 +23,8 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
{
int ret;
struct folio_batch fbatch;
- unsigned long index = start >> PAGE_SHIFT;
- unsigned long end_index = end >> PAGE_SHIFT;
+ pgoff_t index = start >> PAGE_SHIFT;
+ pgoff_t end_index = end >> PAGE_SHIFT;
int i;
int count = 0;
int loops = 0;
@@ -75,7 +75,8 @@ static void extent_flag_to_str(const struct extent_state *state, char *dest)
dest[0] = 0;
PRINT_ONE_FLAG(state, dest, cur, DIRTY);
PRINT_ONE_FLAG(state, dest, cur, LOCKED);
- PRINT_ONE_FLAG(state, dest, cur, NEW);
+ PRINT_ONE_FLAG(state, dest, cur, DIRTY_LOG1);
+ PRINT_ONE_FLAG(state, dest, cur, DIRTY_LOG2);
PRINT_ONE_FLAG(state, dest, cur, DELALLOC);
PRINT_ONE_FLAG(state, dest, cur, DEFRAG);
PRINT_ONE_FLAG(state, dest, cur, BOUNDARY);
@@ -113,7 +114,6 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
struct extent_io_tree *tmp;
struct page *page;
struct page *locked_page = NULL;
- unsigned long index = 0;
/* In this test we need at least 2 file extents at its maximum size */
u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
u64 total_dirty = 2 * max_bytes;
@@ -156,7 +156,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
* everything to make sure our pages don't get evicted and screw up our
* test.
*/
- for (index = 0; index < (total_dirty >> PAGE_SHIFT); index++) {
+ for (pgoff_t index = 0; index < (total_dirty >> PAGE_SHIFT); index++) {
page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL);
if (!page) {
test_err("failed to allocate test page");
@@ -326,7 +326,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
out_bits:
if (ret)
dump_extent_io_tree(tmp);
- btrfs_clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1);
+ btrfs_clear_extent_bit(tmp, 0, total_dirty - 1, (unsigned)-1, NULL);
out:
if (locked_page)
put_page(locked_page);
@@ -343,11 +343,11 @@ static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb)
unsigned long i;
for (i = 0; i < eb->len * BITS_PER_BYTE; i++) {
- int bit, bit1;
+ bool bit_set, bit1_set;
- bit = !!test_bit(i, bitmap);
- bit1 = !!extent_buffer_test_bit(eb, 0, i);
- if (bit1 != bit) {
+ bit_set = test_bit(i, bitmap);
+ bit1_set = extent_buffer_test_bit(eb, 0, i);
+ if (bit1_set != bit_set) {
u8 has;
u8 expect;
@@ -360,9 +360,9 @@ static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb)
return -EINVAL;
}
- bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE,
- i % BITS_PER_BYTE);
- if (bit1 != bit) {
+ bit1_set = extent_buffer_test_bit(eb, i / BITS_PER_BYTE,
+ i % BITS_PER_BYTE);
+ if (bit1_set != bit_set) {
u8 has;
u8 expect;
@@ -662,7 +662,7 @@ static int test_find_first_clear_extent_bit(void)
out:
if (ret)
dump_extent_io_tree(&tree);
- btrfs_clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED);
+ btrfs_clear_extent_bit(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL);
return ret;
}
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index b61972046feb..c8822edd32e2 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -32,7 +32,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans,
unsigned int i;
int ret;
- info = search_free_space_info(trans, cache, path, 0);
+ info = btrfs_search_free_space_info(trans, cache, path, 0);
if (IS_ERR(info)) {
test_err("could not find free space info");
ret = PTR_ERR(info);
@@ -57,7 +57,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans,
goto invalid;
offset = key.objectid;
while (offset < key.objectid + key.offset) {
- bit = free_space_test_bit(cache, path, offset);
+ bit = btrfs_free_space_test_bit(cache, path, offset);
if (prev_bit == 0 && bit == 1) {
extent_start = offset;
} else if (prev_bit == 1 && bit == 0) {
@@ -115,7 +115,7 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans,
u32 flags;
int ret;
- info = search_free_space_info(trans, cache, path, 0);
+ info = btrfs_search_free_space_info(trans, cache, path, 0);
if (IS_ERR(info)) {
test_err("could not find free space info");
btrfs_release_path(path);
@@ -131,13 +131,13 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans,
/* Flip it to the other format and check that for good measure. */
if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
- ret = convert_free_space_to_extents(trans, cache, path);
+ ret = btrfs_convert_free_space_to_extents(trans, cache, path);
if (ret) {
test_err("could not convert to extents");
return ret;
}
} else {
- ret = convert_free_space_to_bitmaps(trans, cache, path);
+ ret = btrfs_convert_free_space_to_bitmaps(trans, cache, path);
if (ret) {
test_err("could not convert to bitmaps");
return ret;
@@ -170,9 +170,8 @@ static int test_remove_all(struct btrfs_trans_handle *trans,
const struct free_space_extent extents[] = {};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start,
- cache->length);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
@@ -193,8 +192,8 @@ static int test_remove_beginning(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start, alignment);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start, alignment);
if (ret) {
test_err("could not remove free space");
return ret;
@@ -216,7 +215,7 @@ static int test_remove_end(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
cache->start + cache->length - alignment,
alignment);
if (ret) {
@@ -240,9 +239,9 @@ static int test_remove_middle(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start + alignment,
- alignment);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start + alignment,
+ alignment);
if (ret) {
test_err("could not remove free space");
return ret;
@@ -263,23 +262,22 @@ static int test_merge_left(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start, cache->length);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path, cache->start,
- alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path, cache->start,
+ alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + alignment,
- alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
@@ -300,24 +298,23 @@ static int test_merge_right(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start, cache->length);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + 2 * alignment,
- alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + 2 * alignment,
+ alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + alignment,
- alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
@@ -338,29 +335,29 @@ static int test_merge_both(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start, cache->length);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path, cache->start,
- alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path, cache->start,
+ alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + 2 * alignment, alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + 2 * alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + alignment, alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
@@ -383,29 +380,29 @@ static int test_merge_none(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start, cache->length);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path, cache->start,
- alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path, cache->start,
+ alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + 4 * alignment, alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + 4 * alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + 2 * alignment, alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + 2 * alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
@@ -483,14 +480,14 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
goto out;
}
- ret = add_block_group_free_space(&trans, cache);
+ ret = btrfs_add_block_group_free_space(&trans, cache);
if (ret) {
test_err("could not add block group free space");
goto out;
}
if (bitmaps) {
- ret = convert_free_space_to_bitmaps(&trans, cache, path);
+ ret = btrfs_convert_free_space_to_bitmaps(&trans, cache, path);
if (ret) {
test_err("could not convert block group to bitmaps");
goto out;
@@ -501,7 +498,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
if (ret)
goto out;
- ret = remove_block_group_free_space(&trans, cache);
+ ret = btrfs_remove_block_group_free_space(&trans, cache);
if (ret) {
test_err("could not remove block group free space");
goto out;
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index a29d2c02c2c8..a4c2b7748b95 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -950,10 +950,10 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
/* [BTRFS_MAX_EXTENT_SIZE/2][sectorsize HOLE][the rest] */
- ret = btrfs_clear_extent_bits(&BTRFS_I(inode)->io_tree,
- BTRFS_MAX_EXTENT_SIZE >> 1,
- (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
- EXTENT_DELALLOC | EXTENT_DELALLOC_NEW);
+ ret = btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ BTRFS_MAX_EXTENT_SIZE >> 1,
+ (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1017,10 +1017,10 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
/* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */
- ret = btrfs_clear_extent_bits(&BTRFS_I(inode)->io_tree,
- BTRFS_MAX_EXTENT_SIZE + sectorsize,
- BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
- EXTENT_DELALLOC | EXTENT_DELALLOC_NEW);
+ ret = btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ BTRFS_MAX_EXTENT_SIZE + sectorsize,
+ BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1051,8 +1051,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
/* Empty */
- ret = btrfs_clear_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
- EXTENT_DELALLOC | EXTENT_DELALLOC_NEW);
+ ret = btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1066,8 +1066,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
ret = 0;
out:
if (ret)
- btrfs_clear_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
- EXTENT_DELALLOC | EXTENT_DELALLOC_NEW);
+ btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL);
iput(inode);
btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b96195d6480f..c5c0d9cf1a80 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1211,15 +1211,15 @@ static int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages)
{
bool errors = false;
- int err;
+ int ret;
- err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
+ ret = __btrfs_wait_marked_extents(fs_info, dirty_pages);
if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags))
errors = true;
- if (errors && !err)
- err = -EIO;
- return err;
+ if (errors && !ret)
+ ret = -EIO;
+ return ret;
}
int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
@@ -1227,22 +1227,22 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
struct btrfs_fs_info *fs_info = log_root->fs_info;
struct extent_io_tree *dirty_pages = &log_root->dirty_log_pages;
bool errors = false;
- int err;
+ int ret;
ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID);
- err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
- if ((mark & EXTENT_DIRTY) &&
+ ret = __btrfs_wait_marked_extents(fs_info, dirty_pages);
+ if ((mark & EXTENT_DIRTY_LOG1) &&
test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags))
errors = true;
- if ((mark & EXTENT_NEW) &&
+ if ((mark & EXTENT_DIRTY_LOG2) &&
test_and_clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags))
errors = true;
- if (errors && !err)
- err = -EIO;
- return err;
+ if (errors && !ret)
+ ret = -EIO;
+ return ret;
}
/*
@@ -1735,8 +1735,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_create_qgroup(trans, objectid);
if (ret && ret != -EEXIST) {
- btrfs_abort_transaction(trans, ret);
- goto fail;
+ if (ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info)) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
}
/*
@@ -2163,13 +2165,19 @@ static void add_pending_snapshot(struct btrfs_trans_handle *trans)
list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots);
}
-static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval)
+static void update_commit_stats(struct btrfs_fs_info *fs_info)
{
+ ktime_t now = ktime_get_ns();
+ ktime_t interval = now - fs_info->commit_stats.critical_section_start_time;
+
+ ASSERT(fs_info->commit_stats.critical_section_start_time);
+
fs_info->commit_stats.commit_count++;
fs_info->commit_stats.last_commit_dur = interval;
fs_info->commit_stats.max_commit_dur =
max_t(u64, fs_info->commit_stats.max_commit_dur, interval);
fs_info->commit_stats.total_commit_dur += interval;
+ fs_info->commit_stats.critical_section_start_time = 0;
}
int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
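The accumulation in update_commit_stats() is simple counter arithmetic: bump the count, record the last interval, track the running maximum, and add to the total. A standalone sketch of the same arithmetic, with the demo_* names hypothetical and the ktime bookkeeping reduced to plain milliseconds:

#include <stdint.h>
#include <stdio.h>

struct demo_commit_stats {
	uint64_t commit_count;
	uint64_t last_ms;
	uint64_t max_ms;
	uint64_t total_ms;
};

/* Same accumulation shape as update_commit_stats(), over milliseconds. */
static void demo_update(struct demo_commit_stats *s, uint64_t interval_ms)
{
	s->commit_count++;
	s->last_ms = interval_ms;
	if (interval_ms > s->max_ms)
		s->max_ms = interval_ms;
	s->total_ms += interval_ms;
}

int main(void)
{
	struct demo_commit_stats s = { 0 };

	demo_update(&s, 12);
	demo_update(&s, 5);
	printf("count=%llu last=%llu max=%llu total=%llu\n",
	       (unsigned long long)s.commit_count,
	       (unsigned long long)s.last_ms,
	       (unsigned long long)s.max_ms,
	       (unsigned long long)s.total_ms);	/* count=2 last=5 max=12 total=17 */
	return 0;
}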
@@ -2178,8 +2186,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_transaction *prev_trans = NULL;
int ret;
- ktime_t start_time;
- ktime_t interval;
ASSERT(refcount_read(&trans->use_count) == 1);
btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
@@ -2312,8 +2318,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* Get the time spent on the work done by the commit thread and not
* the time spent waiting on a previous commit
*/
- start_time = ktime_get_ns();
-
+ fs_info->commit_stats.critical_section_start_time = ktime_get_ns();
extwriter_counter_dec(cur_trans, trans->type);
ret = btrfs_start_delalloc_flush(fs_info);
@@ -2545,6 +2550,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto scrub_continue;
+ update_commit_stats(fs_info);
/*
* We needn't acquire the lock here because there is no other task
* which can change it.
@@ -2581,8 +2587,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
trace_btrfs_transaction_commit(fs_info);
- interval = ktime_get_ns() - start_time;
-
btrfs_scrub_continue(fs_info);
if (current->journal_info == trans)
@@ -2590,8 +2594,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
kmem_cache_free(btrfs_trans_handle_cachep, trans);
- update_commit_stats(fs_info, interval);
-
return ret;
unlock_reloc:
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 8f4703b488b7..0f556f4de3f9 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -191,7 +191,7 @@ static bool check_prev_ino(struct extent_buffer *leaf,
* Only subvolume trees along with their reloc trees need this check.
* Things like log tree doesn't follow this ino requirement.
*/
- if (!is_fstree(btrfs_header_owner(leaf)))
+ if (!btrfs_is_fstree(btrfs_header_owner(leaf)))
return true;
if (key->objectid == prev_key->objectid)
@@ -475,7 +475,7 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key,
* to be COWed to be relocated.
*/
if (unlikely(is_root_item && key->objectid == BTRFS_TREE_RELOC_OBJECTID &&
- !is_fstree(key->offset))) {
+ !btrfs_is_fstree(key->offset))) {
generic_err(leaf, slot,
"invalid reloc tree for root %lld, root id is not a subvolume tree",
key->offset);
@@ -493,7 +493,7 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key,
}
/* DIR_ITEM/INDEX/INODE_REF is not allowed to point to non-fs trees */
- if (unlikely(!is_fstree(key->objectid) && !is_root_item)) {
+ if (unlikely(!btrfs_is_fstree(key->objectid) && !is_root_item)) {
dir_item_err(leaf, slot,
"invalid location key objectid, have %llu expect [%llu, %llu]",
key->objectid, BTRFS_FIRST_FREE_OBJECTID,
@@ -1311,7 +1311,7 @@ static bool is_valid_dref_root(u64 rootid)
* - tree root
* For v1 space cache
*/
- return is_fstree(rootid) || rootid == BTRFS_DATA_RELOC_TREE_OBJECTID ||
+ return btrfs_is_fstree(rootid) || rootid == BTRFS_DATA_RELOC_TREE_OBJECTID ||
rootid == BTRFS_ROOT_TREE_OBJECTID;
}
@@ -2167,7 +2167,7 @@ ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO);
int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner)
{
- const bool is_subvol = is_fstree(root_owner);
+ const bool is_subvol = btrfs_is_fstree(root_owner);
const u64 eb_owner = btrfs_header_owner(eb);
/*
@@ -2209,7 +2209,7 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner)
* For subvolume trees, owners can mismatch, but they should all belong
* to subvolume trees.
*/
- if (unlikely(is_subvol != is_fstree(eb_owner))) {
+ if (unlikely(is_subvol != btrfs_is_fstree(eb_owner))) {
btrfs_crit(eb->fs_info,
"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect [%llu, %llu]",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 97e933113b82..9f05d454b9df 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -112,7 +112,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_root *log,
struct btrfs_path *path,
- u64 dirid, int del_all);
+ u64 dirid, bool del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);
/*
@@ -143,6 +143,9 @@ static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *r
unsigned int nofs_flag;
struct btrfs_inode *inode;
+ /* Only meant to be called for subvolume roots and not for log roots. */
+ ASSERT(btrfs_is_fstree(btrfs_root_id(root)));
+
/*
* We're holding a transaction handle whether we are logging or
* replaying a log tree, so we must make sure NOFS semantics apply
@@ -604,21 +607,6 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
return 0;
}
-/*
- * simple helper to read an inode off the disk from a given root
- * This can only be called for subvolume roots and not for the log
- */
-static noinline struct btrfs_inode *read_one_inode(struct btrfs_root *root,
- u64 objectid)
-{
- struct btrfs_inode *inode;
-
- inode = btrfs_iget_logging(objectid, root);
- if (IS_ERR(inode))
- return NULL;
- return inode;
-}
-
/* replays a single extent in 'eb' at 'slot' with 'key' into the
* subvolume 'root'. path is released on entry and should be released
* on exit.
@@ -668,15 +656,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
extent_end = ALIGN(start + size,
fs_info->sectorsize);
} else {
- ret = 0;
- goto out;
+ btrfs_err(fs_info,
+ "unexpected extent type=%d root=%llu inode=%llu offset=%llu",
+ found_type, btrfs_root_id(root), key->objectid, key->offset);
+ return -EUCLEAN;
}
- inode = read_one_inode(root, key->objectid);
- if (!inode) {
- ret = -EIO;
- goto out;
- }
+ inode = btrfs_iget_logging(key->objectid, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
/*
* first check to see if we already have this extent in the
@@ -948,9 +936,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
- inode = read_one_inode(root, location.objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(location.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
goto out;
}
@@ -961,7 +950,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
out:
kfree(name.name);
- iput(&inode->vfs_inode);
+ if (inode)
+ iput(&inode->vfs_inode);
return ret;
}
@@ -1051,6 +1041,126 @@ out:
return ret;
}
+static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_root *log_root,
+ struct btrfs_key *search_key,
+ struct btrfs_inode *dir,
+ struct btrfs_inode *inode,
+ u64 parent_objectid)
+{
+ struct extent_buffer *leaf = path->nodes[0];
+ unsigned long ptr;
+ unsigned long ptr_end;
+
+ /*
+ * Check all the names in this back reference to see if they are in the
+ * log. If so, we allow them to stay; otherwise they must be unlinked as
+ * a conflict.
+ */
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
+ while (ptr < ptr_end) {
+ struct fscrypt_str victim_name;
+ struct btrfs_inode_ref *victim_ref;
+ int ret;
+
+ victim_ref = (struct btrfs_inode_ref *)ptr;
+ ret = read_alloc_one_name(leaf, (victim_ref + 1),
+ btrfs_inode_ref_name_len(leaf, victim_ref),
+ &victim_name);
+ if (ret)
+ return ret;
+
+ ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name);
+ if (ret) {
+ kfree(victim_name.name);
+ if (ret < 0)
+ return ret;
+ ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
+ continue;
+ }
+
+ inc_nlink(&inode->vfs_inode);
+ btrfs_release_path(path);
+
+ ret = unlink_inode_for_log_replay(trans, dir, inode, &victim_name);
+ kfree(victim_name.name);
+ if (ret)
+ return ret;
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
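+
+Both of the new helpers return -EAGAIN after unlinking one conflicting name so
+that the caller's again: loop re-searches with a fresh path. A generic sketch
+of that restart contract in plain userspace C, with hypothetical names:
+
+#include <errno.h>
+#include <stdio.h>
+
+/* Pretend "work": consume one conflicting item per call, then report done. */
+static int process_one(int *remaining)
+{
+	if (*remaining == 0)
+		return 0;		/* nothing conflicting left */
+	(*remaining)--;
+	return -EAGAIN;			/* caller must restart the search */
+}
+
+int main(void)
+{
+	int remaining = 3;
+	int ret;
+
+again:
+	ret = process_one(&remaining);
+	if (ret == -EAGAIN)
+		goto again;		/* mirrors the __add_inode_ref() loop */
+	printf("done, ret=%d\n", ret);
+	return 0;
+}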
+static int unlink_extrefs_not_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_root *root,
+ struct btrfs_root *log_root,
+ struct btrfs_key *search_key,
+ struct btrfs_inode *inode,
+ u64 inode_objectid,
+ u64 parent_objectid)
+{
+ struct extent_buffer *leaf = path->nodes[0];
+ const unsigned long base = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ const u32 item_size = btrfs_item_size(leaf, path->slots[0]);
+ u32 cur_offset = 0;
+
+ while (cur_offset < item_size) {
+ struct btrfs_inode_extref *extref;
+ struct btrfs_inode *victim_parent;
+ struct fscrypt_str victim_name;
+ int ret;
+
+ extref = (struct btrfs_inode_extref *)(base + cur_offset);
+ victim_name.len = btrfs_inode_extref_name_len(leaf, extref);
+
+ if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
+ goto next;
+
+ ret = read_alloc_one_name(leaf, &extref->name, victim_name.len,
+ &victim_name);
+ if (ret)
+ return ret;
+
+ search_key->objectid = inode_objectid;
+ search_key->type = BTRFS_INODE_EXTREF_KEY;
+ search_key->offset = btrfs_extref_hash(parent_objectid,
+ victim_name.name,
+ victim_name.len);
+ ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name);
+ if (ret) {
+ kfree(victim_name.name);
+ if (ret < 0)
+ return ret;
+next:
+ cur_offset += victim_name.len + sizeof(*extref);
+ continue;
+ }
+
+ victim_parent = btrfs_iget_logging(parent_objectid, root);
+ if (IS_ERR(victim_parent)) {
+ kfree(victim_name.name);
+ return PTR_ERR(victim_parent);
+ }
+
+ inc_nlink(&inode->vfs_inode);
+ btrfs_release_path(path);
+
+ ret = unlink_inode_for_log_replay(trans, victim_parent, inode,
+ &victim_name);
+ iput(&victim_parent->vfs_inode);
+ kfree(victim_name.name);
+ if (ret)
+ return ret;
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -1061,7 +1171,6 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
u64 ref_index, struct fscrypt_str *name)
{
int ret;
- struct extent_buffer *leaf;
struct btrfs_dir_item *di;
struct btrfs_key search_key;
struct btrfs_inode_extref *extref;
@@ -1072,121 +1181,37 @@ again:
search_key.type = BTRFS_INODE_REF_KEY;
search_key.offset = parent_objectid;
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
- if (ret == 0) {
- struct btrfs_inode_ref *victim_ref;
- unsigned long ptr;
- unsigned long ptr_end;
-
- leaf = path->nodes[0];
-
- /* are we trying to overwrite a back ref for the root directory
- * if so, just jump out, we're done
+ if (ret < 0) {
+ return ret;
+ } else if (ret == 0) {
+ /*
+ * Are we trying to overwrite a back ref for the root directory?
+ * If so, we're done.
*/
if (search_key.objectid == search_key.offset)
return 1;
- /* check all the names in this back reference to see
- * if they are in the log. if so, we allow them to stay
- * otherwise they must be unlinked as a conflict
- */
- ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
- ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
- while (ptr < ptr_end) {
- struct fscrypt_str victim_name;
-
- victim_ref = (struct btrfs_inode_ref *)ptr;
- ret = read_alloc_one_name(leaf, (victim_ref + 1),
- btrfs_inode_ref_name_len(leaf, victim_ref),
- &victim_name);
- if (ret)
- return ret;
-
- ret = backref_in_log(log_root, &search_key,
- parent_objectid, &victim_name);
- if (ret < 0) {
- kfree(victim_name.name);
- return ret;
- } else if (!ret) {
- inc_nlink(&inode->vfs_inode);
- btrfs_release_path(path);
-
- ret = unlink_inode_for_log_replay(trans, dir, inode,
- &victim_name);
- kfree(victim_name.name);
- if (ret)
- return ret;
- goto again;
- }
- kfree(victim_name.name);
-
- ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
- }
+ ret = unlink_refs_not_in_log(trans, path, log_root, &search_key,
+ dir, inode, parent_objectid);
+ if (ret == -EAGAIN)
+ goto again;
+ else if (ret)
+ return ret;
}
btrfs_release_path(path);
/* Same search but for extended refs */
- extref = btrfs_lookup_inode_extref(NULL, root, path, name,
- inode_objectid, parent_objectid, 0,
- 0);
+ extref = btrfs_lookup_inode_extref(root, path, name, inode_objectid, parent_objectid);
if (IS_ERR(extref)) {
return PTR_ERR(extref);
} else if (extref) {
- u32 item_size;
- u32 cur_offset = 0;
- unsigned long base;
- struct btrfs_inode *victim_parent;
-
- leaf = path->nodes[0];
-
- item_size = btrfs_item_size(leaf, path->slots[0]);
- base = btrfs_item_ptr_offset(leaf, path->slots[0]);
-
- while (cur_offset < item_size) {
- struct fscrypt_str victim_name;
-
- extref = (struct btrfs_inode_extref *)(base + cur_offset);
-
- if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
- goto next;
-
- ret = read_alloc_one_name(leaf, &extref->name,
- btrfs_inode_extref_name_len(leaf, extref),
- &victim_name);
- if (ret)
- return ret;
-
- search_key.objectid = inode_objectid;
- search_key.type = BTRFS_INODE_EXTREF_KEY;
- search_key.offset = btrfs_extref_hash(parent_objectid,
- victim_name.name,
- victim_name.len);
- ret = backref_in_log(log_root, &search_key,
- parent_objectid, &victim_name);
- if (ret < 0) {
- kfree(victim_name.name);
- return ret;
- } else if (!ret) {
- ret = -ENOENT;
- victim_parent = read_one_inode(root,
- parent_objectid);
- if (victim_parent) {
- inc_nlink(&inode->vfs_inode);
- btrfs_release_path(path);
-
- ret = unlink_inode_for_log_replay(trans,
- victim_parent,
- inode, &victim_name);
- }
- iput(&victim_parent->vfs_inode);
- kfree(victim_name.name);
- if (ret)
- return ret;
- goto again;
- }
- kfree(victim_name.name);
-next:
- cur_offset += victim_name.len + sizeof(*extref);
- }
+ ret = unlink_extrefs_not_in_log(trans, path, root, log_root,
+ &search_key, inode,
+ inode_objectid, parent_objectid);
+ if (ret == -EAGAIN)
+ goto again;
+ else if (ret)
+ return ret;
}
btrfs_release_path(path);
@@ -1314,9 +1339,9 @@ again:
struct btrfs_inode *dir;
btrfs_release_path(path);
- dir = read_one_inode(root, parent_id);
- if (!dir) {
- ret = -ENOENT;
+ dir = btrfs_iget_logging(parent_id, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
kfree(name.name);
goto out;
}
@@ -1360,7 +1385,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
unsigned long ref_end;
struct fscrypt_str name = { 0 };
int ret;
- int log_ref_ver = 0;
+ const bool is_extref_item = (key->type == BTRFS_INODE_EXTREF_KEY);
u64 parent_objectid;
u64 inode_objectid;
u64 ref_index = 0;
@@ -1369,11 +1394,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
ref_ptr = btrfs_item_ptr_offset(eb, slot);
ref_end = ref_ptr + btrfs_item_size(eb, slot);
- if (key->type == BTRFS_INODE_EXTREF_KEY) {
+ if (is_extref_item) {
struct btrfs_inode_extref *r;
ref_struct_size = sizeof(struct btrfs_inode_extref);
- log_ref_ver = 1;
r = (struct btrfs_inode_extref *)ref_ptr;
parent_objectid = btrfs_inode_extref_parent(eb, r);
} else {
@@ -1388,37 +1412,61 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* copy the back ref in. The link count fixup code will take
* care of the rest
*/
- dir = read_one_inode(root, parent_objectid);
- if (!dir) {
- ret = -ENOENT;
+ dir = btrfs_iget_logging(parent_objectid, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ if (ret == -ENOENT)
+ ret = 0;
+ dir = NULL;
goto out;
}
- inode = read_one_inode(root, inode_objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(inode_objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
goto out;
}
while (ref_ptr < ref_end) {
- if (log_ref_ver) {
+ if (is_extref_item) {
ret = extref_get_fields(eb, ref_ptr, &name,
&ref_index, &parent_objectid);
+ if (ret)
+ goto out;
/*
* parent object can change from one array
* item to another.
*/
- if (!dir)
- dir = read_one_inode(root, parent_objectid);
if (!dir) {
- ret = -ENOENT;
- goto out;
+ dir = btrfs_iget_logging(parent_objectid, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ dir = NULL;
+ /*
+ * A new parent dir may not have been
+ * logged and may not exist in the
+ * subvolume tree; see the comment above,
+ * before the loop, when getting the
+ * first parent dir.
+ */
+ if (ret == -ENOENT) {
+ /*
+ * The next extref may refer to
+ * another parent dir that
+ * exists, so continue.
+ */
+ ret = 0;
+ goto next;
+ }
+ goto out;
+ }
}
} else {
ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
+ if (ret)
+ goto out;
}
- if (ret)
- goto out;
ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
ref_index, &name);
@@ -1452,10 +1500,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
}
/* Else, ret == 1, we already have a perfect match, we're done. */
+next:
ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
kfree(name.name);
name.name = NULL;
- if (log_ref_ver) {
+ if (is_extref_item && dir) {
iput(&dir->vfs_inode);
dir = NULL;
}
@@ -1632,8 +1681,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (inode->vfs_inode.i_nlink == 0) {
if (S_ISDIR(inode->vfs_inode.i_mode)) {
- ret = replay_dir_deletes(trans, root, NULL, path,
- ino, 1);
+ ret = replay_dir_deletes(trans, root, NULL, path, ino, true);
if (ret)
goto out;
}
@@ -1681,9 +1729,9 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
break;
btrfs_release_path(path);
- inode = read_one_inode(root, key.offset);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(key.offset, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
break;
}
@@ -1719,9 +1767,9 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode;
struct inode *vfs_inode;
- inode = read_one_inode(root, objectid);
- if (!inode)
- return -EIO;
+ inode = btrfs_iget_logging(objectid, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
vfs_inode = &inode->vfs_inode;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
@@ -1760,14 +1808,14 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir;
int ret;
- inode = read_one_inode(root, location->objectid);
- if (!inode)
- return -ENOENT;
+ inode = btrfs_iget_logging(location->objectid, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
- dir = read_one_inode(root, dirid);
- if (!dir) {
+ dir = btrfs_iget_logging(dirid, root);
+ if (IS_ERR(dir)) {
iput(&inode->vfs_inode);
- return -EIO;
+ return PTR_ERR(dir);
}
ret = btrfs_add_link(trans, dir, inode, name, 1, index);
@@ -1844,9 +1892,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
bool update_size = true;
bool name_added = false;
- dir = read_one_inode(root, key->objectid);
- if (!dir)
- return -EIO;
+ dir = btrfs_iget_logging(key->objectid, root);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
if (ret)
@@ -2146,9 +2194,10 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
btrfs_dir_item_key_to_cpu(eb, di, &location);
btrfs_release_path(path);
btrfs_release_path(log_path);
- inode = read_one_inode(root, location.objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(location.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
goto out;
}
@@ -2284,7 +2333,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_root *log,
struct btrfs_path *path,
- u64 dirid, int del_all)
+ u64 dirid, bool del_all)
{
u64 range_start;
u64 range_end;
@@ -2300,14 +2349,17 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
if (!log_path)
return -ENOMEM;
- dir = read_one_inode(root, dirid);
- /* it isn't an error if the inode isn't there, that can happen
- * because we replay the deletes before we copy in the inode item
- * from the log
+ dir = btrfs_iget_logging(dirid, root);
+ /*
+ * It isn't an error if the inode isn't there; that can happen because
+ * we replay the deletes before we copy in the inode item from the log.
*/
- if (!dir) {
+ if (IS_ERR(dir)) {
btrfs_free_path(log_path);
- return 0;
+ ret = PTR_ERR(dir);
+ if (ret == -ENOENT)
+ ret = 0;
+ return ret;
}
range_start = 0;
@@ -2443,8 +2495,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
break;
mode = btrfs_inode_mode(eb, inode_item);
if (S_ISDIR(mode)) {
- ret = replay_dir_deletes(wc->trans,
- root, log, path, key.objectid, 0);
+ ret = replay_dir_deletes(wc->trans, root, log, path,
+ key.objectid, false);
if (ret)
break;
}
@@ -2466,9 +2518,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
struct btrfs_inode *inode;
u64 from;
- inode = read_one_inode(root, key.objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(key.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
break;
}
from = ALIGN(i_size_read(&inode->vfs_inode),
@@ -2519,9 +2571,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
key.type == BTRFS_INODE_EXTREF_KEY) {
ret = add_inode_ref(wc->trans, root, log, path,
eb, i, &key);
- if (ret && ret != -ENOENT)
+ if (ret)
break;
- ret = 0;
} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
ret = replay_one_extent(wc->trans, root, path,
eb, i, &key);
@@ -2720,7 +2771,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
level = btrfs_header_level(log->node);
orig_level = level;
path->nodes[level] = log->node;
- atomic_inc(&log->node->refs);
+ refcount_inc(&log->node->refs);
path->slots[level] = 0;
while (1) {
@@ -2961,9 +3012,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
if (log_transid % 2 == 0)
- mark = EXTENT_DIRTY;
+ mark = EXTENT_DIRTY_LOG1;
else
- mark = EXTENT_NEW;
+ mark = EXTENT_DIRTY_LOG2;
/* we start IO on all the marked extents here, but we don't actually
* wait for them until later.
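The parity choice above appears to give each of the two possibly in-flight log transactions its own dirty bit, so that writing and waiting on one log transid does not consume the other's marked extents. A tiny sketch of the selection, with illustrative flag values rather than the kernel's:

#include <stdio.h>

#define DEMO_DIRTY_LOG1 (1u << 0)	/* illustrative values only */
#define DEMO_DIRTY_LOG2 (1u << 1)

static unsigned int mark_for(unsigned long long log_transid)
{
	/* Even transids use LOG1, odd use LOG2, as in btrfs_sync_log(). */
	return (log_transid % 2 == 0) ? DEMO_DIRTY_LOG1 : DEMO_DIRTY_LOG2;
}

int main(void)
{
	for (unsigned long long t = 0; t < 4; t++)
		printf("transid %llu -> mark %u\n", t, mark_for(t));
	return 0;
}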
@@ -3094,7 +3145,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_write_marked_extents(fs_info,
&log_root_tree->dirty_log_pages,
- EXTENT_DIRTY | EXTENT_NEW);
+ EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
blk_finish_plug(&plug);
/*
* As described above, -EAGAIN indicates a hole in the extents. We
@@ -3114,7 +3165,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_wait_tree_log_extents(log, mark);
if (!ret)
ret = btrfs_wait_tree_log_extents(log_root_tree,
- EXTENT_NEW | EXTENT_DIRTY);
+ EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
if (ret) {
btrfs_set_log_full_commit(trans);
mutex_unlock(&log_root_tree->log_mutex);
@@ -3240,9 +3291,9 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
*/
btrfs_write_marked_extents(log->fs_info,
&log->dirty_log_pages,
- EXTENT_DIRTY | EXTENT_NEW);
+ EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
btrfs_wait_tree_log_extents(log,
- EXTENT_DIRTY | EXTENT_NEW);
+ EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
if (trans)
btrfs_abort_transaction(trans, ret);
@@ -3432,7 +3483,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans,
* inode item because on log replay we update the field to reflect
* all existing entries in the directory (see overwrite_item()).
*/
- return btrfs_delete_one_dir_name(trans, log, path, di);
+ return btrfs_del_item(trans, log, path);
}
/*
@@ -3472,26 +3523,27 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
return;
}
- ret = join_running_log_trans(root);
- if (ret)
- return;
-
- mutex_lock(&dir->log_mutex);
-
path = btrfs_alloc_path();
if (!path) {
- ret = -ENOMEM;
- goto out_unlock;
+ btrfs_set_log_full_commit(trans);
+ return;
}
+ ret = join_running_log_trans(root);
+ ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret);
+ if (WARN_ON(ret))
+ goto out;
+
+ mutex_lock(&dir->log_mutex);
+
ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
name, index);
- btrfs_free_path(path);
-out_unlock:
mutex_unlock(&dir->log_mutex);
if (ret < 0)
btrfs_set_log_full_commit(trans);
btrfs_end_log_trans(root);
+out:
+ btrfs_free_path(path);
}
/* see comments for btrfs_del_dir_entries_in_log */
@@ -3501,7 +3553,6 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 dirid)
{
struct btrfs_root *log;
- u64 index;
int ret;
ret = inode_logged(trans, inode, NULL);
@@ -3513,13 +3564,13 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
}
ret = join_running_log_trans(root);
- if (ret)
+ ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret);
+ if (WARN_ON(ret))
return;
log = root->log_root;
mutex_lock(&inode->log_mutex);
- ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode),
- dirid, &index);
+ ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode), dirid, NULL);
mutex_unlock(&inode->log_mutex);
if (ret < 0 && ret != -ENOENT)
btrfs_set_log_full_commit(trans);
@@ -3684,7 +3735,7 @@ static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx)
* Add extra ref to scratch eb so that it is not freed when callers
* release the path, so we can reuse it later if needed.
*/
- atomic_inc(&ctx->scratch_eb->refs);
+ refcount_inc(&ctx->scratch_eb->refs);
return 0;
}
@@ -4172,44 +4223,37 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
struct inode *inode, int log_inode_only,
u64 logged_isize)
{
- struct btrfs_map_token token;
u64 flags;
- btrfs_init_map_token(&token, leaf);
-
if (log_inode_only) {
/* set the generation to zero so the recovery code
* can tell the difference between a logging
* just to say 'this inode exists' and a logging
* to say 'update this inode with these values'
*/
- btrfs_set_token_inode_generation(&token, item, 0);
- btrfs_set_token_inode_size(&token, item, logged_isize);
+ btrfs_set_inode_generation(leaf, item, 0);
+ btrfs_set_inode_size(leaf, item, logged_isize);
} else {
- btrfs_set_token_inode_generation(&token, item,
- BTRFS_I(inode)->generation);
- btrfs_set_token_inode_size(&token, item, inode->i_size);
+ btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
+ btrfs_set_inode_size(leaf, item, inode->i_size);
}
- btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
- btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
- btrfs_set_token_inode_mode(&token, item, inode->i_mode);
- btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
+ btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
+ btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
+ btrfs_set_inode_mode(leaf, item, inode->i_mode);
+ btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
- btrfs_set_token_timespec_sec(&token, &item->atime,
- inode_get_atime_sec(inode));
- btrfs_set_token_timespec_nsec(&token, &item->atime,
- inode_get_atime_nsec(inode));
+ btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode));
- btrfs_set_token_timespec_sec(&token, &item->mtime,
- inode_get_mtime_sec(inode));
- btrfs_set_token_timespec_nsec(&token, &item->mtime,
- inode_get_mtime_nsec(inode));
+ btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode));
- btrfs_set_token_timespec_sec(&token, &item->ctime,
- inode_get_ctime_sec(inode));
- btrfs_set_token_timespec_nsec(&token, &item->ctime,
- inode_get_ctime_nsec(inode));
+ btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode));
+
+ btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec);
+ btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec);
/*
* We do not need to set the nbytes field; in fact, during a fast fsync
@@ -4220,13 +4264,13 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
* inode item in subvolume tree as needed (see overwrite_item()).
*/
- btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
- btrfs_set_token_inode_transid(&token, item, trans->transid);
- btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
+ btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode));
+ btrfs_set_inode_transid(leaf, item, trans->transid);
+ btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
BTRFS_I(inode)->ro_flags);
- btrfs_set_token_inode_flags(&token, item, flags);
- btrfs_set_token_inode_block_group(&token, item, 0);
+ btrfs_set_inode_flags(leaf, item, flags);
+ btrfs_set_inode_block_group(leaf, item, 0);
}
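
For reference, the two accessor styles swapped above differ only in how they locate fields inside the extent buffer; a sketch assuming the usual BTRFS_SETGET_FUNCS-generated helpers (fragment, not compilable standalone):

	struct btrfs_map_token token;

	/* Token style (removed): cache the mapped location across calls. */
	btrfs_init_map_token(&token, leaf);
	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));

	/* Direct style (used now): each setter resolves the field itself. */
	btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
	btrfs_set_inode_gid(leaf, item, i_gid_read(inode));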
static int log_inode_item(struct btrfs_trans_handle *trans,
@@ -7192,8 +7236,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
struct btrfs_key key;
- struct btrfs_key found_key;
- struct btrfs_root *log;
struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
struct walk_control wc = {
.process_func = process_one_buffer,
@@ -7227,6 +7269,9 @@ again:
key.offset = (u64)-1;
while (1) {
+ struct btrfs_root *log;
+ struct btrfs_key found_key;
+
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
if (ret < 0) {
@@ -7255,6 +7300,12 @@ again:
true);
if (IS_ERR(wc.replay_dest)) {
ret = PTR_ERR(wc.replay_dest);
+ wc.replay_dest = NULL;
+ if (ret != -ENOENT) {
+ btrfs_put_root(log);
+ btrfs_abort_transaction(trans, ret);
+ goto error;
+ }
/*
* We didn't find the subvol, likely because it was
@@ -7267,36 +7318,36 @@ again:
* block from being modified, and we'll just bail for
* each subsequent pass.
*/
- if (ret == -ENOENT)
- ret = btrfs_pin_extent_for_log_replay(trans, log->node);
- btrfs_put_root(log);
-
- if (!ret)
- goto next;
- btrfs_abort_transaction(trans, ret);
- goto error;
+ ret = btrfs_pin_extent_for_log_replay(trans, log->node);
+ if (ret) {
+ btrfs_put_root(log);
+ btrfs_abort_transaction(trans, ret);
+ goto error;
+ }
+ goto next;
}
wc.replay_dest->log_root = log;
ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
- if (ret)
- /* The loop needs to continue due to the root refs */
+ if (ret) {
btrfs_abort_transaction(trans, ret);
- else
- ret = walk_log_tree(trans, log, &wc);
+ goto next;
+ }
- if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
- ret = fixup_inode_link_counts(trans, wc.replay_dest,
- path);
- if (ret)
- btrfs_abort_transaction(trans, ret);
+ ret = walk_log_tree(trans, log, &wc);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto next;
}
- if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
+ if (wc.stage == LOG_WALK_REPLAY_ALL) {
struct btrfs_root *root = wc.replay_dest;
- btrfs_release_path(path);
-
+ ret = fixup_inode_link_counts(trans, wc.replay_dest, path);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto next;
+ }
/*
* We have just replayed everything, and the highest
* objectid of fs roots probably has changed in case
@@ -7306,17 +7357,20 @@ again:
* could only happen during mount.
*/
ret = btrfs_init_root_free_objectid(root);
- if (ret)
+ if (ret) {
btrfs_abort_transaction(trans, ret);
+ goto next;
+ }
+ }
+next:
+ if (wc.replay_dest) {
+ wc.replay_dest->log_root = NULL;
+ btrfs_put_root(wc.replay_dest);
}
-
- wc.replay_dest->log_root = NULL;
- btrfs_put_root(wc.replay_dest);
btrfs_put_root(log);
if (ret)
goto error;
-next:
if (found_key.offset == 0)
break;
key.offset = found_key.offset - 1;
@@ -7447,6 +7501,8 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
* full log sync.
* Also we don't need to worry about renames, since btrfs_rename() marks the log
* for full commit when renaming a subvolume.
+ *
+ * Must be called before creating the subvolume entry in its parent directory.
*/
void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
struct btrfs_inode *dir)
@@ -7483,6 +7539,9 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
bool log_pinned = false;
int ret;
+ btrfs_init_log_ctx(&ctx, inode);
+ ctx.logging_new_name = true;
+
/*
* this will force the logging code to walk the dentry chain
* up for the file
@@ -7514,6 +7573,13 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
ret = 0;
/*
+ * Now that we know we need to update the log, allocate the scratch eb
+ * for the context before joining a log transaction below, as this can
+ * take time and therefore we could delay log commits from other tasks.
+ */
+ btrfs_init_log_ctx_scratch_eb(&ctx);
+
+ /*
* If we are doing a rename (old_dir is not NULL) from a directory that
* was previously logged, make sure that on log replay we get the old
* dir entry deleted. This is needed because we will also log the new
@@ -7531,6 +7597,14 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
&old_dentry->d_name, 0, &fname);
if (ret)
goto out;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ fscrypt_free_filename(&fname);
+ goto out;
+ }
+
/*
* We have two inodes to update in the log, the old directory and
* the inode that got renamed, so we must pin the log to prevent
@@ -7544,19 +7618,13 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* mark the log for a full commit.
*/
if (WARN_ON_ONCE(ret < 0)) {
+ btrfs_free_path(path);
fscrypt_free_filename(&fname);
goto out;
}
log_pinned = true;
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- fscrypt_free_filename(&fname);
- goto out;
- }
-
/*
* Another concurrent task might be logging the old directory,
* as it can be triggered when logging another inode that had or
@@ -7588,9 +7656,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
goto out;
}
- btrfs_init_log_ctx(&ctx, inode);
- ctx.logging_new_name = true;
- btrfs_init_log_ctx_scratch_eb(&ctx);
/*
* We don't care about the return value. If we fail to log the new name
* then we know the next attempt to sync the log will fall back to a full
@@ -7599,7 +7664,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* inconsistent state after a rename operation.
*/
btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
- free_extent_buffer(ctx.scratch_eb);
ASSERT(list_empty(&ctx.conflict_inodes));
out:
/*
@@ -7612,5 +7676,6 @@ out:
btrfs_set_log_full_commit(trans);
if (log_pinned)
btrfs_end_log_trans(root);
+ free_extent_buffer(ctx.scratch_eb);
}
diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c
index 1ac2678fc4ca..9e8cb3b7c064 100644
--- a/fs/btrfs/tree-mod-log.c
+++ b/fs/btrfs/tree-mod-log.c
@@ -27,18 +27,29 @@ struct tree_mod_elem {
/* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
u64 generation;
- /* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
- struct btrfs_disk_key key;
- u64 blockptr;
-
- /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
- struct {
- int dst_slot;
- int nr_items;
- } move;
-
- /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
- struct tree_mod_root old_root;
+ union {
+ /*
+ * This is used for the following op types:
+ *
+ * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING
+ * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING
+ * BTRFS_MOD_LOG_KEY_REMOVE
+ * BTRFS_MOD_LOG_KEY_REPLACE
+ */
+ struct {
+ struct btrfs_disk_key key;
+ u64 blockptr;
+ } slot_change;
+
+ /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
+ struct {
+ int dst_slot;
+ int nr_items;
+ } move;
+
+ /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
+ struct tree_mod_root old_root;
+ };
};
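
The overlap is safe because each op type uses exactly one of the three payloads, so the union shrinks every tree_mod_elem allocation by roughly the size of the two unused members. A userspace sketch of the effect (field types are stand-ins, not the kernel definitions):

#include <stdio.h>
#include <stdint.h>

struct key { uint64_t objectid, offset; uint8_t type; };

struct payloads_old {			/* payloads side by side */
	struct key key;
	uint64_t blockptr;
	struct { int dst_slot, nr_items; } move;
	struct { uint64_t logical; uint8_t level; } old_root;
};

struct payloads_new {			/* payloads overlapped */
	union {
		struct { struct key key; uint64_t blockptr; } slot_change;
		struct { int dst_slot, nr_items; } move;
		struct { uint64_t logical; uint8_t level; } old_root;
	};
};

int main(void)
{
	printf("old %zu bytes, new %zu bytes\n",
	       sizeof(struct payloads_old), sizeof(struct payloads_new));
	return 0;
}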
/*
@@ -164,6 +175,30 @@ static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
return 0;
}
+static inline bool skip_eb_logging(const struct extent_buffer *eb)
+{
+ const u64 owner = btrfs_header_owner(eb);
+
+ if (btrfs_header_level(eb) == 0)
+ return true;
+
+ /*
+ * Tree mod logging exists so that there's a consistent view of the
+ * extents and backrefs of inodes even if, while a task is iterating over
+ * them, other tasks are modifying subvolume trees and the extent tree
+ * (including running delayed refs). So we only need to log extent
+ * buffers from the extent tree and subvolume trees.
+ */
+
+ if (owner == BTRFS_EXTENT_TREE_OBJECTID)
+ return false;
+
+ if (btrfs_is_fstree(owner))
+ return false;
+
+ return true;
+}
+
/*
* Determines if logging can be omitted. Returns true if it can. Otherwise, it
* returns false with the tree_mod_log_lock acquired. The caller must hold
@@ -174,7 +209,7 @@ static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, const struct extent
{
if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
return true;
- if (eb && btrfs_header_level(eb) == 0)
+ if (eb && skip_eb_logging(eb))
return true;
write_lock(&fs_info->tree_mod_log_lock);
@@ -192,7 +227,7 @@ static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
{
if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
return false;
- if (eb && btrfs_header_level(eb) == 0)
+ if (eb && skip_eb_logging(eb))
return false;
return true;
@@ -204,15 +239,17 @@ static struct tree_mod_elem *alloc_tree_mod_elem(const struct extent_buffer *eb,
{
struct tree_mod_elem *tm;
+ /* Can't be one of these types, due to the union in struct tree_mod_elem. */
+ ASSERT(op != BTRFS_MOD_LOG_MOVE_KEYS);
+ ASSERT(op != BTRFS_MOD_LOG_ROOT_REPLACE);
+
tm = kzalloc(sizeof(*tm), GFP_NOFS);
if (!tm)
return NULL;
tm->logical = eb->start;
- if (op != BTRFS_MOD_LOG_KEY_ADD) {
- btrfs_node_key(eb, &tm->key, slot);
- tm->blockptr = btrfs_node_blockptr(eb, slot);
- }
+ btrfs_node_key(eb, &tm->slot_change.key, slot);
+ tm->slot_change.blockptr = btrfs_node_blockptr(eb, slot);
tm->op = op;
tm->slot = slot;
tm->generation = btrfs_node_ptr_generation(eb, slot);
@@ -830,8 +867,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
fallthrough;
case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
case BTRFS_MOD_LOG_KEY_REMOVE:
- btrfs_set_node_key(eb, &tm->key, tm->slot);
- btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_key(eb, &tm->slot_change.key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->slot_change.blockptr);
btrfs_set_node_ptr_generation(eb, tm->slot,
tm->generation);
n++;
@@ -840,8 +877,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
break;
case BTRFS_MOD_LOG_KEY_REPLACE:
BUG_ON(tm->slot >= n);
- btrfs_set_node_key(eb, &tm->key, tm->slot);
- btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_key(eb, &tm->slot_change.key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->slot_change.blockptr);
btrfs_set_node_ptr_generation(eb, tm->slot,
tm->generation);
break;
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index fc59b57257d6..7e16a253fb35 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -129,21 +129,25 @@ void ulist_free(struct ulist *ulist)
kfree(ulist);
}
+static int ulist_node_val_key_cmp(const void *key, const struct rb_node *node)
+{
+ const u64 *val = key;
+ const struct ulist_node *unode = rb_entry(node, struct ulist_node, rb_node);
+
+ if (unode->val < *val)
+ return 1;
+ else if (unode->val > *val)
+ return -1;
+
+ return 0;
+}
+
static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val)
{
- struct rb_node *n = ulist->root.rb_node;
- struct ulist_node *u = NULL;
-
- while (n) {
- u = rb_entry(n, struct ulist_node, rb_node);
- if (u->val < val)
- n = n->rb_right;
- else if (u->val > val)
- n = n->rb_left;
- else
- return u;
- }
- return NULL;
+ struct rb_node *node;
+
+ node = rb_find(&val, &ulist->root, ulist_node_val_key_cmp);
+ return rb_entry_safe(node, struct ulist_node, rb_node);
}
static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node)
@@ -155,25 +159,20 @@ static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node)
ulist->nnodes--;
}
+static int ulist_node_val_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ const struct ulist_node *unode = rb_entry(new, struct ulist_node, rb_node);
+
+ return ulist_node_val_key_cmp(&unode->val, existing);
+}
+
static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins)
{
- struct rb_node **p = &ulist->root.rb_node;
- struct rb_node *parent = NULL;
- struct ulist_node *cur = NULL;
-
- while (*p) {
- parent = *p;
- cur = rb_entry(parent, struct ulist_node, rb_node);
-
- if (cur->val < ins->val)
- p = &(*p)->rb_right;
- else if (cur->val > ins->val)
- p = &(*p)->rb_left;
- else
- return -EEXIST;
- }
- rb_link_node(&ins->rb_node, parent, p);
- rb_insert_color(&ins->rb_node, &ulist->root);
+ struct rb_node *node;
+
+ node = rb_find_add(&ins->rb_node, &ulist->root, ulist_node_val_cmp);
+ if (node)
+ return -EEXIST;
return 0;
}
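
Both conversions rely on the generic helpers from <linux/rbtree.h>: the search callback compares a key against a node and rb_find() descends left on a negative return and right on a positive one, while rb_find_add() links, rebalances and returns the already-present node (NULL on success), replacing the hand-rolled walk and -EEXIST logic. Their in-kernel prototypes, for reference:

	struct rb_node *rb_find(const void *key, const struct rb_root *tree,
				int (*cmp)(const void *key, const struct rb_node *));
	struct rb_node *rb_find_add(struct rb_node *node, struct rb_root *tree,
				    int (*cmp)(struct rb_node *, const struct rb_node *));

Note how ulist_node_val_cmp() reuses the key comparator by passing the new node's value as the key, keeping a single source of truth for the ordering.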
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 89835071cfea..fa7a929a0461 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -18,7 +18,6 @@
#include "transaction.h"
#include "volumes.h"
#include "raid56.h"
-#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
@@ -214,10 +213,8 @@ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
u64 flags = bg_flags;
u32 size_bp = size_buf;
- if (!flags) {
- strcpy(bp, "NONE");
+ if (!flags)
return;
- }
#define DESCRIBE_FLAG(flag, desc) \
do { \
@@ -403,7 +400,11 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
static void btrfs_free_device(struct btrfs_device *device)
{
WARN_ON(!list_empty(&device->post_commit_list));
- rcu_string_free(device->name);
+ /*
+ * No need to call kfree_rcu() nor do RCU lock/unlock, nothing is
+ * reading the device name.
+ */
+ kfree(rcu_dereference_raw(device->name));
btrfs_extent_io_tree_release(&device->alloc_state);
btrfs_destroy_dev_zone_info(device);
kfree(device);
@@ -414,6 +415,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
struct btrfs_device *device;
WARN_ON(fs_devices->opened);
+ WARN_ON(fs_devices->holding);
while (!list_empty(&fs_devices->devices)) {
device = list_first_entry(&fs_devices->devices,
struct btrfs_device, dev_list);
@@ -473,7 +475,7 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
struct block_device *bdev;
int ret;
- *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL);
+ *bdev_file = bdev_file_open_by_path(device_path, flags, holder, &fs_holder_ops);
if (IS_ERR(*bdev_file)) {
ret = PTR_ERR(*bdev_file);
@@ -488,7 +490,7 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
if (holder) {
ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE);
if (ret) {
- fput(*bdev_file);
+ bdev_fput(*bdev_file);
goto error;
}
}
@@ -496,7 +498,7 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
*disk_super = btrfs_read_disk_super(bdev, 0, false);
if (IS_ERR(*disk_super)) {
ret = PTR_ERR(*disk_super);
- fput(*bdev_file);
+ bdev_fput(*bdev_file);
goto error;
}
@@ -541,7 +543,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device
continue;
if (devt && devt != device->devt)
continue;
- if (fs_devices->opened) {
+ if (fs_devices->opened || fs_devices->holding) {
if (devt)
ret = -EBUSY;
break;
@@ -657,7 +659,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
if (!device->name)
return -EINVAL;
- ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
+ ret = btrfs_get_bdev_and_sb(rcu_dereference_raw(device->name), flags, holder, 1,
&bdev_file, &disk_super);
if (ret)
return ret;
@@ -674,8 +676,8 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
if (btrfs_super_incompat_flags(disk_super) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
- pr_err(
- "BTRFS: Invalid seeding and uuid-changed device detected\n");
+ btrfs_err(NULL,
+ "invalid seeding and uuid-changed device detected");
goto error_free_page;
}
@@ -701,7 +703,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
if (device->devt != device->bdev->bd_dev) {
btrfs_warn(NULL,
"device %s maj:min changed from %d:%d to %d:%d",
- device->name->str, MAJOR(device->devt),
+ rcu_dereference_raw(device->name), MAJOR(device->devt),
MINOR(device->devt), MAJOR(device->bdev->bd_dev),
MINOR(device->bdev->bd_dev));
@@ -720,7 +722,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
error_free_page:
btrfs_release_disk_super(disk_super);
- fput(bdev_file);
+ bdev_fput(bdev_file);
return -EINVAL;
}
@@ -749,7 +751,7 @@ static bool is_same_device(struct btrfs_device *device, const char *new_path)
goto out;
rcu_read_lock();
- ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX);
+ ret = strscpy(old_path, rcu_dereference(device->name), PATH_MAX);
rcu_read_unlock();
if (ret < 0)
goto out;
@@ -782,11 +784,11 @@ static noinline struct btrfs_device *device_list_add(const char *path,
{
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices = NULL;
- struct rcu_string *name;
+ const char *name;
u64 found_transid = btrfs_super_generation(disk_super);
u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_t path_devt;
- int error;
+ int ret;
bool same_fsid_diff_dev = false;
bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
@@ -798,11 +800,11 @@ static noinline struct btrfs_device *device_list_add(const char *path,
return ERR_PTR(-EAGAIN);
}
- error = lookup_bdev(path, &path_devt);
- if (error) {
+ ret = lookup_bdev(path, &path_devt);
+ if (ret) {
btrfs_err(NULL, "failed to lookup block device for path %s: %d",
- path, error);
- return ERR_PTR(error);
+ path, ret);
+ return ERR_PTR(ret);
}
fs_devices = find_fsid_by_device(disk_super, path_devt, &same_fsid_diff_dev);
@@ -819,7 +821,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (same_fsid_diff_dev) {
generate_random_uuid(fs_devices->fsid);
fs_devices->temp_fsid = true;
- pr_info("BTRFS: device %s (%d:%d) using temp-fsid %pU\n",
+ btrfs_info(NULL, "device %s (%d:%d) using temp-fsid %pU",
path, MAJOR(path_devt), MINOR(path_devt),
fs_devices->fsid);
}
@@ -890,6 +892,8 @@ static noinline struct btrfs_device *device_list_add(const char *path,
current->comm, task_pid_nr(current));
} else if (!device->name || !is_same_device(device, path)) {
+ const char *old_name;
+
/*
* When the FS is already mounted:
* 1. If you are here and if the device->name is NULL that
@@ -943,27 +947,31 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (device->bdev) {
if (device->devt != path_devt) {
mutex_unlock(&fs_devices->device_list_mutex);
- btrfs_warn_in_rcu(NULL,
+ btrfs_warn(NULL,
"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
path, devid, found_transid,
current->comm,
task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
- btrfs_info_in_rcu(NULL,
+ btrfs_info(NULL,
"devid %llu device path %s changed to %s scanned by %s (%d)",
devid, btrfs_dev_name(device),
path, current->comm,
task_pid_nr(current));
}
- name = rcu_string_strdup(path, GFP_NOFS);
+ name = kstrdup(path, GFP_NOFS);
if (!name) {
mutex_unlock(&fs_devices->device_list_mutex);
return ERR_PTR(-ENOMEM);
}
- rcu_string_free(device->name);
+ rcu_read_lock();
+ old_name = rcu_dereference(device->name);
+ rcu_read_unlock();
rcu_assign_pointer(device->name, name);
+ kfree_rcu_mightsleep(old_name);
+
if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
fs_devices->missing_devices--;
clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
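
The name replacement above is the standard RCU publish/retire sequence, with writers serialized by device_list_mutex. A condensed sketch of the pattern for a const char __rcu * field (hypothetical helpers, assuming some external writer-side lock):

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/string.h>

static const char __rcu *name;

/* Writer: callers serialize updates with a mutex. */
static int update_name(const char *path)
{
	const char *new_name = kstrdup(path, GFP_NOFS);
	const char *old_name;

	if (!new_name)
		return -ENOMEM;
	old_name = rcu_dereference_protected(name, true);
	rcu_assign_pointer(name, new_name);
	kfree_rcu_mightsleep(old_name);	/* free only after a grace period */
	return 0;
}

/* Reader: dereference only inside an RCU read-side critical section. */
static size_t name_len(void)
{
	size_t len;

	rcu_read_lock();
	len = strlen(rcu_dereference(name));
	rcu_read_unlock();
	return len;
}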
@@ -1012,7 +1020,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
* uuid mutex so nothing we touch in here is going to disappear.
*/
if (orig_dev->name)
- dev_path = orig_dev->name->str;
+ dev_path = rcu_dereference_raw(orig_dev->name);
device = btrfs_alloc_device(NULL, &orig_dev->devid,
orig_dev->uuid, dev_path);
@@ -1070,7 +1078,7 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
continue;
if (device->bdev_file) {
- fput(device->bdev_file);
+ bdev_fput(device->bdev_file);
device->bdev = NULL;
device->bdev_file = NULL;
fs_devices->open_devices--;
@@ -1117,7 +1125,7 @@ static void btrfs_close_bdev(struct btrfs_device *device)
invalidate_bdev(device->bdev);
}
- fput(device->bdev_file);
+ bdev_fput(device->bdev_file);
}
static void btrfs_close_one_device(struct btrfs_device *device)
@@ -1197,7 +1205,7 @@ void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
mutex_lock(&uuid_mutex);
close_fs_devices(fs_devices);
- if (!fs_devices->opened) {
+ if (!fs_devices->opened && !fs_devices->holding) {
list_splice_init(&fs_devices->seed_list, &list);
/*
@@ -1414,7 +1422,7 @@ static bool btrfs_skip_registration(struct btrfs_super_block *disk_super,
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (device->bdev && (device->bdev->bd_dev == devt) &&
- strcmp(device->name->str, path) != 0) {
+ strcmp(rcu_dereference_raw(device->name), path) != 0) {
mutex_unlock(&fs_devices->device_list_mutex);
/* Do not skip registration. */
@@ -1440,7 +1448,7 @@ static bool btrfs_skip_registration(struct btrfs_super_block *disk_super,
* the device or return an error. Multi-device and seeding devices are registered
* in both cases.
*/
-struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
+struct btrfs_device *btrfs_scan_one_device(const char *path,
bool mount_arg_dev)
{
struct btrfs_super_block *disk_super;
@@ -1461,7 +1469,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
* values temporarily, as the device paths of the fsid are the only
* required information for assembling the volume.
*/
- bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL);
+ bdev_file = bdev_file_open_by_path(path, BLK_OPEN_READ, NULL, NULL);
if (IS_ERR(bdev_file))
return ERR_CAST(bdev_file);
@@ -1473,7 +1481,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
devt = file_bdev(bdev_file)->bd_dev;
if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) {
- pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n",
+ btrfs_debug(NULL, "skip registering single non-seed device %s (%d:%d)",
path, MAJOR(devt), MINOR(devt));
btrfs_free_stale_devices(devt, NULL);
@@ -1490,7 +1498,7 @@ free_disk_super:
btrfs_release_disk_super(disk_super);
error_bdev_put:
- fput(bdev_file);
+ bdev_fput(bdev_file);
return device;
}
@@ -2164,7 +2172,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_devic
btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
/* Update ctime/mtime for device path for libblkid */
- update_dev_time(device->name->str);
+ update_dev_time(rcu_dereference_raw(device->name));
}
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
@@ -2204,7 +2212,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
}
if (btrfs_pinned_by_swapfile(fs_info, device)) {
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"cannot remove device %s (devid %llu) due to active swapfile",
btrfs_dev_name(device), device->devid);
return -ETXTBSY;
@@ -2294,7 +2302,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
* free the device.
*
* We cannot call btrfs_close_bdev() here because we're holding the sb
- * write lock, and fput() on the block device will pull in the
+ * write lock, and bdev_fput() on the block device will pull in the
* ->open_mutex on the block device and its dependencies. Instead
* just flush the device and let the caller do the final bdev_release.
*/
@@ -2473,7 +2481,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
else
memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
btrfs_release_disk_super(disk_super);
- fput(bdev_file);
+ bdev_fput(bdev_file);
return 0;
}
@@ -2705,7 +2713,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
return -EROFS;
bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
- fs_info->bdev_holder, NULL);
+ fs_info->sb, &fs_holder_ops);
if (IS_ERR(bdev_file))
return PTR_ERR(bdev_file);
@@ -2921,7 +2929,7 @@ error_free_zone:
error_free_device:
btrfs_free_device(device);
error:
- fput(bdev_file);
+ bdev_fput(bdev_file);
if (locked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
@@ -3282,6 +3290,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
device->bytes_used - dev_extent_len);
atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
btrfs_clear_space_info_full(fs_info);
+
+ if (list_empty(&device->post_commit_list)) {
+ list_add_tail(&device->post_commit_list,
+ &trans->transaction->dev_update_list);
+ }
+
mutex_unlock(&fs_info->chunk_mutex);
}
}
@@ -3398,7 +3412,8 @@ out:
return ret;
}
-int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
+int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ bool verbose)
{
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_trans_handle *trans;
@@ -3428,7 +3443,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
/* step one, relocate all the extents inside this chunk */
btrfs_scrub_pause(fs_info);
- ret = btrfs_relocate_block_group(fs_info, chunk_offset);
+ ret = btrfs_relocate_block_group(fs_info, chunk_offset, true);
btrfs_scrub_continue(fs_info);
if (ret) {
/*
@@ -3538,7 +3553,8 @@ again:
btrfs_release_path(path);
if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
- ret = btrfs_relocate_chunk(fs_info, found_key.offset);
+ ret = btrfs_relocate_chunk(fs_info, found_key.offset,
+ true);
if (ret == -ENOSPC)
failed++;
else
@@ -4203,7 +4219,7 @@ again:
}
}
- ret = btrfs_relocate_chunk(fs_info, found_key.offset);
+ ret = btrfs_relocate_chunk(fs_info, found_key.offset, true);
mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret == -ENOSPC) {
enospc_errors++;
@@ -4971,7 +4987,7 @@ again:
goto done;
}
- ret = btrfs_relocate_chunk(fs_info, chunk_offset);
+ ret = btrfs_relocate_chunk(fs_info, chunk_offset, true);
mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret == -ENOSPC) {
failed++;
@@ -5003,8 +5019,8 @@ again:
mutex_lock(&fs_info->chunk_mutex);
/* Clear all state bits beyond the shrunk device size */
- btrfs_clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
- CHUNK_STATE_MASK);
+ btrfs_clear_extent_bit(&device->alloc_state, new_size, (u64)-1,
+ CHUNK_STATE_MASK, NULL);
btrfs_device_set_disk_total_bytes(device, new_size);
if (list_empty(&device->post_commit_list))
@@ -5431,9 +5447,9 @@ static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned in
struct btrfs_io_stripe *stripe = &map->stripes[i];
struct btrfs_device *device = stripe->dev;
- btrfs_clear_extent_bits(&device->alloc_state, stripe->physical,
- stripe->physical + map->stripe_size - 1,
- bits | EXTENT_NOWAIT);
+ btrfs_clear_extent_bit(&device->alloc_state, stripe->physical,
+ stripe->physical + map->stripe_size - 1,
+ bits | EXTENT_NOWAIT, NULL);
}
}
@@ -6917,9 +6933,9 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
generate_random_uuid(dev->uuid);
if (path) {
- struct rcu_string *name;
+ const char *name;
- name = rcu_string_strdup(path, GFP_KERNEL);
+ name = kstrdup(path, GFP_KERNEL);
if (!name) {
btrfs_free_device(dev);
return ERR_PTR(-ENOMEM);
@@ -7168,7 +7184,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
if (IS_ERR(fs_devices))
return fs_devices;
- ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->bdev_holder);
+ ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->sb);
if (ret) {
free_fs_devices(fs_devices);
return ERR_PTR(ret);
@@ -7700,7 +7716,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
return -ENOMEM;
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
if (ret < 0) {
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"error %d while searching for dev_stats item for device %s",
ret, btrfs_dev_name(device));
goto out;
@@ -7711,7 +7727,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
/* need to delete old one and insert a new one */
ret = btrfs_del_item(trans, dev_root, path);
if (ret != 0) {
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"delete too small dev_stats item for device %s failed %d",
btrfs_dev_name(device), ret);
goto out;
@@ -7725,7 +7741,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, dev_root, path,
&key, sizeof(*ptr));
if (ret < 0) {
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"insert dev_stats item for device %s failed %d",
btrfs_dev_name(device), ret);
goto out;
@@ -7788,7 +7804,7 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
if (!dev->dev_stats_valid)
return;
- btrfs_err_rl_in_rcu(dev->fs_info,
+ btrfs_err_rl(dev->fs_info,
"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
btrfs_dev_name(dev),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
@@ -7808,7 +7824,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
if (i == BTRFS_DEV_STAT_VALUES_MAX)
return; /* all values == 0, suppress message */
- btrfs_info_in_rcu(dev->fs_info,
+ btrfs_info(dev->fs_info,
"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
btrfs_dev_name(dev),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
@@ -7932,7 +7948,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
/*
- * Very old mkfs.btrfs (before v4.1) will not respect the reserved
+ * Very old mkfs.btrfs (before v4.15) will not respect the reserved
* space. Although kernel can handle it without problem, better to warn
* the users.
*/
@@ -8184,7 +8200,7 @@ static int relocating_repair_kthread(void *data)
btrfs_info(fs_info,
"zoned: relocating block group %llu to repair IO failure",
target);
- ret = btrfs_relocate_chunk(fs_info, target);
+ ret = btrfs_relocate_chunk(fs_info, target, true);
out:
if (cache)
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 137cc232f58e..a56e873a3029 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -21,7 +21,6 @@
#include <uapi/linux/btrfs.h>
#include <uapi/linux/btrfs_tree.h>
#include "messages.h"
-#include "rcu-string.h"
#include "extent-io-tree.h"
struct block_device;
@@ -114,7 +113,8 @@ struct btrfs_device {
struct btrfs_fs_devices *fs_devices;
struct btrfs_fs_info *fs_info;
- struct rcu_string __rcu *name;
+ /* Device path or NULL if missing. */
+ const char __rcu *name;
u64 generation;
@@ -422,6 +422,16 @@ struct btrfs_fs_devices {
/* Count fs-devices opened. */
int opened;
+ /*
+ * Counter of the processes that are holding this fs_devices but have
+ * not yet opened it.
+ * This is for mount handling: we can only open the fs_devices after a
+ * super block is created, but we cannot take uuid_mutex during
+ * sget_fc(), so we have to hold the fs_devices (meaning it cannot be
+ * released) until a super block is returned.
+ */
+ int holding;
+
/* Set when we find or add a device that doesn't have the nonrot flag set. */
bool rotating;
/* Devices support TRIM/discard commands. */
@@ -667,7 +677,7 @@ enum btrfs_map_op {
BTRFS_MAP_GET_READ_MIRRORS,
};
-static inline enum btrfs_map_op btrfs_op(struct bio *bio)
+static inline enum btrfs_map_op btrfs_op(const struct bio *bio)
{
switch (bio_op(bio)) {
case REQ_OP_WRITE:
@@ -719,8 +729,7 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
blk_mode_t flags, void *holder);
-struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
- bool mount_arg_dev);
+struct btrfs_device *btrfs_scan_one_device(const char *path, bool mount_arg_dev);
int btrfs_forget_devices(dev_t devt);
void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices);
@@ -754,7 +763,8 @@ void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf);
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
-int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset);
+int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ bool verbose);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset);
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
@@ -846,7 +856,7 @@ static inline const char *btrfs_dev_name(const struct btrfs_device *device)
if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
return "<missing disk>";
else
- return rcu_str_deref(device->name);
+ return rcu_dereference(device->name);
}
static inline void btrfs_warn_unknown_chunk_allocation(enum btrfs_chunk_allocation_policy pol)
@@ -854,6 +864,20 @@ static inline void btrfs_warn_unknown_chunk_allocation(enum btrfs_chunk_allocati
WARN_ONCE(1, "unknown allocation policy %d, fallback to regular", pol);
}
+static inline void btrfs_fs_devices_inc_holding(struct btrfs_fs_devices *fs_devices)
+{
+ lockdep_assert_held(&uuid_mutex);
+ ASSERT(fs_devices->holding >= 0);
+ fs_devices->holding++;
+}
+
+static inline void btrfs_fs_devices_dec_holding(struct btrfs_fs_devices *fs_devices)
+{
+ lockdep_assert_held(&uuid_mutex);
+ ASSERT(fs_devices->holding > 0);
+ fs_devices->holding--;
+}
+
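
The two helpers guard the mount-time window described in the comment added to btrfs_fs_devices above. An illustrative skeleton of the intended calling sequence (not the actual mount path; test/set callbacks elided):

static int mount_hold_sketch(struct fs_context *fc,
			     struct btrfs_fs_devices *fs_devices)
{
	struct super_block *sb;

	mutex_lock(&uuid_mutex);
	btrfs_fs_devices_inc_holding(fs_devices);
	mutex_unlock(&uuid_mutex);

	/* May sleep and reenter the VFS; uuid_mutex must not be held. */
	sb = sget_fc(fc, NULL, set_anon_super_fc);

	mutex_lock(&uuid_mutex);
	btrfs_fs_devices_dec_holding(fs_devices);
	mutex_unlock(&uuid_mutex);

	return PTR_ERR_OR_ZERO(sb);
}

While holding is non-zero, btrfs_free_stale_devices() reports -EBUSY and btrfs_close_devices()/free_fs_devices() keep the structure alive, as the earlier hunks show.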
void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3e0edbcf73e1..79fb1614bd0c 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -510,14 +510,15 @@ static int btrfs_initxattrs(struct inode *inode,
*/
nofs_flag = memalloc_nofs_save();
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
- name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
- strlen(xattr->name) + 1, GFP_KERNEL);
+ const size_t name_len = XATTR_SECURITY_PREFIX_LEN +
+ strlen(xattr->name) + 1;
+
+ name = kmalloc(name_len, GFP_KERNEL);
if (!name) {
ret = -ENOMEM;
break;
}
- strcpy(name, XATTR_SECURITY_PREFIX);
- strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+ scnprintf(name, name_len, "%s%s", XATTR_SECURITY_PREFIX, xattr->name);
if (strcmp(name, XATTR_NAME_CAPS) == 0)
clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index b5b0156d5b95..245e813ecd78 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -9,7 +9,6 @@
#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
-#include "rcu-string.h"
#include "disk-io.h"
#include "block-group.h"
#include "dev-replace.h"
@@ -17,6 +16,7 @@
#include "fs.h"
#include "accessors.h"
#include "bio.h"
+#include "transaction.h"
/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES 4096
@@ -263,9 +263,9 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
copy_zone_info_cb, zones);
if (ret < 0) {
- btrfs_err_in_rcu(device->fs_info,
+ btrfs_err(device->fs_info,
"zoned: failed to read zone %llu on %s (devid %llu)",
- pos, rcu_str_deref(device->name),
+ pos, rcu_dereference(device->name),
device->devid);
return ret;
}
@@ -395,16 +395,16 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
/* We reject devices with a zone size larger than 8GB */
if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"zoned: %s: zone size %llu larger than supported maximum %llu",
- rcu_str_deref(device->name),
+ rcu_dereference(device->name),
zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
ret = -EINVAL;
goto out;
} else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"zoned: %s: zone size %llu smaller than supported minimum %u",
- rcu_str_deref(device->name),
+ rcu_dereference(device->name),
zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
ret = -EINVAL;
goto out;
@@ -418,9 +418,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
max_active_zones = bdev_max_active_zones(bdev);
if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"zoned: %s: max active zones %u is too small, need at least %u active zones",
- rcu_str_deref(device->name), max_active_zones,
+ rcu_dereference(device->name), max_active_zones,
BTRFS_MIN_ACTIVE_ZONES);
ret = -EINVAL;
goto out;
@@ -460,9 +460,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
zone_info->zone_cache = vcalloc(zone_info->nr_zones,
sizeof(struct blk_zone));
if (!zone_info->zone_cache) {
- btrfs_err_in_rcu(device->fs_info,
+ btrfs_err(device->fs_info,
"zoned: failed to allocate zone cache for %s",
- rcu_str_deref(device->name));
+ rcu_dereference(device->name));
ret = -ENOMEM;
goto out;
}
@@ -497,9 +497,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
}
if (nreported != zone_info->nr_zones) {
- btrfs_err_in_rcu(device->fs_info,
+ btrfs_err(device->fs_info,
"inconsistent number of zones on %s (%u/%u)",
- rcu_str_deref(device->name), nreported,
+ rcu_dereference(device->name), nreported,
zone_info->nr_zones);
ret = -EIO;
goto out;
@@ -507,9 +507,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
if (max_active_zones) {
if (nactive > max_active_zones) {
- btrfs_err_in_rcu(device->fs_info,
+ btrfs_err(device->fs_info,
"zoned: %u active zones on %s exceeds max_active_zones %u",
- nactive, rcu_str_deref(device->name),
+ nactive, rcu_dereference(device->name),
max_active_zones);
ret = -EIO;
goto out;
@@ -538,7 +538,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
goto out;
if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
- btrfs_err_in_rcu(device->fs_info,
+ btrfs_err(device->fs_info,
"zoned: failed to read super block log zone info at devid %llu zone %u",
device->devid, sb_zone);
ret = -EUCLEAN;
@@ -556,7 +556,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
ret = sb_write_pointer(device->bdev,
&zone_info->sb_zones[sb_pos], &sb_wp);
if (ret != -ENOENT && ret) {
- btrfs_err_in_rcu(device->fs_info,
+ btrfs_err(device->fs_info,
"zoned: super block log zone corrupted devid %llu zone %u",
device->devid, sb_zone);
ret = -EUCLEAN;
@@ -575,9 +575,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
emulated = "emulated ";
}
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"%s block device %s, %u %szones of %llu bytes",
- model, rcu_str_deref(device->name), zone_info->nr_zones,
+ model, rcu_dereference(device->name), zone_info->nr_zones,
emulated, zone_info->zone_size);
return 0;
@@ -1182,10 +1182,10 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
continue;
/* Free regions should be empty */
- btrfs_warn_in_rcu(
+ btrfs_warn(
device->fs_info,
"zoned: resetting device %s (devid %llu) zone %llu for allocation",
- rcu_str_deref(device->name), device->devid, pos >> shift);
+ rcu_dereference(device->name), device->devid, pos >> shift);
WARN_ON_ONCE(1);
ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
@@ -1345,9 +1345,9 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
}
if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"zoned: unexpected conventional zone %llu on device %s (devid %llu)",
- zone.start << SECTOR_SHIFT, rcu_str_deref(device->name),
+ zone.start << SECTOR_SHIFT, rcu_dereference(device->name),
device->devid);
up_read(&dev_replace->rwsem);
return -EIO;
@@ -1358,10 +1358,10 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
switch (zone.cond) {
case BLK_ZONE_COND_OFFLINE:
case BLK_ZONE_COND_READONLY:
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"zoned: offline/readonly zone %llu on device %s (devid %llu)",
(info->physical >> device->zone_info->zone_size_shift),
- rcu_str_deref(device->name), device->devid);
+ rcu_dereference(device->name), device->devid);
info->alloc_offset = WP_MISSING_DEV;
break;
case BLK_ZONE_COND_EMPTY:
@@ -1403,7 +1403,8 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
struct zone_info *zone_info,
- unsigned long *active)
+ unsigned long *active,
+ u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
@@ -1426,6 +1427,13 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
zone_info[1].physical);
return -EIO;
}
+
+ if (zone_info[0].alloc_offset == WP_CONVENTIONAL)
+ zone_info[0].alloc_offset = last_alloc;
+
+ if (zone_info[1].alloc_offset == WP_CONVENTIONAL)
+ zone_info[1].alloc_offset = last_alloc;
+
if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) {
btrfs_err(bg->fs_info,
"zoned: write pointer offset mismatch of zones in DUP profile");
@@ -1446,7 +1454,8 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
struct zone_info *zone_info,
- unsigned long *active)
+ unsigned long *active,
+ u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
int i;
@@ -1461,10 +1470,12 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);
for (i = 0; i < map->num_stripes; i++) {
- if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
- zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
+ if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ zone_info[i].alloc_offset = last_alloc;
+
if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
!btrfs_test_opt(fs_info, DEGRADED)) {
btrfs_err(fs_info,
@@ -1494,7 +1505,8 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
struct zone_info *zone_info,
- unsigned long *active)
+ unsigned long *active,
+ u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
@@ -1505,10 +1517,29 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
}
for (int i = 0; i < map->num_stripes; i++) {
- if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
- zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
+ if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
+ u64 stripe_nr, full_stripe_nr;
+ u64 stripe_offset;
+ int stripe_index;
+
+ stripe_nr = div64_u64(last_alloc, map->stripe_size);
+ stripe_offset = stripe_nr * map->stripe_size;
+ full_stripe_nr = div_u64(stripe_nr, map->num_stripes);
+ div_u64_rem(stripe_nr, map->num_stripes, &stripe_index);
+
+ zone_info[i].alloc_offset =
+ full_stripe_nr * map->stripe_size;
+
+ if (stripe_index > i)
+ zone_info[i].alloc_offset += map->stripe_size;
+ else if (stripe_index == i)
+ zone_info[i].alloc_offset +=
+ (last_alloc - stripe_offset);
+ }
+
if (test_bit(0, active) != test_bit(i, active)) {
if (!btrfs_zone_activate(bg))
return -EIO;
@@ -1526,7 +1557,8 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
struct zone_info *zone_info,
- unsigned long *active)
+ unsigned long *active,
+ u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
@@ -1537,8 +1569,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
}
for (int i = 0; i < map->num_stripes; i++) {
- if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
- zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
if (test_bit(0, active) != test_bit(i, active)) {
@@ -1549,6 +1580,29 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
}
+ if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
+ u64 stripe_nr, full_stripe_nr;
+ u64 stripe_offset;
+ int stripe_index;
+
+ stripe_nr = div64_u64(last_alloc, map->stripe_size);
+ stripe_offset = stripe_nr * map->stripe_size;
+ full_stripe_nr = div_u64(stripe_nr,
+ map->num_stripes / map->sub_stripes);
+ div_u64_rem(stripe_nr,
+ (map->num_stripes / map->sub_stripes),
+ &stripe_index);
+
+ zone_info[i].alloc_offset =
+ full_stripe_nr * map->stripe_size;
+
+ if (stripe_index > (i / map->sub_stripes))
+ zone_info[i].alloc_offset += map->stripe_size;
+ else if (stripe_index == (i / map->sub_stripes))
+ zone_info[i].alloc_offset +=
+ (last_alloc - stripe_offset);
+ }
+
if ((i % map->sub_stripes) == 0) {
bg->zone_capacity += zone_info[i].capacity;
bg->alloc_offset += zone_info[i].alloc_offset;
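
The new WP_CONVENTIONAL handling recovers per-device write pointers from last_alloc, the block group's last allocated byte, since conventional zones have no hardware write pointer. The RAID0 arithmetic as a standalone helper (hypothetical name, same math as the hunk above), with a worked example: for stripe_size = 64K, num_stripes = 2 and last_alloc = 160K, stripe_nr = 2, stripe_offset = 128K, full_stripe_nr = 1 and stripe_index = 0, so device 0 gets 64K + 32K = 96K, device 1 gets 64K, and 96K + 64K = 160K as expected.

#include <linux/math64.h>

static u64 raid0_dev_wp(u64 last_alloc, u64 stripe_size, u32 num_stripes, u32 i)
{
	u64 stripe_nr = div64_u64(last_alloc, stripe_size);
	u64 stripe_offset = stripe_nr * stripe_size;
	u64 off = div_u64(stripe_nr, num_stripes) * stripe_size;
	u32 stripe_index;

	div_u64_rem(stripe_nr, num_stripes, &stripe_index);
	if (stripe_index > i)
		off += stripe_size;			/* stripe on dev i fully written */
	else if (stripe_index == i)
		off += last_alloc - stripe_offset;	/* partially written stripe */
	return off;
}

The RAID10 variant divides by num_stripes / sub_stripes instead, since mirrored sub-stripes advance together.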
@@ -1637,18 +1691,22 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
ret = btrfs_load_block_group_single(cache, &zone_info[0], active);
break;
case BTRFS_BLOCK_GROUP_DUP:
- ret = btrfs_load_block_group_dup(cache, map, zone_info, active);
+ ret = btrfs_load_block_group_dup(cache, map, zone_info, active,
+ last_alloc);
break;
case BTRFS_BLOCK_GROUP_RAID1:
case BTRFS_BLOCK_GROUP_RAID1C3:
case BTRFS_BLOCK_GROUP_RAID1C4:
- ret = btrfs_load_block_group_raid1(cache, map, zone_info, active);
+ ret = btrfs_load_block_group_raid1(cache, map, zone_info,
+ active, last_alloc);
break;
case BTRFS_BLOCK_GROUP_RAID0:
- ret = btrfs_load_block_group_raid0(cache, map, zone_info, active);
+ ret = btrfs_load_block_group_raid0(cache, map, zone_info,
+ active, last_alloc);
break;
case BTRFS_BLOCK_GROUP_RAID10:
- ret = btrfs_load_block_group_raid10(cache, map, zone_info, active);
+ ret = btrfs_load_block_group_raid10(cache, map, zone_info,
+ active, last_alloc);
break;
case BTRFS_BLOCK_GROUP_RAID5:
case BTRFS_BLOCK_GROUP_RAID6:
@@ -2427,7 +2485,7 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
/* For the work */
btrfs_get_block_group(bg);
- atomic_inc(&eb->refs);
+ refcount_inc(&eb->refs);
bg->last_eb = eb;
INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
queue_work(system_unbound_wq, &bg->zone_finish_work);
@@ -2443,6 +2501,66 @@ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
spin_unlock(&fs_info->relocation_bg_lock);
}
+void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
+ struct btrfs_space_info *space_info = data_sinfo->sub_group[0];
+ struct btrfs_trans_handle *trans;
+ struct btrfs_block_group *bg;
+ struct list_head *bg_list;
+ u64 alloc_flags;
+ bool initial = false;
+ bool did_chunk_alloc = false;
+ int index;
+ int ret;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ if (fs_info->data_reloc_bg)
+ return;
+
+ if (sb_rdonly(fs_info->sb))
+ return;
+
+ ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
+ alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
+ index = btrfs_bg_flags_to_raid_index(alloc_flags);
+
+ bg_list = &data_sinfo->block_groups[index];
+again:
+ list_for_each_entry(bg, bg_list, list) {
+ if (bg->used > 0)
+ continue;
+
+ if (!initial) {
+ initial = true;
+ continue;
+ }
+
+ fs_info->data_reloc_bg = bg->start;
+ set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &bg->runtime_flags);
+ btrfs_zone_activate(bg);
+
+ return;
+ }
+
+ if (did_chunk_alloc)
+ return;
+
+ trans = btrfs_join_transaction(fs_info->tree_root);
+ if (IS_ERR(trans))
+ return;
+
+ ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
+ btrfs_end_transaction(trans);
+ if (ret == 1) {
+ did_chunk_alloc = true;
+ bg_list = &space_info->block_groups[index];
+ goto again;
+ }
+}
+
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
@@ -2465,8 +2583,8 @@ bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
+ u64 total = btrfs_super_total_bytes(fs_info->super_copy);
u64 used = 0;
- u64 total = 0;
u64 factor;
ASSERT(btrfs_is_zoned(fs_info));
@@ -2479,7 +2597,6 @@ bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
if (!device->bdev)
continue;
- total += device->disk_total_bytes;
used += device->bytes_used;
}
mutex_unlock(&fs_devices->device_list_mutex);
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 9672bf4c3335..6e11533b8e14 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -88,6 +88,7 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
struct extent_buffer *eb);
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
+void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info);
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info);
void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
@@ -241,6 +242,8 @@ static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
+static inline void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info) { }
+
static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
static inline bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 4a796a049b5a..ff0292615e1f 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -200,8 +200,7 @@ void zstd_init_workspace_manager(void)
ws = zstd_alloc_workspace(ZSTD_BTRFS_MAX_LEVEL);
if (IS_ERR(ws)) {
- pr_warn(
- "BTRFS: cannot preallocate zstd compression workspace\n");
+ btrfs_warn(NULL, "cannot preallocate zstd compression workspace");
} else {
set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map);
list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]);
diff --git a/fs/buffer.c b/fs/buffer.c
index 8cf4a1dc481e..ead4dc85debd 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1122,15 +1122,9 @@ __getblk_slow(struct block_device *bdev, sector_t block,
{
bool blocking = gfpflags_allow_blocking(gfp);
- /* Size must be multiple of hard sectorsize */
- if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
- (size < 512 || size > PAGE_SIZE))) {
- printk(KERN_ERR "getblk(): invalid block size %d requested\n",
- size);
- printk(KERN_ERR "logical block size: %d\n",
- bdev_logical_block_size(bdev));
-
- dump_stack();
+ if (WARN_ON_ONCE(!IS_ALIGNED(size, bdev_logical_block_size(bdev)))) {
+ printk(KERN_ERR "getblk(): block size %d not aligned to logical block size %d\n",
+ size, bdev_logical_block_size(bdev));
return NULL;
}
@@ -2271,9 +2265,8 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
}
EXPORT_SYMBOL(block_write_begin);
-int block_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+int block_write_end(loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio)
{
size_t start = pos - folio_pos(folio);
@@ -2304,15 +2297,15 @@ int block_write_end(struct file *file, struct address_space *mapping,
}
EXPORT_SYMBOL(block_write_end);
-int generic_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+int generic_write_end(const struct kiocb *iocb, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
loff_t old_size = inode->i_size;
bool i_size_changed = false;
- copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ copied = block_write_end(pos, len, copied, folio);
/*
* No need to use i_size_read() here; the i_size cannot change under us
@@ -2501,7 +2494,8 @@ out:
}
EXPORT_SYMBOL(generic_cont_expand_simple);
-static int cont_expand_zero(struct file *file, struct address_space *mapping,
+static int cont_expand_zero(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, loff_t *bytes)
{
struct inode *inode = mapping->host;
@@ -2525,12 +2519,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
}
len = PAGE_SIZE - zerofrom;
- err = aops->write_begin(file, mapping, curpos, len,
+ err = aops->write_begin(iocb, mapping, curpos, len,
&folio, &fsdata);
if (err)
goto out;
folio_zero_range(folio, offset_in_folio(folio, curpos), len);
- err = aops->write_end(file, mapping, curpos, len, len,
+ err = aops->write_end(iocb, mapping, curpos, len, len,
folio, fsdata);
if (err < 0)
goto out;
@@ -2558,12 +2552,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
}
len = offset - zerofrom;
- err = aops->write_begin(file, mapping, curpos, len,
+ err = aops->write_begin(iocb, mapping, curpos, len,
&folio, &fsdata);
if (err)
goto out;
folio_zero_range(folio, offset_in_folio(folio, curpos), len);
- err = aops->write_end(file, mapping, curpos, len, len,
+ err = aops->write_end(iocb, mapping, curpos, len, len,
folio, fsdata);
if (err < 0)
goto out;
@@ -2578,17 +2572,16 @@ out:
* For moronic filesystems that do not allow holes in files.
* We may have to extend the file.
*/
-int cont_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata,
- get_block_t *get_block, loff_t *bytes)
+int cont_write_begin(const struct kiocb *iocb, struct address_space *mapping,
+ loff_t pos, unsigned len, struct folio **foliop,
+ void **fsdata, get_block_t *get_block, loff_t *bytes)
{
struct inode *inode = mapping->host;
unsigned int blocksize = i_blocksize(inode);
unsigned int zerofrom;
int err;
- err = cont_expand_zero(file, mapping, pos, bytes);
+ err = cont_expand_zero(iocb, mapping, pos, bytes);
if (err)
return err;
@@ -2610,7 +2603,7 @@ EXPORT_SYMBOL(cont_write_begin);
* holes and correct delalloc and unwritten extent mapping on filesystems that
* support these features.
*
- * We are not allowed to take the i_mutex here so we have to play games to
+ * We are not allowed to take the i_rwsem here so we have to play games to
* protect against truncate races as the page could now be beyond EOF. Because
* truncate writes the inode size before removing pages, once we have the
* page lock we can determine safely if the page is beyond EOF. If it is not
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 92058ae43488..3e0576d9db1d 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -63,7 +63,7 @@ static void cachefiles_read_complete(struct kiocb *iocb, long ret)
ret = -ESTALE;
}
- ki->term_func(ki->term_func_priv, ret, ki->was_async);
+ ki->term_func(ki->term_func_priv, ret);
}
cachefiles_put_kiocb(ki);
@@ -188,7 +188,7 @@ in_progress:
presubmission_error:
if (term_func)
- term_func(term_func_priv, ret < 0 ? ret : skipped, false);
+ term_func(term_func_priv, ret < 0 ? ret : skipped);
return ret;
}
@@ -271,7 +271,7 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret)
atomic_long_sub(ki->b_writing, &object->volume->cache->b_writing);
set_bit(FSCACHE_COOKIE_HAVE_DATA, &object->cookie->flags);
if (ki->term_func)
- ki->term_func(ki->term_func_priv, ret, ki->was_async);
+ ki->term_func(ki->term_func_priv, ret);
cachefiles_put_kiocb(ki);
}
@@ -301,7 +301,7 @@ int __cachefiles_write(struct cachefiles_object *object,
ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL);
if (!ki) {
if (term_func)
- term_func(term_func_priv, -ENOMEM, false);
+ term_func(term_func_priv, -ENOMEM);
return -ENOMEM;
}
@@ -347,8 +347,6 @@ int __cachefiles_write(struct cachefiles_object *object,
default:
ki->was_async = false;
cachefiles_write_complete(&ki->iocb, ret);
- if (ret > 0)
- ret = 0;
break;
}
@@ -366,7 +364,7 @@ static int cachefiles_write(struct netfs_cache_resources *cres,
{
if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) {
if (term_func)
- term_func(term_func_priv, -ENOBUFS, false);
+ term_func(term_func_priv, -ENOBUFS);
trace_netfs_sreq(term_func_priv, netfs_sreq_trace_cache_nowrite);
return -ENOBUFS;
}
@@ -665,7 +663,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq)
pre = CACHEFILES_DIO_BLOCK_SIZE - off;
if (pre >= len) {
fscache_count_dio_misfit();
- netfs_write_subrequest_terminated(subreq, len, false);
+ netfs_write_subrequest_terminated(subreq, len);
return;
}
subreq->transferred += pre;
@@ -691,7 +689,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq)
len -= post;
if (len == 0) {
fscache_count_dio_misfit();
- netfs_write_subrequest_terminated(subreq, post, false);
+ netfs_write_subrequest_terminated(subreq, post);
return;
}
iov_iter_truncate(&subreq->io_iter, len);
@@ -703,7 +701,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq)
&start, &len, len, true);
cachefiles_end_secure(cache, saved_cred);
if (ret < 0) {
- netfs_write_subrequest_terminated(subreq, ret, false);
+ netfs_write_subrequest_terminated(subreq, ret);
return;
}
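Dropping the was_async argument shrinks the termination callback to two parameters; a handler conforming to the new contract might look like this sketch (the myfs_* names are hypothetical):

static void myfs_cache_write_terminated(void *priv, ssize_t error)
{
	struct myfs_io_request *req = priv;	/* caller-owned context, assumed */

	if (error < 0)
		pr_debug("cache write failed: %zd\n", error);
	myfs_io_request_put(req);		/* drop the ref taken at submit time */
}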
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index aecfc5c37b49..91dfd0231877 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -388,10 +388,10 @@ try_again:
} else {
struct renamedata rd = {
.old_mnt_idmap = &nop_mnt_idmap,
- .old_dir = d_inode(dir),
+ .old_parent = dir,
.old_dentry = rep,
.new_mnt_idmap = &nop_mnt_idmap,
- .new_dir = d_inode(cache->graveyard),
+ .new_parent = cache->graveyard,
.new_dentry = grave,
};
trace_cachefiles_rename(object, d_inode(rep)->i_ino, why);
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
index d9bc67176128..a7ed86fa98bb 100644
--- a/fs/cachefiles/ondemand.c
+++ b/fs/cachefiles/ondemand.c
@@ -83,10 +83,8 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb,
trace_cachefiles_ondemand_fd_write(object, file_inode(file), pos, len);
ret = __cachefiles_write(object, file, pos, iter, NULL, NULL);
- if (!ret) {
- ret = len;
+ if (ret > 0)
kiocb->ki_pos += ret;
- }
out:
fput(file);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 29be367905a1..8b202d789e93 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -238,6 +238,7 @@ static void finish_netfs_read(struct ceph_osd_request *req)
if (sparse && err > 0)
err = ceph_sparse_ext_map_end(op);
if (err < subreq->len &&
+ subreq->rreq->origin != NETFS_UNBUFFERED_READ &&
subreq->rreq->origin != NETFS_DIO_READ)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
if (IS_ENCRYPTED(inode) && err > 0) {
@@ -281,7 +282,8 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
size_t len;
int mode;
- if (rreq->origin != NETFS_DIO_READ)
+ if (rreq->origin != NETFS_UNBUFFERED_READ &&
+ rreq->origin != NETFS_DIO_READ)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
@@ -407,6 +409,15 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
struct page **pages;
size_t page_off;
+ /*
+ * FIXME: io_iter.count needs to be corrected to aligned
+ * length. Otherwise, iov_iter_get_pages_alloc2() operates
+ * with the initial unaligned length value. As a result,
+	 * ceph_msg_data_cursor_init() triggers a BUG_ON() when
+	 * msg->sparse_read_total > msg->data_length.
+ */
+ subreq->io_iter.count = len;
+
err = iov_iter_get_pages_alloc2(&subreq->io_iter, &pages, len, &page_off);
if (err < 0) {
doutc(cl, "%llx.%llx failed to allocate pages, %d\n",
@@ -539,7 +550,7 @@ static void ceph_set_page_fscache(struct page *page)
folio_start_private_2(page_folio(page)); /* [DEPRECATED] */
}
-static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async)
+static void ceph_fscache_write_terminated(void *priv, ssize_t error)
{
struct inode *inode = priv;
@@ -1853,10 +1864,12 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned
* We are only allowed to write into/dirty the page if the page is
* clean, or already dirty within the same snap context.
*/
-static int ceph_write_begin(struct file *file, struct address_space *mapping,
+static int ceph_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
+ struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
int r;
@@ -1874,10 +1887,12 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
* we don't do anything in here that simple_write_end doesn't do
* except adjust dirty page accounting
*/
-static int ceph_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
+static int ceph_write_end(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos,
+ unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
{
+ struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ceph_client *cl = ceph_inode_to_client(inode);
bool check_cap = false;
@@ -2319,13 +2334,13 @@ static const struct vm_operations_struct ceph_vmops = {
.page_mkwrite = ceph_page_mkwrite,
};
-int ceph_mmap(struct file *file, struct vm_area_struct *vma)
+int ceph_mmap_prepare(struct vm_area_desc *desc)
{
- struct address_space *mapping = file->f_mapping;
+ struct address_space *mapping = desc->file->f_mapping;
if (!mapping->a_ops->read_folio)
return -ENOEXEC;
- vma->vm_ops = &ceph_vmops;
+ desc->vm_ops = &ceph_vmops;
return 0;
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index a8d8b56cf9d2..b1a8ff612c41 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4957,24 +4957,20 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
cl = ceph_inode_to_client(dir);
spin_lock(&dentry->d_lock);
if (ret && di->lease_session && di->lease_session->s_mds == mds) {
+ int len = dentry->d_name.len;
doutc(cl, "%p mds%d seq %d\n", dentry, mds,
(int)di->lease_seq);
rel->dname_seq = cpu_to_le32(di->lease_seq);
__ceph_mdsc_drop_dentry_lease(dentry);
+ memcpy(*p, dentry->d_name.name, len);
spin_unlock(&dentry->d_lock);
if (IS_ENCRYPTED(dir) && fscrypt_has_encryption_key(dir)) {
- int ret2 = ceph_encode_encrypted_fname(dir, dentry, *p);
-
- if (ret2 < 0)
- return ret2;
-
- rel->dname_len = cpu_to_le32(ret2);
- *p += ret2;
- } else {
- rel->dname_len = cpu_to_le32(dentry->d_name.len);
- memcpy(*p, dentry->d_name.name, dentry->d_name.len);
- *p += dentry->d_name.len;
+ len = ceph_encode_encrypted_dname(dir, *p, len);
+ if (len < 0)
+ return len;
}
+ rel->dname_len = cpu_to_le32(len);
+ *p += len;
} else {
spin_unlock(&dentry->d_lock);
}
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index 3b3c4d8d401e..cab722619207 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -215,35 +215,31 @@ static struct inode *parse_longname(const struct inode *parent,
struct ceph_client *cl = ceph_inode_to_client(parent);
struct inode *dir = NULL;
struct ceph_vino vino = { .snap = CEPH_NOSNAP };
- char *inode_number;
- char *name_end;
- int orig_len = *name_len;
+ char *name_end, *inode_number;
int ret = -EIO;
-
+ /* NUL-terminate */
+ char *str __free(kfree) = kmemdup_nul(name, *name_len, GFP_KERNEL);
+ if (!str)
+ return ERR_PTR(-ENOMEM);
/* Skip initial '_' */
- name++;
- name_end = strrchr(name, '_');
+ str++;
+ name_end = strrchr(str, '_');
if (!name_end) {
- doutc(cl, "failed to parse long snapshot name: %s\n", name);
+ doutc(cl, "failed to parse long snapshot name: %s\n", str);
return ERR_PTR(-EIO);
}
- *name_len = (name_end - name);
+ *name_len = (name_end - str);
if (*name_len <= 0) {
pr_err_client(cl, "failed to parse long snapshot name\n");
return ERR_PTR(-EIO);
}
/* Get the inode number */
- inode_number = kmemdup_nul(name_end + 1,
- orig_len - *name_len - 2,
- GFP_KERNEL);
- if (!inode_number)
- return ERR_PTR(-ENOMEM);
+ inode_number = name_end + 1;
ret = kstrtou64(inode_number, 10, &vino.ino);
if (ret) {
- doutc(cl, "failed to parse inode number: %s\n", name);
- dir = ERR_PTR(ret);
- goto out;
+ doutc(cl, "failed to parse inode number: %s\n", str);
+ return ERR_PTR(ret);
}
/* And finally the inode */
@@ -254,42 +250,29 @@ static struct inode *parse_longname(const struct inode *parent,
if (IS_ERR(dir))
doutc(cl, "can't find inode %s (%s)\n", inode_number, name);
}
-
-out:
- kfree(inode_number);
return dir;
}
-int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name,
- char *buf)
+int ceph_encode_encrypted_dname(struct inode *parent, char *buf, int elen)
{
struct ceph_client *cl = ceph_inode_to_client(parent);
struct inode *dir = parent;
- struct qstr iname;
+ char *p = buf;
u32 len;
- int name_len;
- int elen;
+ int name_len = elen;
int ret;
u8 *cryptbuf = NULL;
- iname.name = d_name->name;
- name_len = d_name->len;
-
/* Handle the special case of snapshot names that start with '_' */
- if ((ceph_snap(dir) == CEPH_SNAPDIR) && (name_len > 0) &&
- (iname.name[0] == '_')) {
- dir = parse_longname(parent, iname.name, &name_len);
+ if (ceph_snap(dir) == CEPH_SNAPDIR && *p == '_') {
+ dir = parse_longname(parent, p, &name_len);
if (IS_ERR(dir))
return PTR_ERR(dir);
- iname.name++; /* skip initial '_' */
+ p++; /* skip initial '_' */
}
- iname.len = name_len;
- if (!fscrypt_has_encryption_key(dir)) {
- memcpy(buf, d_name->name, d_name->len);
- elen = d_name->len;
+ if (!fscrypt_has_encryption_key(dir))
goto out;
- }
/*
* Convert cleartext d_name to ciphertext. If result is longer than
@@ -297,7 +280,7 @@ int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name,
*
* See: fscrypt_setup_filename
*/
- if (!fscrypt_fname_encrypted_size(dir, iname.len, NAME_MAX, &len)) {
+ if (!fscrypt_fname_encrypted_size(dir, name_len, NAME_MAX, &len)) {
elen = -ENAMETOOLONG;
goto out;
}
@@ -310,7 +293,9 @@ int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name,
goto out;
}
- ret = fscrypt_fname_encrypt(dir, &iname, cryptbuf, len);
+ ret = fscrypt_fname_encrypt(dir,
+ &(struct qstr)QSTR_INIT(p, name_len),
+ cryptbuf, len);
if (ret) {
elen = ret;
goto out;
@@ -331,18 +316,13 @@ int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name,
}
/* base64 encode the encrypted name */
- elen = ceph_base64_encode(cryptbuf, len, buf);
- doutc(cl, "base64-encoded ciphertext name = %.*s\n", elen, buf);
+ elen = ceph_base64_encode(cryptbuf, len, p);
+ doutc(cl, "base64-encoded ciphertext name = %.*s\n", elen, p);
/* To understand the 240 limit, see CEPH_NOHASH_NAME_MAX comments */
WARN_ON(elen > 240);
- if ((elen > 0) && (dir != parent)) {
- char tmp_buf[NAME_MAX];
-
- elen = snprintf(tmp_buf, sizeof(tmp_buf), "_%.*s_%ld",
- elen, buf, dir->i_ino);
- memcpy(buf, tmp_buf, elen);
- }
+ if (dir != parent) // leading _ is already there; append _<inum>
+ elen += 1 + sprintf(p + elen, "_%ld", dir->i_ino);
out:
kfree(cryptbuf);
@@ -355,14 +335,6 @@ out:
return elen;
}
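For reference, a sketch of the long snapshot name assembled above, using assumed example values: the leading '_' survives from the input, and "_<inum>" is appended only when the name parsed to a different directory inode (dir != parent):

	_<base64-ciphertext>_<dir->i_ino>
	e.g. _lJuL2bWl8TT0AvnIEHwn5A_1099511627776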
-int ceph_encode_encrypted_fname(struct inode *parent, struct dentry *dentry,
- char *buf)
-{
- WARN_ON_ONCE(!fscrypt_has_encryption_key(parent));
-
- return ceph_encode_encrypted_dname(parent, &dentry->d_name, buf);
-}
-
/**
* ceph_fname_to_usr - convert a filename for userland presentation
* @fname: ceph_fname to be converted
@@ -516,15 +488,13 @@ int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode,
int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode,
struct page *page, unsigned int len,
- unsigned int offs, u64 lblk_num,
- gfp_t gfp_flags)
+ unsigned int offs, u64 lblk_num)
{
struct ceph_client *cl = ceph_inode_to_client(inode);
doutc(cl, "%p %llx.%llx len %u offs %u blk %llu\n", inode,
ceph_vinop(inode), len, offs, lblk_num);
- return fscrypt_encrypt_block_inplace(inode, page, len, offs, lblk_num,
- gfp_flags);
+ return fscrypt_encrypt_block_inplace(inode, page, len, offs, lblk_num);
}
/**
@@ -642,9 +612,8 @@ int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
* @page: pointer to page array
* @off: offset into the file that the data starts
* @len: max length to encrypt
- * @gfp: gfp flags to use for allocation
*
- * Decrypt an array of cleartext pages and return the amount of
+ * Encrypt an array of cleartext pages and return the amount of
* data encrypted. Any data in the page prior to the start of the
* first complete block in the read is ignored. Any incomplete
* crypto blocks at the end of the array are ignored.
@@ -652,7 +621,7 @@ int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
* Returns the length of the encrypted data or a negative errno.
*/
int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off,
- int len, gfp_t gfp)
+ int len)
{
int i, num_blocks;
u64 baseblk = off >> CEPH_FSCRYPT_BLOCK_SHIFT;
@@ -673,7 +642,7 @@ int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off,
fret = ceph_fscrypt_encrypt_block_inplace(inode, page[pgidx],
CEPH_FSCRYPT_BLOCK_SIZE, pgoffs,
- baseblk + i, gfp);
+ baseblk + i);
if (fret < 0) {
if (ret == 0)
ret = fret;
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
index d0768239a1c9..23612b2e9837 100644
--- a/fs/ceph/crypto.h
+++ b/fs/ceph/crypto.h
@@ -102,10 +102,7 @@ int ceph_fscrypt_prepare_context(struct inode *dir, struct inode *inode,
struct ceph_acl_sec_ctx *as);
void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req,
struct ceph_acl_sec_ctx *as);
-int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name,
- char *buf);
-int ceph_encode_encrypted_fname(struct inode *parent, struct dentry *dentry,
- char *buf);
+int ceph_encode_encrypted_dname(struct inode *parent, char *buf, int len);
static inline int ceph_fname_alloc_buffer(struct inode *parent,
struct fscrypt_str *fname)
@@ -155,15 +152,14 @@ int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode,
unsigned int offs, u64 lblk_num);
int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode,
struct page *page, unsigned int len,
- unsigned int offs, u64 lblk_num,
- gfp_t gfp_flags);
+ unsigned int offs, u64 lblk_num);
int ceph_fscrypt_decrypt_pages(struct inode *inode, struct page **page,
u64 off, int len);
int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
u64 off, struct ceph_sparse_extent *map,
u32 ext_cnt);
int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off,
- int len, gfp_t gfp);
+ int len);
static inline struct page *ceph_fscrypt_pagecache_page(struct page *page)
{
@@ -194,17 +190,10 @@ static inline void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req,
{
}
-static inline int ceph_encode_encrypted_dname(struct inode *parent,
- struct qstr *d_name, char *buf)
-{
- memcpy(buf, d_name->name, d_name->len);
- return d_name->len;
-}
-
-static inline int ceph_encode_encrypted_fname(struct inode *parent,
- struct dentry *dentry, char *buf)
+static inline int ceph_encode_encrypted_dname(struct inode *parent, char *buf,
+ int len)
{
- return -EOPNOTSUPP;
+ return len;
}
static inline int ceph_fname_alloc_buffer(struct inode *parent,
@@ -246,8 +235,7 @@ static inline int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode,
static inline int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode,
struct page *page, unsigned int len,
- unsigned int offs, u64 lblk_num,
- gfp_t gfp_flags)
+ unsigned int offs, u64 lblk_num)
{
return 0;
}
@@ -269,7 +257,7 @@ static inline int ceph_fscrypt_decrypt_extents(struct inode *inode,
static inline int ceph_fscrypt_encrypt_pages(struct inode *inode,
struct page **page, u64 off,
- int len, gfp_t gfp)
+ int len)
{
return 0;
}
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index a321aa6d0ed2..8478e7e75df6 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -423,17 +423,16 @@ more:
req->r_inode_drop = CEPH_CAP_FILE_EXCL;
}
if (dfi->last_name) {
- struct qstr d_name = { .name = dfi->last_name,
- .len = strlen(dfi->last_name) };
+ int len = strlen(dfi->last_name);
req->r_path2 = kzalloc(NAME_MAX + 1, GFP_KERNEL);
if (!req->r_path2) {
ceph_mdsc_put_request(req);
return -ENOMEM;
}
+ memcpy(req->r_path2, dfi->last_name, len);
- err = ceph_encode_encrypted_dname(inode, &d_name,
- req->r_path2);
+ err = ceph_encode_encrypted_dname(inode, req->r_path2, len);
if (err < 0) {
ceph_mdsc_put_request(req);
return err;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 150076ced937..b2f2af104679 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -33,12 +33,19 @@ struct ceph_nfs_snapfh {
u32 hash;
} __attribute__ ((packed));
+#define BYTES_PER_U32 (sizeof(u32))
+#define CEPH_FH_BASIC_SIZE \
+ (sizeof(struct ceph_nfs_fh) / BYTES_PER_U32)
+#define CEPH_FH_WITH_PARENT_SIZE \
+ (sizeof(struct ceph_nfs_confh) / BYTES_PER_U32)
+#define CEPH_FH_SNAPPED_INODE_SIZE \
+ (sizeof(struct ceph_nfs_snapfh) / BYTES_PER_U32)
+
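These helpers express file handle sizes in 32-bit words, the unit the exportfs encode_fh interface counts in; for instance, assuming struct ceph_nfs_fh holds a single __u64 ino, CEPH_FH_BASIC_SIZE works out to sizeof(__u64) / sizeof(u32) == 2 words.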
static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len,
struct inode *parent_inode)
{
struct ceph_client *cl = ceph_inode_to_client(inode);
- static const int snap_handle_length =
- sizeof(struct ceph_nfs_snapfh) >> 2;
+ static const int snap_handle_length = CEPH_FH_SNAPPED_INODE_SIZE;
struct ceph_nfs_snapfh *sfh = (void *)rawfh;
u64 snapid = ceph_snap(inode);
int ret;
@@ -88,10 +95,8 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
struct inode *parent_inode)
{
struct ceph_client *cl = ceph_inode_to_client(inode);
- static const int handle_length =
- sizeof(struct ceph_nfs_fh) >> 2;
- static const int connected_handle_length =
- sizeof(struct ceph_nfs_confh) >> 2;
+ static const int handle_length = CEPH_FH_BASIC_SIZE;
+ static const int connected_handle_length = CEPH_FH_WITH_PARENT_SIZE;
int type;
if (ceph_snap(inode) != CEPH_NOSNAP)
@@ -308,7 +313,7 @@ static struct dentry *ceph_fh_to_dentry(struct super_block *sb,
if (fh_type != FILEID_INO32_GEN &&
fh_type != FILEID_INO32_GEN_PARENT)
return NULL;
- if (fh_len < sizeof(*fh) / 4)
+ if (fh_len < sizeof(*fh) / BYTES_PER_U32)
return NULL;
doutc(fsc->client, "%llx\n", fh->ino);
@@ -427,7 +432,7 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
if (fh_type != FILEID_INO32_GEN_PARENT)
return NULL;
- if (fh_len < sizeof(*cfh) / 4)
+ if (fh_len < sizeof(*cfh) / BYTES_PER_U32)
return NULL;
doutc(fsc->client, "%llx\n", cfh->parent_ino);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 851d70200c6b..c02f100f8552 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1992,8 +1992,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
if (IS_ENCRYPTED(inode)) {
ret = ceph_fscrypt_encrypt_pages(inode, pages,
- write_pos, write_len,
- GFP_KERNEL);
+ write_pos, write_len);
if (ret < 0) {
doutc(cl, "encryption failed with %d\n", ret);
ceph_release_page_vector(pages, num_pages);
@@ -2530,19 +2529,19 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
return generic_file_llseek(file, offset, whence);
}
-static inline void ceph_zero_partial_page(
- struct inode *inode, loff_t offset, unsigned size)
+static inline void ceph_zero_partial_page(struct inode *inode,
+ loff_t offset, size_t size)
{
- struct page *page;
- pgoff_t index = offset >> PAGE_SHIFT;
+ struct folio *folio;
- page = find_lock_page(inode->i_mapping, index);
- if (page) {
- wait_on_page_writeback(page);
- zero_user(page, offset & (PAGE_SIZE - 1), size);
- unlock_page(page);
- put_page(page);
- }
+ folio = filemap_lock_folio(inode->i_mapping, offset >> PAGE_SHIFT);
+ if (IS_ERR(folio))
+ return;
+
+ folio_wait_writeback(folio);
+ folio_zero_range(folio, offset_in_folio(folio, offset), size);
+ folio_unlock(folio);
+ folio_put(folio);
}
static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
@@ -2616,7 +2615,7 @@ static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
s32 stripe_unit = ci->i_layout.stripe_unit;
s32 stripe_count = ci->i_layout.stripe_count;
s32 object_size = ci->i_layout.object_size;
- u64 object_set_size = object_size * stripe_count;
+ u64 object_set_size = (u64) object_size * stripe_count;
u64 nearly, t;
/* round offset up to next period boundary */
@@ -3171,7 +3170,7 @@ const struct file_operations ceph_file_fops = {
.llseek = ceph_llseek,
.read_iter = ceph_read_iter,
.write_iter = ceph_write_iter,
- .mmap = ceph_mmap,
+ .mmap_prepare = ceph_mmap_prepare,
.fsync = ceph_fsync,
.lock = ceph_lock,
.setlease = simple_nosetlease,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 06cd2963e41e..fc543075b827 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2436,8 +2436,7 @@ static int fill_fscrypt_truncate(struct inode *inode,
/* encrypt the last block */
ret = ceph_fscrypt_encrypt_block_inplace(inode, page,
CEPH_FSCRYPT_BLOCK_SIZE,
- 0, block,
- GFP_KERNEL);
+ 0, block);
if (ret)
goto out;
}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 230e0c3f341f..0f497c39ff82 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2766,8 +2766,8 @@ retry:
}
if (fscrypt_has_encryption_key(d_inode(parent))) {
- len = ceph_encode_encrypted_fname(d_inode(parent),
- cur, buf);
+ len = ceph_encode_encrypted_dname(d_inode(parent),
+ buf, len);
if (len < 0) {
dput(parent);
dput(cur);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index f3951253e393..c3eb651862c5 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1033,8 +1033,7 @@ void ceph_umount_begin(struct super_block *sb)
struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
doutc(fsc->client, "starting forced umount\n");
- if (!fsc)
- return;
+
fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
__ceph_umount_begin(fsc);
}
@@ -1220,13 +1219,14 @@ static int ceph_set_super(struct super_block *s, struct fs_context *fc)
fsc->max_file_size = 1ULL << 40; /* temp value until we get mdsmap */
s->s_op = &ceph_super_ops;
- s->s_d_op = &ceph_dentry_ops;
+ set_default_d_op(s, &ceph_dentry_ops);
s->s_export_op = &ceph_export_ops;
s->s_time_gran = 1;
s->s_time_min = 0;
s->s_time_max = U32_MAX;
s->s_flags |= SB_NODIRATIME | SB_NOATIME;
+ s->s_magic = CEPH_SUPER_MAGIC;
ceph_fscrypt_set_ops(s);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index bb0db0cc8003..cf176aab0f82 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1286,7 +1286,7 @@ extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
/* addr.c */
extern const struct address_space_operations ceph_aops;
extern const struct netfs_request_ops ceph_netfs_ops;
-extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
+int ceph_mmap_prepare(struct vm_area_desc *desc);
extern int ceph_uninline_data(struct file *file);
extern int ceph_pool_perm_check(struct inode *inode, int need);
extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index ab69d8f0cec2..ca9990017265 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -429,17 +429,9 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
cfi = coda_ftoc(coda_file);
host_file = cfi->cfi_container;
- if (host_file->f_op->iterate_shared) {
- struct inode *host_inode = file_inode(host_file);
- ret = -ENOENT;
- if (!IS_DEADDIR(host_inode)) {
- inode_lock_shared(host_inode);
- ret = host_file->f_op->iterate_shared(host_file, ctx);
- file_accessed(host_file);
- inode_unlock_shared(host_inode);
- }
+ ret = iterate_dir(host_file, ctx);
+ if (ret != -ENOTDIR)
return ret;
- }
/* Venus: we must read Venus dirents from a file */
return coda_venus_readdir(coda_file, ctx);
}
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 148856a582a9..a390b5d21196 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -160,7 +160,7 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
size_t count;
int ret;
- if (!host_file->f_op->mmap)
+ if (!can_mmap_file(host_file))
return -ENODEV;
if (WARN_ON(coda_file != vma->vm_file))
@@ -199,10 +199,10 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
spin_unlock(&cii->c_lock);
vma->vm_file = get_file(host_file);
- ret = call_mmap(vma->vm_file, vma);
+ ret = vfs_mmap(vma->vm_file, vma);
if (ret) {
- /* if call_mmap fails, our caller will put host_file so we
+ /* if vfs_mmap fails, our caller will put host_file so we
* should drop the reference to the coda_file that we got.
*/
fput(coda_file);
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 6896fce122e1..08450d006016 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -230,7 +230,7 @@ static int coda_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_blocksize_bits = 12;
sb->s_magic = CODA_SUPER_MAGIC;
sb->s_op = &coda_super_operations;
- sb->s_d_op = &coda_dentry_operations;
+ set_default_d_op(sb, &coda_dentry_operations);
sb->s_time_gran = 1;
sb->s_time_min = S64_MIN;
sb->s_time_max = S64_MAX;
diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig
index 272b64456999..1fcd761fe7be 100644
--- a/fs/configfs/Kconfig
+++ b/fs/configfs/Kconfig
@@ -1,7 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
config CONFIGFS_FS
tristate "Userspace-driven configuration filesystem"
- select SYSFS
help
configfs is a RAM-based filesystem that provides the converse
of sysfs's functionality. Where sysfs is a filesystem-based
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index ebf32822e29b..f327fbb9a0ca 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -67,7 +67,6 @@ static void configfs_d_iput(struct dentry * dentry,
const struct dentry_operations configfs_dentry_ops = {
.d_iput = configfs_d_iput,
- .d_delete = always_delete_dentry,
};
#ifdef CONFIG_LOCKDEP
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index c2d820063ec4..740f18b60c9d 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -92,7 +92,8 @@ static int configfs_fill_super(struct super_block *sb, struct fs_context *fc)
configfs_root_group.cg_item.ci_dentry = root;
root->d_fsdata = &configfs_root;
sb->s_root = root;
- sb->s_d_op = &configfs_dentry_ops; /* the rest get that */
+ set_default_d_op(sb, &configfs_dentry_ops); /* the rest get that */
+ sb->s_d_flags |= DCACHE_DONTCACHE;
return 0;
}
diff --git a/fs/coredump.c b/fs/coredump.c
index f217ebf2b3b6..fedbead956ed 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -51,6 +51,7 @@
#include <net/sock.h>
#include <uapi/linux/pidfd.h>
#include <uapi/linux/un.h>
+#include <uapi/linux/coredump.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -81,17 +82,22 @@ static unsigned int core_sort_vma;
static char core_pattern[CORENAME_MAX_SIZE] = "core";
static int core_name_size = CORENAME_MAX_SIZE;
unsigned int core_file_note_size_limit = CORE_FILE_NOTE_SIZE_DEFAULT;
+static atomic_t core_pipe_count = ATOMIC_INIT(0);
enum coredump_type_t {
- COREDUMP_FILE = 1,
- COREDUMP_PIPE = 2,
- COREDUMP_SOCK = 3,
+ COREDUMP_FILE = 1,
+ COREDUMP_PIPE = 2,
+ COREDUMP_SOCK = 3,
+ COREDUMP_SOCK_REQ = 4,
};
struct core_name {
char *corename;
int used, size;
+ unsigned int core_pipe_limit;
+ bool core_dumped;
enum coredump_type_t core_type;
+ u64 mask;
};
static int expand_corename(struct core_name *cn, int size)
@@ -222,11 +228,12 @@ put_exe_file:
return ret;
}
-/* format_corename will inspect the pattern parameter, and output a
- * name into corename, which must have space for at least
- * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
+/*
+ * coredump_parse will inspect the pattern parameter, and output a name
+ * into corename, which must have space for at least CORENAME_MAX_SIZE
+ * bytes plus one byte for the zero terminator.
*/
-static int format_corename(struct core_name *cn, struct coredump_params *cprm,
+static bool coredump_parse(struct core_name *cn, struct coredump_params *cprm,
size_t **argv, int *argc)
{
const struct cred *cred = current_cred();
@@ -235,8 +242,13 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
int pid_in_pattern = 0;
int err = 0;
+ cn->mask = COREDUMP_KERNEL;
+ if (core_pipe_limit)
+ cn->mask |= COREDUMP_WAIT;
cn->used = 0;
cn->corename = NULL;
+ cn->core_pipe_limit = 0;
+ cn->core_dumped = false;
if (*pat_ptr == '|')
cn->core_type = COREDUMP_PIPE;
else if (*pat_ptr == '@')
@@ -244,7 +256,7 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
else
cn->core_type = COREDUMP_FILE;
if (expand_corename(cn, core_name_size))
- return -ENOMEM;
+ return false;
cn->corename[0] = '\0';
switch (cn->core_type) {
@@ -252,26 +264,33 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
int argvs = sizeof(core_pattern) / 2;
(*argv) = kmalloc_array(argvs, sizeof(**argv), GFP_KERNEL);
if (!(*argv))
- return -ENOMEM;
+ return false;
(*argv)[(*argc)++] = 0;
++pat_ptr;
if (!(*pat_ptr))
- return -ENOMEM;
+ return false;
break;
}
case COREDUMP_SOCK: {
/* skip the @ */
pat_ptr++;
if (!(*pat_ptr))
- return -ENOMEM;
+ return false;
+ if (*pat_ptr == '@') {
+ pat_ptr++;
+ if (!(*pat_ptr))
+ return false;
+
+ cn->core_type = COREDUMP_SOCK_REQ;
+ }
err = cn_printf(cn, "%s", pat_ptr);
if (err)
- return err;
+ return false;
/* Require absolute paths. */
if (cn->corename[0] != '/')
- return -EINVAL;
+ return false;
/*
* Ensure we can uses spaces to indicate additional
@@ -279,7 +298,18 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
*/
if (strchr(cn->corename, ' ')) {
coredump_report_failure("Coredump socket may not %s contain spaces", cn->corename);
- return -EINVAL;
+ return false;
+ }
+
+ /* Must not contain ".." in the path. */
+ if (name_contains_dotdot(cn->corename)) {
+			coredump_report_failure("Coredump socket may not %s contain '..'", cn->corename);
+ return false;
+ }
+
+ if (strlen(cn->corename) >= UNIX_PATH_MAX) {
+ coredump_report_failure("Coredump socket path %s too long", cn->corename);
+ return false;
}
/*
@@ -289,13 +319,13 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
* via /proc/<pid>, using the SO_PEERPIDFD to guard
* against pid recycling when opening /proc/<pid>.
*/
- return 0;
+ return true;
}
case COREDUMP_FILE:
break;
default:
WARN_ON_ONCE(true);
- return -EINVAL;
+ return false;
}
/* Repeat as long as we have more pattern to process and more output
@@ -433,7 +463,7 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
}
if (err)
- return err;
+ return false;
}
out:
@@ -443,9 +473,9 @@ out:
* and core_uses_pid is set, then .%pid will be appended to
* the filename. Do not do this for piped commands. */
if (cn->core_type == COREDUMP_FILE && !pid_in_pattern && core_uses_pid)
- return cn_printf(cn, ".%d", task_tgid_vnr(current));
+ return cn_printf(cn, ".%d", task_tgid_vnr(current)) == 0;
- return 0;
+ return true;
}
static int zap_process(struct signal_struct *signal, int exit_code)
@@ -632,21 +662,440 @@ static int umh_coredump_setup(struct subprocess_info *info, struct cred *new)
return 0;
}
-void do_coredump(const kernel_siginfo_t *siginfo)
+#ifdef CONFIG_UNIX
+static bool coredump_sock_connect(struct core_name *cn, struct coredump_params *cprm)
+{
+ struct file *file __free(fput) = NULL;
+ struct sockaddr_un addr = {
+ .sun_family = AF_UNIX,
+ };
+ ssize_t addr_len;
+ int retval;
+ struct socket *socket;
+
+ addr_len = strscpy(addr.sun_path, cn->corename);
+ if (addr_len < 0)
+ return false;
+ addr_len += offsetof(struct sockaddr_un, sun_path) + 1;
+
+ /*
+	 * It is possible that the userspace process that is supposed
+	 * to handle the coredump and is listening on the AF_UNIX socket
+	 * itself coredumps. Userspace should just mark itself non-dumpable.
+ */
+
+ retval = sock_create_kern(&init_net, AF_UNIX, SOCK_STREAM, 0, &socket);
+ if (retval < 0)
+ return false;
+
+ file = sock_alloc_file(socket, 0, NULL);
+ if (IS_ERR(file))
+ return false;
+
+ /*
+ * Set the thread-group leader pid which is used for the peer
+ * credentials during connect() below. Then immediately register
+ * it in pidfs...
+ */
+ cprm->pid = task_tgid(current);
+ retval = pidfs_register_pid(cprm->pid);
+ if (retval)
+ return false;
+
+ /*
+ * ... and set the coredump information so userspace has it
+ * available after connect()...
+ */
+ pidfs_coredump(cprm);
+
+ retval = kernel_connect(socket, (struct sockaddr *)(&addr), addr_len,
+ O_NONBLOCK | SOCK_COREDUMP);
+
+ if (retval) {
+ if (retval == -EAGAIN)
+ coredump_report_failure("Coredump socket %s receive queue full", addr.sun_path);
+ else
+ coredump_report_failure("Coredump socket connection %s failed %d", addr.sun_path, retval);
+ return false;
+ }
+
+ /* ... and validate that @sk_peer_pid matches @cprm.pid. */
+ if (WARN_ON_ONCE(unix_peer(socket->sk)->sk_peer_pid != cprm->pid))
+ return false;
+
+ cprm->limit = RLIM_INFINITY;
+ cprm->file = no_free_ptr(file);
+
+ return true;
+}
+
+static inline bool coredump_sock_recv(struct file *file, struct coredump_ack *ack, size_t size, int flags)
+{
+ struct msghdr msg = {};
+ struct kvec iov = { .iov_base = ack, .iov_len = size };
+ ssize_t ret;
+
+ memset(ack, 0, size);
+ ret = kernel_recvmsg(sock_from_file(file), &msg, &iov, 1, size, flags);
+ return ret == size;
+}
+
+static inline bool coredump_sock_send(struct file *file, struct coredump_req *req)
+{
+ struct msghdr msg = { .msg_flags = MSG_NOSIGNAL };
+ struct kvec iov = { .iov_base = req, .iov_len = sizeof(*req) };
+ ssize_t ret;
+
+ ret = kernel_sendmsg(sock_from_file(file), &msg, &iov, 1, sizeof(*req));
+ return ret == sizeof(*req);
+}
+
+static_assert(sizeof(enum coredump_mark) == sizeof(__u32));
+
+static inline bool coredump_sock_mark(struct file *file, enum coredump_mark mark)
+{
+ struct msghdr msg = { .msg_flags = MSG_NOSIGNAL };
+ struct kvec iov = { .iov_base = &mark, .iov_len = sizeof(mark) };
+ ssize_t ret;
+
+ ret = kernel_sendmsg(sock_from_file(file), &msg, &iov, 1, sizeof(mark));
+ return ret == sizeof(mark);
+}
+
+static inline void coredump_sock_wait(struct file *file)
+{
+ ssize_t n;
+
+ /*
+ * We use a simple read to wait for the coredump processing to
+ * finish. Either the socket is closed or we get sent unexpected
+ * data. In both cases, we're done.
+ */
+ n = __kernel_read(file, &(char){ 0 }, 1, NULL);
+ if (n > 0)
+ coredump_report_failure("Coredump socket had unexpected data");
+ else if (n < 0)
+ coredump_report_failure("Coredump socket failed");
+}
+
+static inline void coredump_sock_shutdown(struct file *file)
+{
+ struct socket *socket;
+
+ socket = sock_from_file(file);
+ if (!socket)
+ return;
+
+ /* Let userspace know we're done processing the coredump. */
+ kernel_sock_shutdown(socket, SHUT_WR);
+}
+
+static bool coredump_sock_request(struct core_name *cn, struct coredump_params *cprm)
+{
+ struct coredump_req req = {
+ .size = sizeof(struct coredump_req),
+ .mask = COREDUMP_KERNEL | COREDUMP_USERSPACE |
+ COREDUMP_REJECT | COREDUMP_WAIT,
+ .size_ack = sizeof(struct coredump_ack),
+ };
+ struct coredump_ack ack = {};
+ ssize_t usize;
+
+ if (cn->core_type != COREDUMP_SOCK_REQ)
+ return true;
+
+ /* Let userspace know what we support. */
+ if (!coredump_sock_send(cprm->file, &req))
+ return false;
+
+ /* Peek the size of the coredump_ack. */
+ if (!coredump_sock_recv(cprm->file, &ack, sizeof(ack.size),
+ MSG_PEEK | MSG_WAITALL))
+ return false;
+
+ /* Refuse unknown coredump_ack sizes. */
+ usize = ack.size;
+ if (usize < COREDUMP_ACK_SIZE_VER0) {
+ coredump_sock_mark(cprm->file, COREDUMP_MARK_MINSIZE);
+ return false;
+ }
+
+ if (usize > sizeof(ack)) {
+ coredump_sock_mark(cprm->file, COREDUMP_MARK_MAXSIZE);
+ return false;
+ }
+
+ /* Now retrieve the coredump_ack. */
+ if (!coredump_sock_recv(cprm->file, &ack, usize, MSG_WAITALL))
+ return false;
+ if (ack.size != usize)
+ return false;
+
+ /* Refuse unknown coredump_ack flags. */
+ if (ack.mask & ~req.mask) {
+ coredump_sock_mark(cprm->file, COREDUMP_MARK_UNSUPPORTED);
+ return false;
+ }
+
+ /* Refuse mutually exclusive options. */
+ if (hweight64(ack.mask & (COREDUMP_USERSPACE | COREDUMP_KERNEL |
+ COREDUMP_REJECT)) != 1) {
+ coredump_sock_mark(cprm->file, COREDUMP_MARK_CONFLICTING);
+ return false;
+ }
+
+ if (ack.spare) {
+ coredump_sock_mark(cprm->file, COREDUMP_MARK_UNSUPPORTED);
+ return false;
+ }
+
+ cn->mask = ack.mask;
+ return coredump_sock_mark(cprm->file, COREDUMP_MARK_REQACK);
+}
+
+static bool coredump_socket(struct core_name *cn, struct coredump_params *cprm)
+{
+ if (!coredump_sock_connect(cn, cprm))
+ return false;
+
+ return coredump_sock_request(cn, cprm);
+}
+#else
+static inline void coredump_sock_wait(struct file *file) { }
+static inline void coredump_sock_shutdown(struct file *file) { }
+static inline bool coredump_socket(struct core_name *cn, struct coredump_params *cprm) { return false; }
+#endif
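The request/ack handshake above has a userspace counterpart; below is a minimal sketch of a coredump server's reply, assuming the struct layout from <uapi/linux/coredump.h> introduced alongside this series and eliding the listen()/accept() setup:

#include <linux/coredump.h>
#include <string.h>
#include <unistd.h>

static int coredump_server_ack(int fd)
{
	struct coredump_req req;
	struct coredump_ack ack;

	/* The kernel speaks first and advertises what it supports. */
	if (read(fd, &req, sizeof(req)) != (ssize_t)sizeof(req))
		return -1;

	memset(&ack, 0, sizeof(ack));
	ack.size = sizeof(ack);
	/* Pick exactly one of KERNEL/USERSPACE/REJECT; WAIT is optional. */
	ack.mask = req.mask & (COREDUMP_KERNEL | COREDUMP_WAIT);

	if (write(fd, &ack, sizeof(ack)) != (ssize_t)sizeof(ack))
		return -1;

	/* On success the kernel replies with a __u32 COREDUMP_MARK_REQACK. */
	return 0;
}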
+
+/* cprm->mm_flags contains a stable snapshot of dumpability flags. */
+static inline bool coredump_force_suid_safe(const struct coredump_params *cprm)
+{
+ /* Require nonrelative corefile path and be extra careful. */
+ return __get_dumpable(cprm->mm_flags) == SUID_DUMP_ROOT;
+}
+
+static bool coredump_file(struct core_name *cn, struct coredump_params *cprm,
+ const struct linux_binfmt *binfmt)
+{
+ struct mnt_idmap *idmap;
+ struct inode *inode;
+ struct file *file __free(fput) = NULL;
+ int open_flags = O_CREAT | O_WRONLY | O_NOFOLLOW | O_LARGEFILE | O_EXCL;
+
+ if (cprm->limit < binfmt->min_coredump)
+ return false;
+
+ if (coredump_force_suid_safe(cprm) && cn->corename[0] != '/') {
+ coredump_report_failure("this process can only dump core to a fully qualified path, skipping core dump");
+ return false;
+ }
+
+ /*
+ * Unlink the file if it exists unless this is a SUID
+ * binary - in that case, we're running around with root
+ * privs and don't want to unlink another user's coredump.
+ */
+ if (!coredump_force_suid_safe(cprm)) {
+ /*
+ * If it doesn't exist, that's fine. If there's some
+ * other problem, we'll catch it at the filp_open().
+ */
+ do_unlinkat(AT_FDCWD, getname_kernel(cn->corename));
+ }
+
+ /*
+ * There is a race between unlinking and creating the
+ * file, but if that causes an EEXIST here, that's
+ * fine - another process raced with us while creating
+ * the corefile, and the other process won. To userspace,
+ * what matters is that at least one of the two processes
+ * writes its coredump successfully, not which one.
+ */
+ if (coredump_force_suid_safe(cprm)) {
+ /*
+ * Using user namespaces, normal user tasks can change
+ * their current->fs->root to point to arbitrary
+ * directories. Since the intention of the "only dump
+ * with a fully qualified path" rule is to control where
+ * coredumps may be placed using root privileges,
+ * current->fs->root must not be used. Instead, use the
+ * root directory of init_task.
+ */
+ struct path root;
+
+ task_lock(&init_task);
+ get_fs_root(init_task.fs, &root);
+ task_unlock(&init_task);
+ file = file_open_root(&root, cn->corename, open_flags, 0600);
+ path_put(&root);
+ } else {
+ file = filp_open(cn->corename, open_flags, 0600);
+ }
+ if (IS_ERR(file))
+ return false;
+
+ inode = file_inode(file);
+ if (inode->i_nlink > 1)
+ return false;
+ if (d_unhashed(file->f_path.dentry))
+ return false;
+ /*
+ * AK: actually i see no reason to not allow this for named
+ * pipes etc, but keep the previous behaviour for now.
+ */
+ if (!S_ISREG(inode->i_mode))
+ return false;
+ /*
+ * Don't dump core if the filesystem changed owner or mode
+ * of the file during file creation. This is an issue when
+ * a process dumps core while its cwd is e.g. on a vfat
+ * filesystem.
+ */
+ idmap = file_mnt_idmap(file);
+ if (!vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) {
+ coredump_report_failure("Core dump to %s aborted: cannot preserve file owner", cn->corename);
+ return false;
+ }
+ if ((inode->i_mode & 0677) != 0600) {
+ coredump_report_failure("Core dump to %s aborted: cannot preserve file permissions", cn->corename);
+ return false;
+ }
+ if (!(file->f_mode & FMODE_CAN_WRITE))
+ return false;
+ if (do_truncate(idmap, file->f_path.dentry, 0, 0, file))
+ return false;
+
+ cprm->file = no_free_ptr(file);
+ return true;
+}
+
+static bool coredump_pipe(struct core_name *cn, struct coredump_params *cprm,
+ size_t *argv, int argc)
{
+ int argi;
+ char **helper_argv __free(kfree) = NULL;
+ struct subprocess_info *sub_info;
+
+ if (cprm->limit == 1) {
+ /* See umh_coredump_setup() which sets RLIMIT_CORE = 1.
+ *
+ * Normally core limits are irrelevant to pipes, since
+ * we're not writing to the file system, but we use
+	 * cprm.limit of 1 here as a special value; this is a
+ * consistent way to catch recursive crashes.
+ * We can still crash if the core_pattern binary sets
+ * RLIM_CORE = !1, but it runs as root, and can do
+ * lots of stupid things.
+ *
+ * Note that we use task_tgid_vnr here to grab the pid
+ * of the process group leader. That way we get the
+ * right pid if a thread in a multi-threaded
+ * core_pattern process dies.
+ */
+ coredump_report_failure("RLIMIT_CORE is set to 1, aborting core");
+ return false;
+ }
+ cprm->limit = RLIM_INFINITY;
+
+ cn->core_pipe_limit = atomic_inc_return(&core_pipe_count);
+ if (core_pipe_limit && (core_pipe_limit < cn->core_pipe_limit)) {
+ coredump_report_failure("over core_pipe_limit, skipping core dump");
+ return false;
+ }
+
+ helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv), GFP_KERNEL);
+ if (!helper_argv) {
+ coredump_report_failure("%s failed to allocate memory", __func__);
+ return false;
+ }
+ for (argi = 0; argi < argc; argi++)
+ helper_argv[argi] = cn->corename + argv[argi];
+ helper_argv[argi] = NULL;
+
+ sub_info = call_usermodehelper_setup(helper_argv[0], helper_argv, NULL,
+ GFP_KERNEL, umh_coredump_setup,
+ NULL, cprm);
+ if (!sub_info)
+ return false;
+
+ if (call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC)) {
+ coredump_report_failure("|%s pipe failed", cn->corename);
+ return false;
+ }
+
+ /*
+ * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would
+ * have this set to NULL.
+ */
+ if (!cprm->file) {
+ coredump_report_failure("Core dump to |%s disabled", cn->corename);
+ return false;
+ }
+
+ return true;
+}
+
+static bool coredump_write(struct core_name *cn,
+ struct coredump_params *cprm,
+ struct linux_binfmt *binfmt)
+{
+
+ if (dump_interrupted())
+ return true;
+
+ if (!dump_vma_snapshot(cprm))
+ return false;
+
+ file_start_write(cprm->file);
+ cn->core_dumped = binfmt->core_dump(cprm);
+ /*
+ * Ensures that file size is big enough to contain the current
+	 * file position. This prevents gdb from complaining about
+ * a truncated file if the last "write" to the file was
+ * dump_skip.
+ */
+ if (cprm->to_skip) {
+ cprm->to_skip--;
+ dump_emit(cprm, "", 1);
+ }
+ file_end_write(cprm->file);
+ free_vma_snapshot(cprm);
+ return true;
+}
+
+static void coredump_cleanup(struct core_name *cn, struct coredump_params *cprm)
+{
+ if (cprm->file)
+ filp_close(cprm->file, NULL);
+ if (cn->core_pipe_limit) {
+ VFS_WARN_ON_ONCE(cn->core_type != COREDUMP_PIPE);
+ atomic_dec(&core_pipe_count);
+ }
+ kfree(cn->corename);
+ coredump_finish(cn->core_dumped);
+}
+
+static inline bool coredump_skip(const struct coredump_params *cprm,
+ const struct linux_binfmt *binfmt)
+{
+ if (!binfmt)
+ return true;
+ if (!binfmt->core_dump)
+ return true;
+ if (!__get_dumpable(cprm->mm_flags))
+ return true;
+ return false;
+}
+
+void vfs_coredump(const kernel_siginfo_t *siginfo)
+{
+ struct cred *cred __free(put_cred) = NULL;
+ size_t *argv __free(kfree) = NULL;
struct core_state core_state;
struct core_name cn;
struct mm_struct *mm = current->mm;
- struct linux_binfmt * binfmt;
+ struct linux_binfmt *binfmt = mm->binfmt;
const struct cred *old_cred;
- struct cred *cred;
- int retval = 0;
- size_t *argv = NULL;
int argc = 0;
- /* require nonrelative corefile path and be extra careful */
- bool need_suid_safe = false;
- bool core_dumped = false;
- static atomic_t core_dump_count = ATOMIC_INIT(0);
struct coredump_params cprm = {
.siginfo = siginfo,
.limit = rlimit(RLIMIT_CORE),
@@ -662,357 +1111,92 @@ void do_coredump(const kernel_siginfo_t *siginfo)
audit_core_dumps(siginfo->si_signo);
- binfmt = mm->binfmt;
- if (!binfmt || !binfmt->core_dump)
- goto fail;
- if (!__get_dumpable(cprm.mm_flags))
- goto fail;
+ if (coredump_skip(&cprm, binfmt))
+ return;
cred = prepare_creds();
if (!cred)
- goto fail;
+ return;
/*
* We cannot trust fsuid as being the "true" uid of the process
* nor do we know its entire history. We only know it was tainted
* so we dump it as root in mode 2, and only into a controlled
* environment (pipe handler or fully qualified path).
*/
- if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) {
- /* Setuid core dump mode */
- cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */
- need_suid_safe = true;
- }
+ if (coredump_force_suid_safe(&cprm))
+ cred->fsuid = GLOBAL_ROOT_UID;
- retval = coredump_wait(siginfo->si_signo, &core_state);
- if (retval < 0)
- goto fail_creds;
+ if (coredump_wait(siginfo->si_signo, &core_state) < 0)
+ return;
old_cred = override_creds(cred);
- retval = format_corename(&cn, &cprm, &argv, &argc);
- if (retval < 0) {
+ if (!coredump_parse(&cn, &cprm, &argv, &argc)) {
coredump_report_failure("format_corename failed, aborting core");
- goto fail_unlock;
+ goto close_fail;
}
switch (cn.core_type) {
- case COREDUMP_FILE: {
- struct mnt_idmap *idmap;
- struct inode *inode;
- int open_flags = O_CREAT | O_WRONLY | O_NOFOLLOW |
- O_LARGEFILE | O_EXCL;
-
- if (cprm.limit < binfmt->min_coredump)
- goto fail_unlock;
-
- if (need_suid_safe && cn.corename[0] != '/') {
- coredump_report_failure(
- "this process can only dump core to a fully qualified path, skipping core dump");
- goto fail_unlock;
- }
-
- /*
- * Unlink the file if it exists unless this is a SUID
- * binary - in that case, we're running around with root
- * privs and don't want to unlink another user's coredump.
- */
- if (!need_suid_safe) {
- /*
- * If it doesn't exist, that's fine. If there's some
- * other problem, we'll catch it at the filp_open().
- */
- do_unlinkat(AT_FDCWD, getname_kernel(cn.corename));
- }
-
- /*
- * There is a race between unlinking and creating the
- * file, but if that causes an EEXIST here, that's
- * fine - another process raced with us while creating
- * the corefile, and the other process won. To userspace,
- * what matters is that at least one of the two processes
- * writes its coredump successfully, not which one.
- */
- if (need_suid_safe) {
- /*
- * Using user namespaces, normal user tasks can change
- * their current->fs->root to point to arbitrary
- * directories. Since the intention of the "only dump
- * with a fully qualified path" rule is to control where
- * coredumps may be placed using root privileges,
- * current->fs->root must not be used. Instead, use the
- * root directory of init_task.
- */
- struct path root;
-
- task_lock(&init_task);
- get_fs_root(init_task.fs, &root);
- task_unlock(&init_task);
- cprm.file = file_open_root(&root, cn.corename,
- open_flags, 0600);
- path_put(&root);
- } else {
- cprm.file = filp_open(cn.corename, open_flags, 0600);
- }
- if (IS_ERR(cprm.file))
- goto fail_unlock;
-
- inode = file_inode(cprm.file);
- if (inode->i_nlink > 1)
- goto close_fail;
- if (d_unhashed(cprm.file->f_path.dentry))
- goto close_fail;
- /*
- * AK: actually i see no reason to not allow this for named
- * pipes etc, but keep the previous behaviour for now.
- */
- if (!S_ISREG(inode->i_mode))
- goto close_fail;
- /*
- * Don't dump core if the filesystem changed owner or mode
- * of the file during file creation. This is an issue when
- * a process dumps core while its cwd is e.g. on a vfat
- * filesystem.
- */
- idmap = file_mnt_idmap(cprm.file);
- if (!vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode),
- current_fsuid())) {
- coredump_report_failure("Core dump to %s aborted: "
- "cannot preserve file owner", cn.corename);
- goto close_fail;
- }
- if ((inode->i_mode & 0677) != 0600) {
- coredump_report_failure("Core dump to %s aborted: "
- "cannot preserve file permissions", cn.corename);
- goto close_fail;
- }
- if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
- goto close_fail;
- if (do_truncate(idmap, cprm.file->f_path.dentry,
- 0, 0, cprm.file))
+ case COREDUMP_FILE:
+ if (!coredump_file(&cn, &cprm, binfmt))
goto close_fail;
break;
- }
- case COREDUMP_PIPE: {
- int argi;
- int dump_count;
- char **helper_argv;
- struct subprocess_info *sub_info;
-
- if (cprm.limit == 1) {
- /* See umh_coredump_setup() which sets RLIMIT_CORE = 1.
- *
- * Normally core limits are irrelevant to pipes, since
- * we're not writing to the file system, but we use
- * cprm.limit of 1 here as a special value, this is a
- * consistent way to catch recursive crashes.
- * We can still crash if the core_pattern binary sets
- * RLIM_CORE = !1, but it runs as root, and can do
- * lots of stupid things.
- *
- * Note that we use task_tgid_vnr here to grab the pid
- * of the process group leader. That way we get the
- * right pid if a thread in a multi-threaded
- * core_pattern process dies.
- */
- coredump_report_failure("RLIMIT_CORE is set to 1, aborting core");
- goto fail_unlock;
- }
- cprm.limit = RLIM_INFINITY;
-
- dump_count = atomic_inc_return(&core_dump_count);
- if (core_pipe_limit && (core_pipe_limit < dump_count)) {
- coredump_report_failure("over core_pipe_limit, skipping core dump");
- goto fail_dropcount;
- }
-
- helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv),
- GFP_KERNEL);
- if (!helper_argv) {
- coredump_report_failure("%s failed to allocate memory", __func__);
- goto fail_dropcount;
- }
- for (argi = 0; argi < argc; argi++)
- helper_argv[argi] = cn.corename + argv[argi];
- helper_argv[argi] = NULL;
-
- retval = -ENOMEM;
- sub_info = call_usermodehelper_setup(helper_argv[0],
- helper_argv, NULL, GFP_KERNEL,
- umh_coredump_setup, NULL, &cprm);
- if (sub_info)
- retval = call_usermodehelper_exec(sub_info,
- UMH_WAIT_EXEC);
-
- kfree(helper_argv);
- if (retval) {
- coredump_report_failure("|%s pipe failed", cn.corename);
+ case COREDUMP_PIPE:
+ if (!coredump_pipe(&cn, &cprm, argv, argc))
goto close_fail;
- }
break;
- }
- case COREDUMP_SOCK: {
-#ifdef CONFIG_UNIX
- struct file *file __free(fput) = NULL;
- struct sockaddr_un addr = {
- .sun_family = AF_UNIX,
- };
- ssize_t addr_len;
- struct socket *socket;
-
- addr_len = strscpy(addr.sun_path, cn.corename);
- if (addr_len < 0)
- goto close_fail;
- addr_len += offsetof(struct sockaddr_un, sun_path) + 1;
-
- /*
- * It is possible that the userspace process which is
- * supposed to handle the coredump and is listening on
- * the AF_UNIX socket coredumps. Userspace should just
- * mark itself non dumpable.
- */
-
- retval = sock_create_kern(&init_net, AF_UNIX, SOCK_STREAM, 0, &socket);
- if (retval < 0)
- goto close_fail;
-
- file = sock_alloc_file(socket, 0, NULL);
- if (IS_ERR(file))
- goto close_fail;
-
- /*
- * Set the thread-group leader pid which is used for the
- * peer credentials during connect() below. Then
- * immediately register it in pidfs...
- */
- cprm.pid = task_tgid(current);
- retval = pidfs_register_pid(cprm.pid);
- if (retval)
- goto close_fail;
-
- /*
- * ... and set the coredump information so userspace
- * has it available after connect()...
- */
- pidfs_coredump(&cprm);
-
- retval = kernel_connect(socket, (struct sockaddr *)(&addr),
- addr_len, O_NONBLOCK | SOCK_COREDUMP);
-
- /*
- * ... Make sure to only put our reference after connect() took
- * its own reference keeping the pidfs entry alive ...
- */
- pidfs_put_pid(cprm.pid);
-
- if (retval) {
- if (retval == -EAGAIN)
- coredump_report_failure("Coredump socket %s receive queue full", addr.sun_path);
- else
- coredump_report_failure("Coredump socket connection %s failed %d", addr.sun_path, retval);
+ case COREDUMP_SOCK_REQ:
+ fallthrough;
+ case COREDUMP_SOCK:
+ if (!coredump_socket(&cn, &cprm))
goto close_fail;
- }
-
- /* ... and validate that @sk_peer_pid matches @cprm.pid. */
- if (WARN_ON_ONCE(unix_peer(socket->sk)->sk_peer_pid != cprm.pid))
- goto close_fail;
-
- cprm.limit = RLIM_INFINITY;
- cprm.file = no_free_ptr(file);
-#else
- coredump_report_failure("Core dump socket support %s disabled", cn.corename);
- goto close_fail;
-#endif
break;
- }
default:
WARN_ON_ONCE(true);
goto close_fail;
}
+ /* Don't even generate the coredump. */
+ if (cn.mask & COREDUMP_REJECT)
+ goto close_fail;
+
/* get us an unshared descriptor table; almost always a no-op */
/* The cell spufs coredump code reads the file descriptor tables */
- retval = unshare_files();
- if (retval)
+ if (unshare_files())
goto close_fail;
- if (!dump_interrupted()) {
- /*
- * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would
- * have this set to NULL.
- */
- if (!cprm.file) {
- coredump_report_failure("Core dump to |%s disabled", cn.corename);
- goto close_fail;
- }
- if (!dump_vma_snapshot(&cprm))
- goto close_fail;
- file_start_write(cprm.file);
- core_dumped = binfmt->core_dump(&cprm);
- /*
- * Ensures that file size is big enough to contain the current
- * file postion. This prevents gdb from complaining about
- * a truncated file if the last "write" to the file was
- * dump_skip.
- */
- if (cprm.to_skip) {
- cprm.to_skip--;
- dump_emit(&cprm, "", 1);
- }
- file_end_write(cprm.file);
- free_vma_snapshot(&cprm);
- }
+ if ((cn.mask & COREDUMP_KERNEL) && !coredump_write(&cn, &cprm, binfmt))
+ goto close_fail;
-#ifdef CONFIG_UNIX
- /* Let userspace know we're done processing the coredump. */
- if (sock_from_file(cprm.file))
- kernel_sock_shutdown(sock_from_file(cprm.file), SHUT_WR);
-#endif
+ coredump_sock_shutdown(cprm.file);
+
+ /* Let the parent know that a coredump was generated. */
+ if (cn.mask & COREDUMP_USERSPACE)
+ cn.core_dumped = true;
/*
* When core_pipe_limit is set we wait for the coredump server
* or usermodehelper to finish before exiting so it can e.g.,
* inspect /proc/<pid>.
*/
- if (core_pipe_limit) {
+ if (cn.mask & COREDUMP_WAIT) {
switch (cn.core_type) {
case COREDUMP_PIPE:
wait_for_dump_helpers(cprm.file);
break;
-#ifdef CONFIG_UNIX
- case COREDUMP_SOCK: {
- ssize_t n;
-
- /*
- * We use a simple read to wait for the coredump
- * processing to finish. Either the socket is
- * closed or we get sent unexpected data. In
- * both cases, we're done.
- */
- n = __kernel_read(cprm.file, &(char){ 0 }, 1, NULL);
- if (n != 0)
- coredump_report_failure("Unexpected data on coredump socket");
+ case COREDUMP_SOCK_REQ:
+ fallthrough;
+ case COREDUMP_SOCK:
+ coredump_sock_wait(cprm.file);
break;
- }
-#endif
default:
break;
}
}
close_fail:
- if (cprm.file)
- filp_close(cprm.file, NULL);
-fail_dropcount:
- if (cn.core_type == COREDUMP_PIPE)
- atomic_dec(&core_dump_count);
-fail_unlock:
- kfree(argv);
- kfree(cn.corename);
- coredump_finish(core_dumped);
+ coredump_cleanup(&cn, &cprm);
revert_creds(old_cred);
-fail_creds:
- put_cred(cred);
-fail:
return;
}
@@ -1238,6 +1422,8 @@ void validate_coredump_safety(void)
static inline bool check_coredump_socket(void)
{
+ const char *p;
+
if (core_pattern[0] != '@')
return true;
@@ -1249,8 +1435,25 @@ static inline bool check_coredump_socket(void)
if (current->nsproxy->mnt_ns != init_task.nsproxy->mnt_ns)
return false;
- /* Must be an absolute path. */
- if (*(core_pattern + 1) != '/')
+ /* Must be an absolute path... */
+ if (core_pattern[1] != '/') {
+ /* ... or the socket request protocol... */
+ if (core_pattern[1] != '@')
+ return false;
+ /* ... and if so must be an absolute path. */
+ if (core_pattern[2] != '/')
+ return false;
+ p = &core_pattern[2];
+ } else {
+ p = &core_pattern[1];
+ }
+
+ /* The path obviously cannot exceed UNIX_PATH_MAX. */
+ if (strlen(p) >= UNIX_PATH_MAX)
+ return false;
+
+ /* Must not contain ".." in the path. */
+ if (name_contains_dotdot(core_pattern))
return false;
return true;
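Put together, the core_pattern values accepted by these checks look like the following (the paths are assumed examples); '@' selects the plain coredump socket and '@@' the request/ack protocol:

	@/run/systemd/coredump.sock	/* COREDUMP_SOCK: stream the dump */
	@@/run/coredump/request.sock	/* COREDUMP_SOCK_REQ: negotiate first */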
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index b84d1747a020..b002e9b734f9 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -17,7 +17,6 @@
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
-#include <linux/pfn_t.h>
#include <linux/ramfs.h>
#include <linux/init.h>
#include <linux/string.h>
@@ -412,8 +411,8 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma)
for (i = 0; i < pages && !ret; i++) {
vm_fault_t vmf;
unsigned long off = i * PAGE_SIZE;
- pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV);
- vmf = vmf_insert_mixed(vma, vma->vm_start + off, pfn);
+ vmf = vmf_insert_mixed(vma, vma->vm_start + off,
+ address + off);
if (vmf & VM_FAULT_ERROR)
ret = vm_fault_to_errno(vmf, 0);
}
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 0ad8c30b8fa5..486fcb2ecf13 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -7,10 +7,12 @@
* Copyright (C) 2015, Motorola Mobility
*/
-#include <linux/pagemap.h>
-#include <linux/module.h>
#include <linux/bio.h>
+#include <linux/export.h>
+#include <linux/module.h>
#include <linux/namei.h>
+#include <linux/pagemap.h>
+
#include "fscrypt_private.h"
/**
@@ -165,8 +167,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
do {
err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, du_index,
ZERO_PAGE(0), pages[i],
- du_size, offset,
- GFP_NOFS);
+ du_size, offset);
if (err)
goto out;
du_index++;
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index b74b5937e695..b6ccab524fde 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -20,12 +20,14 @@
* Special Publication 800-38E and IEEE P1619/D16.
*/
-#include <linux/pagemap.h>
+#include <crypto/skcipher.h>
+#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/module.h>
-#include <linux/scatterlist.h>
+#include <linux/pagemap.h>
#include <linux/ratelimit.h>
-#include <crypto/skcipher.h>
+#include <linux/scatterlist.h>
+
#include "fscrypt_private.h"
static unsigned int num_prealloc_crypto_pages = 32;
@@ -108,15 +110,13 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index,
int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
fscrypt_direction_t rw, u64 index,
struct page *src_page, struct page *dest_page,
- unsigned int len, unsigned int offs,
- gfp_t gfp_flags)
+ unsigned int len, unsigned int offs)
{
+ struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm;
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
union fscrypt_iv iv;
- struct skcipher_request *req = NULL;
- DECLARE_CRYPTO_WAIT(wait);
struct scatterlist dst, src;
- struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
- int res = 0;
+ int err;
if (WARN_ON_ONCE(len <= 0))
return -EINVAL;
@@ -125,31 +125,23 @@ int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
fscrypt_generate_iv(&iv, index, ci);
- req = skcipher_request_alloc(tfm, gfp_flags);
- if (!req)
- return -ENOMEM;
-
skcipher_request_set_callback(
req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- crypto_req_done, &wait);
-
+ NULL, NULL);
sg_init_table(&dst, 1);
sg_set_page(&dst, dest_page, len, offs);
sg_init_table(&src, 1);
sg_set_page(&src, src_page, len, offs);
skcipher_request_set_crypt(req, &src, &dst, len, &iv);
if (rw == FS_DECRYPT)
- res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait);
+ err = crypto_skcipher_decrypt(req);
else
- res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
- skcipher_request_free(req);
- if (res) {
+ err = crypto_skcipher_encrypt(req);
+ if (err)
fscrypt_err(ci->ci_inode,
"%scryption failed for data unit %llu: %d",
- (rw == FS_DECRYPT ? "De" : "En"), index, res);
- return res;
- }
- return 0;
+ (rw == FS_DECRYPT ? "De" : "En"), index, err);
+ return err;
}
/**
@@ -204,7 +196,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio,
for (i = offs; i < offs + len; i += du_size, index++) {
err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, index,
&folio->page, ciphertext_page,
- du_size, i, gfp_flags);
+ du_size, i);
if (err) {
fscrypt_free_bounce_page(ciphertext_page);
return ERR_PTR(err);
@@ -225,7 +217,6 @@ EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks);
* @offs: Byte offset within @page at which the block to encrypt begins
* @lblk_num: Filesystem logical block number of the block, i.e. the 0-based
* number of the block within the file
- * @gfp_flags: Memory allocation flags
*
* Encrypt a possibly-compressed filesystem block that is located in an
* arbitrary page, not necessarily in the original pagecache page. The @inode
@@ -237,13 +228,12 @@ EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks);
*/
int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
unsigned int len, unsigned int offs,
- u64 lblk_num, gfp_t gfp_flags)
+ u64 lblk_num)
{
if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units))
return -EOPNOTSUPP;
return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_ENCRYPT,
- lblk_num, page, page, len, offs,
- gfp_flags);
+ lblk_num, page, page, len, offs);
}
EXPORT_SYMBOL(fscrypt_encrypt_block_inplace);
@@ -283,8 +273,7 @@ int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len,
struct page *page = folio_page(folio, i >> PAGE_SHIFT);
err = fscrypt_crypt_data_unit(ci, FS_DECRYPT, index, page,
- page, du_size, i & ~PAGE_MASK,
- GFP_NOFS);
+ page, du_size, i & ~PAGE_MASK);
if (err)
return err;
}
@@ -317,8 +306,7 @@ int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page,
if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units))
return -EOPNOTSUPP;
return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_DECRYPT,
- lblk_num, page, page, len, offs,
- GFP_NOFS);
+ lblk_num, page, page, len, offs);
}
EXPORT_SYMBOL(fscrypt_decrypt_block_inplace);
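
The conversion above follows one pattern throughout: the heap-allocated request plus DECLARE_CRYPTO_WAIT()/crypto_wait_req() machinery is replaced by an on-stack request against a crypto_sync_skcipher, which completes synchronously and so can neither fail allocation nor return -EINPROGRESS; the gfp_flags parameters become unnecessary as a side effect. A minimal sketch of the pattern, assuming a tfm that has already been allocated and keyed (encrypt_in_place() is hypothetical):

static int encrypt_in_place(struct crypto_sync_skcipher *tfm,
			    struct page *page, unsigned int len,
			    unsigned int offs, u8 *iv)
{
	SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
	struct scatterlist sg;

	/* No completion callback needed: the request is synchronous. */
	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
				      NULL, NULL);
	sg_init_table(&sg, 1);
	sg_set_page(&sg, page, len, offs);
	skcipher_request_set_crypt(req, &sg, &sg, len, iv);
	return crypto_skcipher_encrypt(req);
}
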
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 010f9c0a4c2f..f9f6713e144f 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -11,11 +11,13 @@
* This has not yet undergone a rigorous security audit.
*/
-#include <linux/namei.h>
-#include <linux/scatterlist.h>
#include <crypto/hash.h>
#include <crypto/sha2.h>
#include <crypto/skcipher.h>
+#include <linux/export.h>
+#include <linux/namei.h>
+#include <linux/scatterlist.h>
+
#include "fscrypt_private.h"
/*
@@ -92,13 +94,12 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
u8 *out, unsigned int olen)
{
- struct skcipher_request *req = NULL;
- DECLARE_CRYPTO_WAIT(wait);
const struct fscrypt_inode_info *ci = inode->i_crypt_info;
- struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
+ struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm;
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
union fscrypt_iv iv;
struct scatterlist sg;
- int res;
+ int err;
/*
* Copy the filename to the output buffer for encrypting in-place and
@@ -109,28 +110,17 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
memcpy(out, iname->name, iname->len);
memset(out + iname->len, 0, olen - iname->len);
- /* Initialize the IV */
fscrypt_generate_iv(&iv, 0, ci);
- /* Set up the encryption request */
- req = skcipher_request_alloc(tfm, GFP_NOFS);
- if (!req)
- return -ENOMEM;
- skcipher_request_set_callback(req,
- CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- crypto_req_done, &wait);
+ skcipher_request_set_callback(
+ req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ NULL, NULL);
sg_init_one(&sg, out, olen);
skcipher_request_set_crypt(req, &sg, &sg, olen, &iv);
-
- /* Do the encryption */
- res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
- skcipher_request_free(req);
- if (res < 0) {
- fscrypt_err(inode, "Filename encryption failed: %d", res);
- return res;
- }
-
- return 0;
+ err = crypto_skcipher_encrypt(req);
+ if (err)
+ fscrypt_err(inode, "Filename encryption failed: %d", err);
+ return err;
}
EXPORT_SYMBOL_GPL(fscrypt_fname_encrypt);
@@ -148,34 +138,25 @@ static int fname_decrypt(const struct inode *inode,
const struct fscrypt_str *iname,
struct fscrypt_str *oname)
{
- struct skcipher_request *req = NULL;
- DECLARE_CRYPTO_WAIT(wait);
- struct scatterlist src_sg, dst_sg;
const struct fscrypt_inode_info *ci = inode->i_crypt_info;
- struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
+ struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm;
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
union fscrypt_iv iv;
- int res;
-
- /* Allocate request */
- req = skcipher_request_alloc(tfm, GFP_NOFS);
- if (!req)
- return -ENOMEM;
- skcipher_request_set_callback(req,
- CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- crypto_req_done, &wait);
+ struct scatterlist src_sg, dst_sg;
+ int err;
- /* Initialize IV */
fscrypt_generate_iv(&iv, 0, ci);
- /* Create decryption request */
+ skcipher_request_set_callback(
+ req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ NULL, NULL);
sg_init_one(&src_sg, iname->name, iname->len);
sg_init_one(&dst_sg, oname->name, oname->len);
skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, &iv);
- res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait);
- skcipher_request_free(req);
- if (res < 0) {
- fscrypt_err(inode, "Filename decryption failed: %d", res);
- return res;
+ err = crypto_skcipher_decrypt(req);
+ if (err) {
+ fscrypt_err(inode, "Filename decryption failed: %d", err);
+ return err;
}
oname->len = strnlen(oname->name, iname->len);
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index c1d92074b65c..d8b485b9881c 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -45,6 +45,24 @@
*/
#undef FSCRYPT_MAX_KEY_SIZE
+/*
+ * This mask is passed as the third argument to the crypto_alloc_*() functions
+ * to prevent fscrypt from using the Crypto API drivers for non-inline crypto
+ * engines. Those drivers have been problematic for fscrypt. fscrypt users
+ * have reported hangs and even incorrect en/decryption with these drivers.
+ * Since going to the driver, off CPU, and back again is really slow, such
+ * drivers can be over 50 times slower than the CPU-based code for fscrypt's
+ * workload. Even on platforms that lack AES instructions on the CPU, using the
+ * offloads has been shown to be slower, even staying with AES. (Of course,
+ * Adiantum is faster still, and is the recommended option on such platforms...)
+ *
+ * Note that fscrypt also supports inline crypto engines. Those don't use the
+ * Crypto API and work much better than the old-style (non-inline) engines.
+ */
+#define FSCRYPT_CRYPTOAPI_MASK \
+ (CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY | \
+ CRYPTO_ALG_KERN_DRIVER_ONLY)
+
#define FSCRYPT_CONTEXT_V1 1
#define FSCRYPT_CONTEXT_V2 2
@@ -221,7 +239,7 @@ struct fscrypt_symlink_data {
* Normally only one of the fields will be non-NULL.
*/
struct fscrypt_prepared_key {
- struct crypto_skcipher *tfm;
+ struct crypto_sync_skcipher *tfm;
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
struct blk_crypto_key *blk_key;
#endif
@@ -319,8 +337,7 @@ int fscrypt_initialize(struct super_block *sb);
int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
fscrypt_direction_t rw, u64 index,
struct page *src_page, struct page *dest_page,
- unsigned int len, unsigned int offs,
- gfp_t gfp_flags);
+ unsigned int len, unsigned int offs);
struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags);
void __printf(3, 4) __cold
diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c
index 0f3028adc9c7..b1ef506cd341 100644
--- a/fs/crypto/hkdf.c
+++ b/fs/crypto/hkdf.c
@@ -8,8 +8,8 @@
*/
#include <crypto/hash.h>
-#include <crypto/sha2.h>
#include <crypto/hkdf.h>
+#include <crypto/sha2.h>
#include "fscrypt_private.h"
@@ -58,7 +58,7 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
u8 prk[HKDF_HASHLEN];
int err;
- hmac_tfm = crypto_alloc_shash(HKDF_HMAC_ALG, 0, 0);
+ hmac_tfm = crypto_alloc_shash(HKDF_HMAC_ALG, 0, FSCRYPT_CRYPTOAPI_MASK);
if (IS_ERR(hmac_tfm)) {
fscrypt_err(NULL, "Error allocating " HKDF_HMAC_ALG ": %ld",
PTR_ERR(hmac_tfm));
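
Passing FSCRYPT_CRYPTOAPI_MASK as the mask argument, with a zero type, tells the Crypto API that the chosen implementation must have all three flags clear: it must be synchronous, must not allocate memory per request, and must not be a hardware-only driver. The same shape as the call above:

	struct crypto_shash *hmac_tfm;

	/*
	 * type == 0, mask bits set: the selected algorithm must have
	 * those flags clear, which rules out the problematic off-CPU
	 * drivers described in fscrypt_private.h.
	 */
	hmac_tfm = crypto_alloc_shash(HKDF_HMAC_ALG, 0, FSCRYPT_CRYPTOAPI_MASK);
	if (IS_ERR(hmac_tfm))
		return PTR_ERR(hmac_tfm);
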
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index d8d5049b8fe1..e0b32ac841f7 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -5,6 +5,8 @@
* Encryption hooks for higher-level filesystem operations.
*/
+#include <linux/export.h>
+
#include "fscrypt_private.h"
/**
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index 1d008c440cb6..caaff809765b 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -15,6 +15,7 @@
#include <linux/blk-crypto.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
+#include <linux/export.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/uio.h>
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index ace369f13068..7557f6a88b8f 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -18,12 +18,13 @@
* information about these ioctls.
*/
-#include <linux/unaligned.h>
#include <crypto/skcipher.h>
+#include <linux/export.h>
#include <linux/key-type.h>
-#include <linux/random.h>
#include <linux/once.h>
+#include <linux/random.h>
#include <linux/seq_file.h>
+#include <linux/unaligned.h>
#include "fscrypt_private.h"
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index 0d71843af946..4f3b9ecbfe4e 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -9,6 +9,7 @@
*/
#include <crypto/skcipher.h>
+#include <linux/export.h>
#include <linux/random.h>
#include "fscrypt_private.h"
@@ -96,14 +97,15 @@ select_encryption_mode(const union fscrypt_policy *policy,
}
/* Create a symmetric cipher object for the given encryption mode and key */
-static struct crypto_skcipher *
+static struct crypto_sync_skcipher *
fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key,
const struct inode *inode)
{
- struct crypto_skcipher *tfm;
+ struct crypto_sync_skcipher *tfm;
int err;
- tfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0);
+ tfm = crypto_alloc_sync_skcipher(mode->cipher_str, 0,
+ FSCRYPT_CRYPTOAPI_MASK);
if (IS_ERR(tfm)) {
if (PTR_ERR(tfm) == -ENOENT) {
fscrypt_warn(inode,
@@ -123,21 +125,22 @@ fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key,
* first time a mode is used.
*/
pr_info("fscrypt: %s using implementation \"%s\"\n",
- mode->friendly_name, crypto_skcipher_driver_name(tfm));
+ mode->friendly_name,
+ crypto_skcipher_driver_name(&tfm->base));
}
- if (WARN_ON_ONCE(crypto_skcipher_ivsize(tfm) != mode->ivsize)) {
+ if (WARN_ON_ONCE(crypto_sync_skcipher_ivsize(tfm) != mode->ivsize)) {
err = -EINVAL;
goto err_free_tfm;
}
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
- err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize);
+ crypto_sync_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
+ err = crypto_sync_skcipher_setkey(tfm, raw_key, mode->keysize);
if (err)
goto err_free_tfm;
return tfm;
err_free_tfm:
- crypto_free_skcipher(tfm);
+ crypto_free_sync_skcipher(tfm);
return ERR_PTR(err);
}
@@ -150,7 +153,7 @@ err_free_tfm:
int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
const u8 *raw_key, const struct fscrypt_inode_info *ci)
{
- struct crypto_skcipher *tfm;
+ struct crypto_sync_skcipher *tfm;
if (fscrypt_using_inline_encryption(ci))
return fscrypt_prepare_inline_crypt_key(prep_key, raw_key,
@@ -174,7 +177,7 @@ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
void fscrypt_destroy_prepared_key(struct super_block *sb,
struct fscrypt_prepared_key *prep_key)
{
- crypto_free_skcipher(prep_key->tfm);
+ crypto_free_sync_skcipher(prep_key->tfm);
fscrypt_destroy_inline_crypt_key(sb, prep_key);
memzero_explicit(prep_key, sizeof(*prep_key));
}
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
index b70521c55132..c4d05168522b 100644
--- a/fs/crypto/keysetup_v1.c
+++ b/fs/crypto/keysetup_v1.c
@@ -48,39 +48,30 @@ static int derive_key_aes(const u8 *master_key,
const u8 nonce[FSCRYPT_FILE_NONCE_SIZE],
u8 *derived_key, unsigned int derived_keysize)
{
- int res = 0;
- struct skcipher_request *req = NULL;
- DECLARE_CRYPTO_WAIT(wait);
- struct scatterlist src_sg, dst_sg;
- struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
-
- if (IS_ERR(tfm)) {
- res = PTR_ERR(tfm);
- tfm = NULL;
- goto out;
- }
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
- req = skcipher_request_alloc(tfm, GFP_KERNEL);
- if (!req) {
- res = -ENOMEM;
- goto out;
- }
- skcipher_request_set_callback(req,
- CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- crypto_req_done, &wait);
- res = crypto_skcipher_setkey(tfm, nonce, FSCRYPT_FILE_NONCE_SIZE);
- if (res < 0)
- goto out;
+ struct crypto_sync_skcipher *tfm;
+ int err;
- sg_init_one(&src_sg, master_key, derived_keysize);
- sg_init_one(&dst_sg, derived_key, derived_keysize);
- skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize,
- NULL);
- res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
-out:
- skcipher_request_free(req);
- crypto_free_skcipher(tfm);
- return res;
+ tfm = crypto_alloc_sync_skcipher("ecb(aes)", 0, FSCRYPT_CRYPTOAPI_MASK);
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ err = crypto_sync_skcipher_setkey(tfm, nonce, FSCRYPT_FILE_NONCE_SIZE);
+ if (err == 0) {
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
+ struct scatterlist src_sg, dst_sg;
+
+ skcipher_request_set_callback(req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG |
+ CRYPTO_TFM_REQ_MAY_SLEEP,
+ NULL, NULL);
+ sg_init_one(&src_sg, master_key, derived_keysize);
+ sg_init_one(&dst_sg, derived_key, derived_keysize);
+ skcipher_request_set_crypt(req, &src_sg, &dst_sg,
+ derived_keysize, NULL);
+ err = crypto_skcipher_encrypt(req);
+ }
+ crypto_free_sync_skcipher(tfm);
+ return err;
}
/*
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 701259991277..6ad30ae07c06 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -10,11 +10,13 @@
* Modified by Eric Biggers, 2019 for v2 policy support.
*/
+#include <linux/export.h>
#include <linux/fs_context.h>
+#include <linux/mount.h>
#include <linux/random.h>
#include <linux/seq_file.h>
#include <linux/string.h>
-#include <linux/mount.h>
+
#include "fscrypt_private.h"
/**
diff --git a/fs/d_path.c b/fs/d_path.c
index 5f4da5c8d5db..bb365511066b 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -241,9 +241,9 @@ static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
unsigned seq;
do {
- seq = read_seqcount_begin(&fs->seq);
+ seq = read_seqbegin(&fs->seq);
*root = fs->root;
- } while (read_seqcount_retry(&fs->seq, seq));
+ } while (read_seqretry(&fs->seq, seq));
}
/**
@@ -385,10 +385,10 @@ static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root,
unsigned seq;
do {
- seq = read_seqcount_begin(&fs->seq);
+ seq = read_seqbegin(&fs->seq);
*root = fs->root;
*pwd = fs->pwd;
- } while (read_seqcount_retry(&fs->seq, seq));
+ } while (read_seqretry(&fs->seq, seq));
}
/*
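
These hunks switch the fs_struct snapshot from seqcount primitives to full-seqlock ones, which assumes fs->seq is now a seqlock_t whose writers take the embedded spinlock. A sketch of both sides under that assumption (helper names hypothetical, path refcounting elided):

static void snapshot_root(struct fs_struct *fs, struct path *root)
{
	unsigned seq;

	do {
		seq = read_seqbegin(&fs->seq);	/* lockless reader */
		*root = fs->root;
	} while (read_seqretry(&fs->seq, seq));	/* retry if a writer raced */
}

static void set_root(struct fs_struct *fs, const struct path *path)
{
	write_seqlock(&fs->seq);		/* spinlock + sequence bump */
	fs->root = *path;
	write_sequnlock(&fs->seq);
}
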
diff --git a/fs/dax.c b/fs/dax.c
index 676303419e9e..4229513806be 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -20,7 +20,6 @@
#include <linux/sched/signal.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
-#include <linux/pfn_t.h>
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
@@ -76,9 +75,9 @@ static struct folio *dax_to_folio(void *entry)
return page_folio(pfn_to_page(dax_to_pfn(entry)));
}
-static void *dax_make_entry(pfn_t pfn, unsigned long flags)
+static void *dax_make_entry(unsigned long pfn, unsigned long flags)
{
- return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
+ return xa_mk_value(flags | (pfn << DAX_SHIFT));
}
static bool dax_is_locked(void *entry)
@@ -257,7 +256,7 @@ static void *wait_entry_unlocked_exclusive(struct xa_state *xas, void *entry)
wq = dax_entry_waitqueue(xas, entry, &ewait.key);
prepare_to_wait_exclusive(wq, &ewait.wait,
TASK_UNINTERRUPTIBLE);
- xas_pause(xas);
+ xas_reset(xas);
xas_unlock_irq(xas);
schedule();
finish_wait(wq, &ewait.wait);
@@ -449,9 +448,6 @@ static void dax_associate_entry(void *entry, struct address_space *mapping,
if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
return;
- if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
- return;
-
index = linear_page_index(vma, address & ~(size - 1));
if (shared && (folio->mapping || dax_folio_is_shared(folio))) {
if (folio->mapping)
@@ -474,9 +470,6 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
{
struct folio *folio = dax_to_folio(entry);
- if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
- return;
-
if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
return;
@@ -719,7 +712,7 @@ retry:
if (order > 0)
flags |= DAX_PMD;
- entry = dax_make_entry(pfn_to_pfn_t(0), flags);
+ entry = dax_make_entry(0, flags);
dax_lock_entry(xas, entry);
if (xas_error(xas))
goto out_unlock;
@@ -768,12 +761,6 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping,
pgoff_t end_idx;
XA_STATE(xas, &mapping->i_pages, start_idx);
- /*
- * In the 'limited' case get_user_pages() for dax is disabled.
- */
- if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
- return NULL;
-
if (!dax_mapping(mapping))
return NULL;
@@ -1053,7 +1040,7 @@ static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
* appropriate.
*/
static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
- const struct iomap_iter *iter, void *entry, pfn_t pfn,
+ const struct iomap_iter *iter, void *entry, unsigned long pfn,
unsigned long flags)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
@@ -1251,7 +1238,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
- size_t size, void **kaddr, pfn_t *pfnp)
+ size_t size, void **kaddr, unsigned long *pfnp)
{
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
int id, rc = 0;
@@ -1269,7 +1256,7 @@ static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
rc = -EINVAL;
if (PFN_PHYS(length) < size)
goto out;
- if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
+ if (*pfnp & (PHYS_PFN(size)-1))
goto out;
rc = 0;
@@ -1373,12 +1360,12 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
{
struct inode *inode = iter->inode;
unsigned long vaddr = vmf->address;
- pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
+ unsigned long pfn = my_zero_pfn(vaddr);
vm_fault_t ret;
*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
- ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), false);
+ ret = vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn), false);
trace_dax_load_hole(inode, vmf, ret);
return ret;
}
@@ -1395,14 +1382,14 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
struct folio *zero_folio;
spinlock_t *ptl;
pmd_t pmd_entry;
- pfn_t pfn;
+ unsigned long pfn;
zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm);
if (unlikely(!zero_folio))
goto fallback;
- pfn = page_to_pfn_t(&zero_folio->page);
+ pfn = page_to_pfn(&zero_folio->page);
*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
DAX_PMD | DAX_ZERO_PAGE);
@@ -1422,8 +1409,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
mm_inc_nr_ptes(vma->vm_mm);
}
- pmd_entry = mk_pmd(&zero_folio->page, vmf->vma->vm_page_prot);
- pmd_entry = pmd_mkhuge(pmd_entry);
+ pmd_entry = folio_mk_pmd(zero_folio, vmf->vma->vm_page_prot);
set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
spin_unlock(ptl);
trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry);
@@ -1792,7 +1778,8 @@ static vm_fault_t dax_fault_return(int error)
* insertion for now and return the pfn so that caller can insert it after the
* fsync is done.
*/
-static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
+static vm_fault_t dax_fault_synchronous_pfnp(unsigned long *pfnp,
+ unsigned long pfn)
{
if (WARN_ON_ONCE(!pfnp))
return VM_FAULT_SIGBUS;
@@ -1840,7 +1827,7 @@ static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
* @pmd: distinguish whether it is a pmd fault
*/
static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
- const struct iomap_iter *iter, pfn_t *pfnp,
+ const struct iomap_iter *iter, unsigned long *pfnp,
struct xa_state *xas, void **entry, bool pmd)
{
const struct iomap *iomap = &iter->iomap;
@@ -1851,7 +1838,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
unsigned long entry_flags = pmd ? DAX_PMD : 0;
struct folio *folio;
int ret, err = 0;
- pfn_t pfn;
+ unsigned long pfn;
void *kaddr;
if (!pmd && vmf->cow_page)
@@ -1888,16 +1875,15 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
folio_ref_inc(folio);
if (pmd)
- ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn_t_to_pfn(pfn)),
- write);
+ ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn), write);
else
- ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), write);
+ ret = vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn), write);
folio_put(folio);
return ret;
}
-static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
+static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, unsigned long *pfnp,
int *iomap_errp, const struct iomap_ops *ops)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
@@ -1938,7 +1924,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
* the PTE we need to set up. If so just return and the fault will be
* retried.
*/
- if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
+ if (pmd_trans_huge(*vmf->pmd)) {
ret = VM_FAULT_NOPAGE;
goto unlock_entry;
}
@@ -2009,7 +1995,7 @@ static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas,
return false;
}
-static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
+static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp,
const struct iomap_ops *ops)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
@@ -2061,8 +2047,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* the PMD we need to set up. If so just return and the fault will be
* retried.
*/
- if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
- !pmd_devmap(*vmf->pmd)) {
+ if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd)) {
ret = 0;
goto unlock_entry;
}
@@ -2091,7 +2076,7 @@ out:
return ret;
}
#else
-static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
+static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp,
const struct iomap_ops *ops)
{
return VM_FAULT_FALLBACK;
@@ -2112,7 +2097,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* successfully.
*/
vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
- pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
+ unsigned long *pfnp, int *iomap_errp,
+ const struct iomap_ops *ops)
{
if (order == 0)
return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
@@ -2132,8 +2118,8 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault);
* This function inserts a writeable PTE or PMD entry into the page tables
* for an mmaped DAX file. It also marks the page cache entry as dirty.
*/
-static vm_fault_t
-dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
+static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf,
+ unsigned long pfn, unsigned int order)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
@@ -2155,7 +2141,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
dax_lock_entry(&xas, entry);
xas_unlock_irq(&xas);
- folio = pfn_folio(pfn_t_to_pfn(pfn));
+ folio = pfn_folio(pfn);
folio_ref_inc(folio);
if (order == 0)
ret = vmf_insert_page_mkwrite(vmf, &folio->page, true);
@@ -2182,7 +2168,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
* table entry.
*/
vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
- pfn_t pfn)
+ unsigned long pfn)
{
int err;
loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
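
Throughout dax.c the pfn_t wrapper goes away in favour of a bare unsigned long pfn; the XArray entry format itself is unchanged, packing the pfn above DAX_SHIFT worth of flag bits. A sketch of the encoding these helpers rely on, mirroring dax_make_entry()/dax_to_pfn():

/* Flags (locked, PMD, zero-page, empty) live below DAX_SHIFT. */
static void *make_entry(unsigned long pfn, unsigned long flags)
{
	return xa_mk_value(flags | (pfn << DAX_SHIFT));
}

static unsigned long entry_to_pfn(void *entry)
{
	return xa_to_value(entry) >> DAX_SHIFT;
}
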
diff --git a/fs/dcache.c b/fs/dcache.c
index 03d58b2d4fa3..60046ae23d51 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1436,7 +1436,7 @@ int d_set_mounted(struct dentry *dentry)
{
struct dentry *p;
int ret = -ENOENT;
- write_seqlock(&rename_lock);
+ read_seqlock_excl(&rename_lock);
for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) {
/* Need exclusion wrt. d_invalidate() */
spin_lock(&p->d_lock);
@@ -1456,7 +1456,7 @@ int d_set_mounted(struct dentry *dentry)
}
spin_unlock(&dentry->d_lock);
out:
- write_sequnlock(&rename_lock);
+ read_sequnlock_excl(&rename_lock);
return ret;
}
@@ -1731,14 +1731,14 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
dentry->d_inode = NULL;
dentry->d_parent = dentry;
dentry->d_sb = sb;
- dentry->d_op = NULL;
+ dentry->d_op = sb->__s_d_op;
+ dentry->d_flags = sb->s_d_flags;
dentry->d_fsdata = NULL;
INIT_HLIST_BL_NODE(&dentry->d_hash);
INIT_LIST_HEAD(&dentry->d_lru);
INIT_HLIST_HEAD(&dentry->d_children);
INIT_HLIST_NODE(&dentry->d_u.d_alias);
INIT_HLIST_NODE(&dentry->d_sib);
- d_set_d_op(dentry, dentry->d_sb->s_d_op);
if (dentry->d_op && dentry->d_op->d_init) {
err = dentry->d_op->d_init(dentry);
@@ -1821,8 +1821,9 @@ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
struct dentry *dentry = __d_alloc(sb, name);
if (likely(dentry)) {
dentry->d_flags |= DCACHE_NORCU;
- if (!sb->s_d_op)
- d_set_d_op(dentry, &anon_ops);
+ /* d_op_flags(&anon_ops) is 0 */
+ if (!dentry->d_op)
+ dentry->d_op = &anon_ops;
}
return dentry;
}
@@ -1837,35 +1838,50 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
}
EXPORT_SYMBOL(d_alloc_name);
-void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
+#define DCACHE_OP_FLAGS \
+ (DCACHE_OP_HASH | DCACHE_OP_COMPARE | DCACHE_OP_REVALIDATE | \
+ DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_DELETE | DCACHE_OP_PRUNE | \
+ DCACHE_OP_REAL)
+
+static unsigned int d_op_flags(const struct dentry_operations *op)
{
+ unsigned int flags = 0;
+ if (op) {
+ if (op->d_hash)
+ flags |= DCACHE_OP_HASH;
+ if (op->d_compare)
+ flags |= DCACHE_OP_COMPARE;
+ if (op->d_revalidate)
+ flags |= DCACHE_OP_REVALIDATE;
+ if (op->d_weak_revalidate)
+ flags |= DCACHE_OP_WEAK_REVALIDATE;
+ if (op->d_delete)
+ flags |= DCACHE_OP_DELETE;
+ if (op->d_prune)
+ flags |= DCACHE_OP_PRUNE;
+ if (op->d_real)
+ flags |= DCACHE_OP_REAL;
+ }
+ return flags;
+}
+
+static void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
+{
+ unsigned int flags = d_op_flags(op);
WARN_ON_ONCE(dentry->d_op);
- WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH |
- DCACHE_OP_COMPARE |
- DCACHE_OP_REVALIDATE |
- DCACHE_OP_WEAK_REVALIDATE |
- DCACHE_OP_DELETE |
- DCACHE_OP_REAL));
+ WARN_ON_ONCE(dentry->d_flags & DCACHE_OP_FLAGS);
dentry->d_op = op;
- if (!op)
- return;
- if (op->d_hash)
- dentry->d_flags |= DCACHE_OP_HASH;
- if (op->d_compare)
- dentry->d_flags |= DCACHE_OP_COMPARE;
- if (op->d_revalidate)
- dentry->d_flags |= DCACHE_OP_REVALIDATE;
- if (op->d_weak_revalidate)
- dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE;
- if (op->d_delete)
- dentry->d_flags |= DCACHE_OP_DELETE;
- if (op->d_prune)
- dentry->d_flags |= DCACHE_OP_PRUNE;
- if (op->d_real)
- dentry->d_flags |= DCACHE_OP_REAL;
-
-}
-EXPORT_SYMBOL(d_set_d_op);
+ if (flags)
+ dentry->d_flags |= flags;
+}
+
+void set_default_d_op(struct super_block *s, const struct dentry_operations *ops)
+{
+ unsigned int flags = d_op_flags(ops);
+ s->__s_d_op = ops;
+ s->s_d_flags = (s->s_d_flags & ~DCACHE_OP_FLAGS) | flags;
+}
+EXPORT_SYMBOL(set_default_d_op);
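
With d_set_d_op() now static, filesystems set their default dentry_operations once per superblock; __d_alloc() then copies both sb->__s_d_op and the precomputed sb->s_d_flags into every new dentry instead of recomputing the DCACHE_OP_* bits each time. A sketch of the converted fill_super idiom (the examplefs_* names are hypothetical):

static int examplefs_fill_super(struct super_block *sb, struct fs_context *fc)
{
	sb->s_op = &examplefs_sops;			/* hypothetical ops */
	set_default_d_op(sb, &examplefs_dentry_ops);	/* was: sb->s_d_op = ... */
	sb->s_d_flags |= DCACHE_DONTCACHE;		/* optional extra flags */
	return 0;
}
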
static unsigned d_flags_for_inode(struct inode *inode)
{
@@ -2530,13 +2546,19 @@ struct dentry *d_alloc_parallel(struct dentry *parent,
unsigned int hash = name->hash;
struct hlist_bl_head *b = in_lookup_hash(parent, hash);
struct hlist_bl_node *node;
- struct dentry *new = d_alloc(parent, name);
+ struct dentry *new = __d_alloc(parent->d_sb, name);
struct dentry *dentry;
unsigned seq, r_seq, d_seq;
if (unlikely(!new))
return ERR_PTR(-ENOMEM);
+ new->d_flags |= DCACHE_PAR_LOOKUP;
+ spin_lock(&parent->d_lock);
+ new->d_parent = dget_dlock(parent);
+ hlist_add_head(&new->d_sib, &parent->d_children);
+ spin_unlock(&parent->d_lock);
+
retry:
rcu_read_lock();
seq = smp_load_acquire(&parent->d_inode->i_dir_seq);
@@ -2620,8 +2642,6 @@ retry:
return dentry;
}
rcu_read_unlock();
- /* we can't take ->d_lock here; it's OK, though. */
- new->d_flags |= DCACHE_PAR_LOOKUP;
new->d_wait = wq;
hlist_bl_add_head(&new->d_u.d_in_lookup_hash, b);
hlist_bl_unlock(b);
@@ -2667,7 +2687,8 @@ EXPORT_SYMBOL(__d_lookup_unhash_wake);
/* inode->i_lock held if inode is non-NULL */
-static inline void __d_add(struct dentry *dentry, struct inode *inode)
+static inline void __d_add(struct dentry *dentry, struct inode *inode,
+ const struct dentry_operations *ops)
{
wait_queue_head_t *d_wait;
struct inode *dir = NULL;
@@ -2678,6 +2699,8 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode)
n = start_dir_add(dir);
d_wait = __d_lookup_unhash(dentry);
}
+ if (unlikely(ops))
+ d_set_d_op(dentry, ops);
if (inode) {
unsigned add_flags = d_flags_for_inode(inode);
hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
@@ -2709,7 +2732,7 @@ void d_add(struct dentry *entry, struct inode *inode)
security_d_instantiate(entry, inode);
spin_lock(&inode->i_lock);
}
- __d_add(entry, inode);
+ __d_add(entry, inode, NULL);
}
EXPORT_SYMBOL(d_add);
@@ -2774,10 +2797,10 @@ static void copy_name(struct dentry *dentry, struct dentry *target)
* @target: new dentry
* @exchange: exchange the two dentries
*
- * Update the dcache to reflect the move of a file name. Negative
- * dcache entries should not be moved in this way. Caller must hold
- * rename_lock, the i_mutex of the source and target directories,
- * and the sb->s_vfs_rename_mutex if they differ. See lock_rename().
+ * Update the dcache to reflect the move of a file name. Negative dcache
+ * entries should not be moved in this way. Caller must hold rename_lock, the
+ * i_rwsem of the source and target directories (exclusively), and the sb->
+ * s_vfs_rename_mutex if they differ. See lock_rename().
*/
static void __d_move(struct dentry *dentry, struct dentry *target,
bool exchange)
@@ -2923,7 +2946,7 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
* This helper attempts to cope with remotely renamed directories
*
* It assumes that the caller is already holding
- * dentry->d_parent->d_inode->i_mutex, and rename_lock
+ * dentry->d_parent->d_inode->i_rwsem, and rename_lock
*
* Note: If ever the locking in lock_rename() changes, then please
* remember to update this too...
@@ -2961,30 +2984,8 @@ out_err:
return ret;
}
-/**
- * d_splice_alias - splice a disconnected dentry into the tree if one exists
- * @inode: the inode which may have a disconnected dentry
- * @dentry: a negative dentry which we want to point to the inode.
- *
- * If inode is a directory and has an IS_ROOT alias, then d_move that in
- * place of the given dentry and return it, else simply d_add the inode
- * to the dentry and return NULL.
- *
- * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
- * we should error out: directories can't have multiple aliases.
- *
- * This is needed in the lookup routine of any filesystem that is exportable
- * (via knfsd) so that we can build dcache paths to directories effectively.
- *
- * If a dentry was found and moved, then it is returned. Otherwise NULL
- * is returned. This matches the expected return value of ->lookup.
- *
- * Cluster filesystems may call this function with a negative, hashed dentry.
- * In that case, we know that the inode will be a regular file, and also this
- * will only occur during atomic_open. So we need to check for the dentry
- * being already hashed only in the final case.
- */
-struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
+struct dentry *d_splice_alias_ops(struct inode *inode, struct dentry *dentry,
+ const struct dentry_operations *ops)
{
if (IS_ERR(inode))
return ERR_CAST(inode);
@@ -3030,9 +3031,37 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
}
}
out:
- __d_add(dentry, inode);
+ __d_add(dentry, inode, ops);
return NULL;
}
+
+/**
+ * d_splice_alias - splice a disconnected dentry into the tree if one exists
+ * @inode: the inode which may have a disconnected dentry
+ * @dentry: a negative dentry which we want to point to the inode.
+ *
+ * If inode is a directory and has an IS_ROOT alias, then d_move that in
+ * place of the given dentry and return it, else simply d_add the inode
+ * to the dentry and return NULL.
+ *
+ * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
+ * we should error out: directories can't have multiple aliases.
+ *
+ * This is needed in the lookup routine of any filesystem that is exportable
+ * (via knfsd) so that we can build dcache paths to directories effectively.
+ *
+ * If a dentry was found and moved, then it is returned. Otherwise NULL
+ * is returned. This matches the expected return value of ->lookup.
+ *
+ * Cluster filesystems may call this function with a negative, hashed dentry.
+ * In that case, we know that the inode will be a regular file, and also this
+ * will only occur during atomic_open. So we need to check for the dentry
+ * being already hashed only in the final case.
+ */
+struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
+{
+ return d_splice_alias_ops(inode, dentry, NULL);
+}
EXPORT_SYMBOL(d_splice_alias);
/*
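
d_splice_alias_ops() lets a lookup supply per-dentry operations that are applied inside __d_add() while the proper locks are held, rather than calling the now-static d_set_d_op() afterwards; an ERR_PTR inode is passed straight through as before. A sketch of a ->lookup() using it (examplefs_* names hypothetical):

static struct dentry *examplefs_lookup(struct inode *dir,
				       struct dentry *dentry,
				       unsigned int flags)
{
	struct inode *inode = examplefs_iget(dir->i_sb, &dentry->d_name);

	/* Ops are attached atomically with splicing the dentry in. */
	return d_splice_alias_ops(inode, dentry, &examplefs_dentry_ops);
}
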
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 69e9ddcb113d..3ec3324c2060 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -47,29 +47,12 @@ const struct file_operations debugfs_noop_file_operations = {
#define F_DENTRY(filp) ((filp)->f_path.dentry)
-const void *debugfs_get_aux(const struct file *file)
+void *debugfs_get_aux(const struct file *file)
{
return DEBUGFS_I(file_inode(file))->aux;
}
EXPORT_SYMBOL_GPL(debugfs_get_aux);
-const struct file_operations *debugfs_real_fops(const struct file *filp)
-{
- struct debugfs_fsdata *fsd = F_DENTRY(filp)->d_fsdata;
-
- if (!fsd) {
- /*
- * Urgh, we've been called w/o a protecting
- * debugfs_file_get().
- */
- WARN_ON(1);
- return NULL;
- }
-
- return fsd->real_fops;
-}
-EXPORT_SYMBOL_GPL(debugfs_real_fops);
-
enum dbgfs_get_mode {
DBGFS_GET_ALREADY,
DBGFS_GET_REGULAR,
@@ -302,15 +285,13 @@ static int debugfs_locked_down(struct inode *inode,
static int open_proxy_open(struct inode *inode, struct file *filp)
{
struct dentry *dentry = F_DENTRY(filp);
- const struct file_operations *real_fops = NULL;
+ const struct file_operations *real_fops = DEBUGFS_I(inode)->real_fops;
int r;
r = __debugfs_file_get(dentry, DBGFS_GET_REGULAR);
if (r)
return r == -EIO ? -ENOENT : r;
- real_fops = debugfs_real_fops(filp);
-
r = debugfs_locked_down(inode, filp, real_fops);
if (r)
goto out;
@@ -352,7 +333,6 @@ static ret_type full_proxy_ ## name(proto) \
{ \
struct dentry *dentry = F_DENTRY(filp); \
struct debugfs_fsdata *fsd = dentry->d_fsdata; \
- const struct file_operations *real_fops; \
ret_type r; \
\
if (!(fsd->methods & bit)) \
@@ -360,14 +340,13 @@ static ret_type full_proxy_ ## name(proto) \
r = debugfs_file_get(dentry); \
if (unlikely(r)) \
return r; \
- real_fops = debugfs_real_fops(filp); \
- r = real_fops->name(args); \
+ r = fsd->real_fops->name(args); \
debugfs_file_put(dentry); \
return r; \
}
-#define FULL_PROXY_FUNC_BOTH(name, ret_type, filp, proto, args, bit, ret) \
-static ret_type full_proxy_ ## name(proto) \
+#define SHORT_PROXY_FUNC(name, ret_type, filp, proto, args, bit, ret) \
+static ret_type short_proxy_ ## name(proto) \
{ \
struct dentry *dentry = F_DENTRY(filp); \
struct debugfs_fsdata *fsd = dentry->d_fsdata; \
@@ -378,27 +357,38 @@ static ret_type full_proxy_ ## name(proto) \
r = debugfs_file_get(dentry); \
if (unlikely(r)) \
return r; \
- if (fsd->real_fops) \
- r = fsd->real_fops->name(args); \
- else \
- r = fsd->short_fops->name(args); \
+ r = fsd->short_fops->name(args); \
debugfs_file_put(dentry); \
return r; \
}
-FULL_PROXY_FUNC_BOTH(llseek, loff_t, filp,
- PROTO(struct file *filp, loff_t offset, int whence),
- ARGS(filp, offset, whence), HAS_LSEEK, -ESPIPE);
+SHORT_PROXY_FUNC(llseek, loff_t, filp,
+ PROTO(struct file *filp, loff_t offset, int whence),
+ ARGS(filp, offset, whence), HAS_LSEEK, -ESPIPE);
-FULL_PROXY_FUNC_BOTH(read, ssize_t, filp,
- PROTO(struct file *filp, char __user *buf, size_t size,
- loff_t *ppos),
- ARGS(filp, buf, size, ppos), HAS_READ, -EINVAL);
+FULL_PROXY_FUNC(llseek, loff_t, filp,
+ PROTO(struct file *filp, loff_t offset, int whence),
+ ARGS(filp, offset, whence), HAS_LSEEK, -ESPIPE);
-FULL_PROXY_FUNC_BOTH(write, ssize_t, filp,
- PROTO(struct file *filp, const char __user *buf,
- size_t size, loff_t *ppos),
- ARGS(filp, buf, size, ppos), HAS_WRITE, -EINVAL);
+SHORT_PROXY_FUNC(read, ssize_t, filp,
+ PROTO(struct file *filp, char __user *buf, size_t size,
+ loff_t *ppos),
+ ARGS(filp, buf, size, ppos), HAS_READ, -EINVAL);
+
+FULL_PROXY_FUNC(read, ssize_t, filp,
+ PROTO(struct file *filp, char __user *buf, size_t size,
+ loff_t *ppos),
+ ARGS(filp, buf, size, ppos), HAS_READ, -EINVAL);
+
+SHORT_PROXY_FUNC(write, ssize_t, filp,
+ PROTO(struct file *filp, const char __user *buf,
+ size_t size, loff_t *ppos),
+ ARGS(filp, buf, size, ppos), HAS_WRITE, -EINVAL);
+
+FULL_PROXY_FUNC(write, ssize_t, filp,
+ PROTO(struct file *filp, const char __user *buf,
+ size_t size, loff_t *ppos),
+ ARGS(filp, buf, size, ppos), HAS_WRITE, -EINVAL);
FULL_PROXY_FUNC(unlocked_ioctl, long, filp,
PROTO(struct file *filp, unsigned int cmd, unsigned long arg),
@@ -410,22 +400,21 @@ static __poll_t full_proxy_poll(struct file *filp,
struct dentry *dentry = F_DENTRY(filp);
struct debugfs_fsdata *fsd = dentry->d_fsdata;
__poll_t r = 0;
- const struct file_operations *real_fops;
if (!(fsd->methods & HAS_POLL))
return DEFAULT_POLLMASK;
if (debugfs_file_get(dentry))
return EPOLLHUP;
- real_fops = debugfs_real_fops(filp);
- r = real_fops->poll(filp, wait);
+ r = fsd->real_fops->poll(filp, wait);
debugfs_file_put(dentry);
return r;
}
-static int full_proxy_release(struct inode *inode, struct file *filp)
+static int full_proxy_release(struct inode *inode, struct file *file)
{
- const struct file_operations *real_fops = debugfs_real_fops(filp);
+ struct debugfs_fsdata *fsd = F_DENTRY(file)->d_fsdata;
+ const struct file_operations *real_fops = fsd->real_fops;
int r = 0;
/*
@@ -435,7 +424,7 @@ static int full_proxy_release(struct inode *inode, struct file *filp)
* ->i_private is still being meaningful here.
*/
if (real_fops->release)
- r = real_fops->release(inode, filp);
+ r = real_fops->release(inode, file);
fops_put(real_fops);
return r;
@@ -517,9 +506,9 @@ static int full_proxy_open_short(struct inode *inode, struct file *filp)
const struct file_operations debugfs_full_short_proxy_file_operations = {
.open = full_proxy_open_short,
- .llseek = full_proxy_llseek,
- .read = full_proxy_read,
- .write = full_proxy_write,
+ .llseek = short_proxy_llseek,
+ .read = short_proxy_read,
+ .write = short_proxy_write,
};
ssize_t debugfs_attr_read(struct file *file, char __user *buf,
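
The FULL_PROXY_FUNC_BOTH macro, which picked between real_fops and short_fops at run time on every call, is replaced by separate SHORT_PROXY_FUNC and FULL_PROXY_FUNC instantiations, and the short-fops file_operations table now points straight at the short_proxy_* variants. Roughly what SHORT_PROXY_FUNC(llseek, ...) above expands to:

static loff_t short_proxy_llseek(struct file *filp, loff_t offset, int whence)
{
	struct dentry *dentry = F_DENTRY(filp);
	struct debugfs_fsdata *fsd = dentry->d_fsdata;
	loff_t r;

	if (!(fsd->methods & HAS_LSEEK))
		return -ESPIPE;
	r = debugfs_file_get(dentry);
	if (unlikely(r))
		return r;
	r = fsd->short_fops->llseek(filp, offset, whence);
	debugfs_file_put(dentry);
	return r;
}
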
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 30c4944e1862..a0357b0cf362 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -258,7 +258,6 @@ static struct vfsmount *debugfs_automount(struct path *path)
}
static const struct dentry_operations debugfs_dops = {
- .d_delete = always_delete_dentry,
.d_release = debugfs_release_dentry,
.d_automount = debugfs_automount,
};
@@ -273,7 +272,8 @@ static int debugfs_fill_super(struct super_block *sb, struct fs_context *fc)
return err;
sb->s_op = &debugfs_super_operations;
- sb->s_d_op = &debugfs_dops;
+ set_default_d_op(sb, &debugfs_dops);
+ sb->s_d_flags |= DCACHE_DONTCACHE;
debugfs_apply_options(sb);
@@ -384,27 +384,12 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
if (!parent)
parent = debugfs_mount->mnt_root;
- inode_lock(d_inode(parent));
- if (unlikely(IS_DEADDIR(d_inode(parent))))
- dentry = ERR_PTR(-ENOENT);
- else
- dentry = lookup_noperm(&QSTR(name), parent);
- if (!IS_ERR(dentry) && d_really_is_positive(dentry)) {
- if (d_is_dir(dentry))
- pr_err("Directory '%s' with parent '%s' already present!\n",
- name, parent->d_name.name);
- else
- pr_err("File '%s' in directory '%s' already present!\n",
- name, parent->d_name.name);
- dput(dentry);
- dentry = ERR_PTR(-EEXIST);
- }
-
+ dentry = simple_start_creating(parent, name);
if (IS_ERR(dentry)) {
- inode_unlock(d_inode(parent));
+ if (dentry == ERR_PTR(-EEXIST))
+ pr_err("'%s' already exists in '%pd'\n", name, parent);
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
-
return dentry;
}
@@ -459,7 +444,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
proxy_fops = &debugfs_noop_file_operations;
inode->i_fop = proxy_fops;
DEBUGFS_I(inode)->raw = real_fops;
- DEBUGFS_I(inode)->aux = aux;
+ DEBUGFS_I(inode)->aux = (void *)aux;
d_instantiate(dentry, inode);
fsnotify_create(d_inode(dentry->d_parent), dentry);
diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h
index 93483fe84425..427987f81571 100644
--- a/fs/debugfs/internal.h
+++ b/fs/debugfs/internal.h
@@ -19,7 +19,7 @@ struct debugfs_inode_info {
const struct debugfs_short_fops *short_fops;
debugfs_automount_t automount;
};
- const void *aux;
+ void *aux;
};
static inline struct debugfs_inode_info *DEBUGFS_I(struct inode *inode)
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 9c20d78e41f6..fdf22264a8e9 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -381,7 +381,7 @@ static int devpts_fill_super(struct super_block *s, struct fs_context *fc)
s->s_blocksize_bits = 10;
s->s_magic = DEVPTS_SUPER_MAGIC;
s->s_op = &devpts_sops;
- s->s_d_op = &simple_dentry_operations;
+ s->s_d_flags = DCACHE_DONTCACHE;
s->s_time_gran = 1;
fsi->sb = s;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index bbd05f1a2145..2267f5ae7f77 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -996,7 +996,7 @@ do_holes:
dio_unpin_page(dio, page);
goto out;
}
- zero_user(page, from, 1 << blkbits);
+ memzero_page(page, from, 1 << blkbits);
sdio->block_in_file++;
from += 1 << blkbits;
dio->result += 1 << blkbits;
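
memzero_page() is the modern spelling of zero_user(); both zero a byte range of a (possibly highmem) page. Its effect is roughly the open-coded form below:

static inline void memzero_page_equiv(struct page *page, size_t offset,
				      size_t len)
{
	char *kaddr = kmap_local_page(page);

	memset(kaddr + offset, 0, len);
	flush_dcache_page(page);	/* keep the D-cache coherent */
	kunmap_local(kaddr);
}
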
@@ -1083,8 +1083,8 @@ static inline int drop_refcount(struct dio *dio)
* The locking rules are governed by the flags parameter:
* - if the flags value contains DIO_LOCKING we use a fancy locking
* scheme for dumb filesystems.
- * For writes this function is called under i_mutex and returns with
- * i_mutex held, for reads, i_mutex is not held on entry, but it is
+ * For writes this function is called under i_rwsem and returns with
+ * i_rwsem held, for reads, i_rwsem is not held on entry, but it is
* taken and dropped again before returning.
* - if the flags value does NOT contain DIO_LOCKING we don't use any
* internal locking but rather rely on the filesystem to synchronize
@@ -1094,7 +1094,7 @@ static inline int drop_refcount(struct dio *dio)
* counter before starting direct I/O, and decrement it once we are done.
* Truncate can wait for it to reach zero to provide exclusion. It is
* expected that filesystem provide exclusion between new direct I/O
- * and truncates. For DIO_LOCKING filesystems this is done by i_mutex,
+ * and truncates. For DIO_LOCKING filesystems this is done by i_rwsem,
* but other filesystems need to take care of this on their own.
*
* NOTE: if you pass "sdio" to anything by pointer make sure that function
@@ -1279,7 +1279,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
/*
* All block lookups have been performed. For READ requests
- * we can let i_mutex go now that its achieved its purpose
+ * we can let i_rwsem go now that its achieved its purpose
* of protecting us from looking up uninitialized blocks.
*/
if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING))
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index e01d5f29f4d2..6dd3a524cd35 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -509,7 +509,7 @@ static void add_scan(struct dlm_ls *ls, struct dlm_rsb *r)
void dlm_rsb_scan(struct timer_list *timer)
{
- struct dlm_ls *ls = from_timer(ls, timer, ls_scan_timer);
+ struct dlm_ls *ls = timer_container_of(ls, timer, ls_scan_timer);
int our_nodeid = dlm_our_nodeid();
struct dlm_rsb *r;
int rv;
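
from_timer() has been renamed timer_container_of(); it is container_of() specialised for timer callbacks, recovering the enclosing object from the timer_list pointer. A minimal usage sketch (my_ctx and its fields are hypothetical):

struct my_ctx {
	struct timer_list timer;
	int pending;
};

static void my_timer_fn(struct timer_list *t)
{
	/* Arguments: (var, timer pointer, member name), like container_of(). */
	struct my_ctx *ctx = timer_container_of(ctx, t, timer);

	ctx->pending = 0;
}
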
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index ce0a3c5ed0ca..5f8f96da09fe 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -193,7 +193,7 @@ static int ecryptfs_mmap(struct file *file, struct vm_area_struct *vma)
* natively. If FILESYSTEM_MAX_STACK_DEPTH > 2 or ecryptfs
* allows recursive mounting, this will need to be extended.
*/
- if (!lower_file->f_op->mmap)
+ if (!can_mmap_file(lower_file))
return -ENODEV;
return generic_file_mmap(file, vma);
}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 493d7f194956..72fbe1316ab8 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -635,10 +635,10 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
}
rd.old_mnt_idmap = &nop_mnt_idmap;
- rd.old_dir = d_inode(lower_old_dir_dentry);
+ rd.old_parent = lower_old_dir_dentry;
rd.old_dentry = lower_old_dentry;
rd.new_mnt_idmap = &nop_mnt_idmap;
- rd.new_dir = d_inode(lower_new_dir_dentry);
+ rd.new_parent = lower_new_dir_dentry;
rd.new_dentry = lower_new_dentry;
rc = vfs_rename(&rd);
if (rc)
@@ -1124,13 +1124,13 @@ out:
return rc;
}
-static int ecryptfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+static int ecryptfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
return vfs_fileattr_get(ecryptfs_dentry_to_lower(dentry), fa);
}
static int ecryptfs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
int rc;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 8dd1d7189c3b..eab1beb846d3 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -20,6 +20,7 @@
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/fs_stack.h>
+#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/magic.h>
#include "ecryptfs_kernel.h"
@@ -471,7 +472,7 @@ static int ecryptfs_get_tree(struct fs_context *fc)
sbi = NULL;
s->s_op = &ecryptfs_sops;
s->s_xattr = ecryptfs_xattr_handlers;
- s->s_d_op = &ecryptfs_dops;
+ set_default_d_op(s, &ecryptfs_dops);
err = "Reading sb failed";
rc = kern_path(fc->source, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
@@ -764,7 +765,7 @@ static struct kobject *ecryptfs_kobj;
static ssize_t version_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buff)
{
- return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK);
+ return sysfs_emit(buff, "%d\n", ECRYPTFS_VERSIONING_MASK);
}
static struct kobj_attribute version_attr = __ATTR_RO(version);
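
sysfs_emit() replaces snprintf(buff, PAGE_SIZE, ...) in show methods; it checks that the buffer really is the page sysfs handed in and clamps output to PAGE_SIZE. For show methods that build their output in several steps there is a companion, sketched below with made-up values:

static ssize_t features_show(struct kobject *kobj,
			     struct kobj_attribute *attr, char *buf)
{
	ssize_t len = 0;

	len += sysfs_emit_at(buf, len, "version: %d\n", 2);
	len += sysfs_emit_at(buf, len, "xattr: %s\n", "yes");
	return len;
}
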
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 60f0ac8744b5..2c2b12fedeae 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -228,7 +228,7 @@ out:
/**
* ecryptfs_write_begin
- * @file: The eCryptfs file
+ * @iocb: I/O control block for the eCryptfs file
* @mapping: The eCryptfs object
* @pos: The file offset at which to start writing
* @len: Length of the write
@@ -239,7 +239,7 @@ out:
*
* Returns zero on success; non-zero otherwise
*/
-static int ecryptfs_write_begin(struct file *file,
+static int ecryptfs_write_begin(const struct kiocb *iocb,
struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
@@ -322,7 +322,7 @@ static int ecryptfs_write_begin(struct file *file,
* Note, this will increase i_size. */
if (index != 0) {
if (prev_page_end_size > i_size_read(mapping->host)) {
- rc = ecryptfs_truncate(file->f_path.dentry,
+ rc = ecryptfs_truncate(iocb->ki_filp->f_path.dentry,
prev_page_end_size);
if (rc) {
printk(KERN_ERR "%s: Error on attempt to "
@@ -429,7 +429,7 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)
/**
* ecryptfs_write_end
- * @file: The eCryptfs file object
+ * @iocb: I/O control block for the eCryptfs file
* @mapping: The eCryptfs object
* @pos: The file position
* @len: The length of the data (unused)
@@ -437,7 +437,7 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)
* @folio: The eCryptfs folio
* @fsdata: The fsdata (unused)
*/
-static int ecryptfs_write_end(struct file *file,
+static int ecryptfs_write_end(const struct kiocb *iocb,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
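
->write_begin()/->write_end() now receive the kiocb rather than the file; an implementation that still needs the file takes it from iocb->ki_filp, as the truncate path above does. A minimal sketch of a write_begin under the new signature (examplefs_* is hypothetical; the folio lookup is one plausible shape, not eCryptfs's actual one):

static int examplefs_write_begin(const struct kiocb *iocb,
				 struct address_space *mapping,
				 loff_t pos, unsigned len,
				 struct folio **foliop, void **fsdata)
{
	struct folio *folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
						  FGP_WRITEBEGIN,
						  mapping_gfp_mask(mapping));

	if (IS_ERR(folio))
		return PTR_ERR(folio);
	*foliop = folio;	/* the file, if needed: iocb->ki_filp */
	return 0;
}
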
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 98a7299a9ee9..2891614abf8d 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -138,7 +138,7 @@ const struct inode_operations efivarfs_dir_inode_operations = {
};
static int
-efivarfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+efivarfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
unsigned int i_flags;
unsigned int flags = 0;
@@ -154,7 +154,7 @@ efivarfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
static int
efivarfs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
unsigned int i_flags = 0;
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index c900d98bf494..c4a139911356 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -183,7 +183,6 @@ static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr)
static const struct dentry_operations efivarfs_d_ops = {
.d_compare = efivarfs_d_compare,
.d_hash = efivarfs_d_hash,
- .d_delete = always_delete_dentry,
};
static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name)
@@ -350,7 +349,8 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = EFIVARFS_MAGIC;
sb->s_op = &efivarfs_ops;
- sb->s_d_op = &efivarfs_d_ops;
+ set_default_d_op(sb, &efivarfs_d_ops);
+ sb->s_d_flags |= DCACHE_DONTCACHE;
sb->s_time_gran = 1;
if (!efivar_supports_writes())
@@ -390,10 +390,16 @@ static int efivarfs_reconfigure(struct fs_context *fc)
return 0;
}
+static void efivarfs_free(struct fs_context *fc)
+{
+ kfree(fc->s_fs_info);
+}
+
static const struct fs_context_operations efivarfs_context_ops = {
.get_tree = efivarfs_get_tree,
.parse_param = efivarfs_parse_param,
.reconfigure = efivarfs_reconfigure,
+ .free = efivarfs_free,
};
static int efivarfs_check_missing(efi_char16_t *name16, efi_guid_t vendor,
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 6beeb7063871..7b26efc271ee 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -147,6 +147,8 @@ config EROFS_FS_ZIP_ZSTD
config EROFS_FS_ZIP_ACCEL
bool "EROFS hardware decompression support"
depends on EROFS_FS_ZIP
+ select CRYPTO
+ select CRYPTO_DEFLATE
help
Saying Y here includes hardware accelerator support for reading
EROFS file systems containing compressed data. It gives better
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 6a329c329f43..3b1ba571c728 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -49,11 +49,18 @@ void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap)
return buf->base + (offset & ~PAGE_MASK);
}
-void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb)
+int erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb,
+ bool in_metabox)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
buf->file = NULL;
+ if (in_metabox) {
+ if (unlikely(!sbi->metabox_inode))
+ return -EFSCORRUPTED;
+ buf->mapping = sbi->metabox_inode->i_mapping;
+ return 0;
+ }
buf->off = sbi->dif0.fsoff;
if (erofs_is_fileio_mode(sbi)) {
buf->file = sbi->dif0.file; /* some fs like FUSE needs it */
@@ -62,13 +69,18 @@ void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb)
buf->mapping = sbi->dif0.fscache->inode->i_mapping;
else
buf->mapping = sb->s_bdev->bd_mapping;
+ return 0;
}
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
- erofs_off_t offset, bool need_kmap)
+ erofs_off_t offset, bool in_metabox)
{
- erofs_init_metabuf(buf, sb);
- return erofs_bread(buf, offset, need_kmap);
+ int err;
+
+ err = erofs_init_metabuf(buf, sb, in_metabox);
+ if (err)
+ return ERR_PTR(err);
+ return erofs_bread(buf, offset, true);
}
int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
@@ -118,7 +130,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
pos = ALIGN(erofs_iloc(inode) + vi->inode_isize +
vi->xattr_isize, unit) + unit * chunknr;
- idx = erofs_read_metabuf(&buf, sb, pos, true);
+ idx = erofs_read_metabuf(&buf, sb, pos, erofs_inode_in_metabox(inode));
if (IS_ERR(idx)) {
err = PTR_ERR(idx);
goto out;
@@ -214,9 +226,11 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
/*
* bit 30: I/O error occurred on this folio
+ * bit 29: CPU has dirty data in D-cache (needs aliasing handling);
* bit 0 - 29: remaining parts to complete this folio
*/
-#define EROFS_ONLINEFOLIO_EIO (1 << 30)
+#define EROFS_ONLINEFOLIO_EIO 30
+#define EROFS_ONLINEFOLIO_DIRTY 29
void erofs_onlinefolio_init(struct folio *folio)
{
@@ -233,19 +247,23 @@ void erofs_onlinefolio_split(struct folio *folio)
atomic_inc((atomic_t *)&folio->private);
}
-void erofs_onlinefolio_end(struct folio *folio, int err)
+void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty)
{
int orig, v;
do {
orig = atomic_read((atomic_t *)&folio->private);
- v = (orig - 1) | (err ? EROFS_ONLINEFOLIO_EIO : 0);
+ DBG_BUGON(orig <= 0);
+ v = dirty << EROFS_ONLINEFOLIO_DIRTY;
+ v |= (orig - 1) | (!!err << EROFS_ONLINEFOLIO_EIO);
} while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig);
- if (v & ~EROFS_ONLINEFOLIO_EIO)
+ if (v & (BIT(EROFS_ONLINEFOLIO_DIRTY) - 1))
return;
folio->private = 0;
- folio_end_read(folio, !(v & EROFS_ONLINEFOLIO_EIO));
+ if (v & BIT(EROFS_ONLINEFOLIO_DIRTY))
+ flush_dcache_folio(folio);
+ folio_end_read(folio, !(v & BIT(EROFS_ONLINEFOLIO_EIO)));
}
static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
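folio->private now packs three fields into a single atomic word: a pending-parts count in bits 0-28, the new D-cache dirty hint in bit 29, and the I/O error latch in bit 30. The standalone C11 model below (not kernel code) mirrors the cmpxchg loop, including how the dirty hint stays sticky because (orig - 1) carries it forward:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define ONLINEFOLIO_EIO   30
#define ONLINEFOLIO_DIRTY 29

static void onlinefolio_end(atomic_int *priv, int err, bool dirty)
{
	int orig, v;

	do {
		orig = atomic_load(priv);
		v = dirty << ONLINEFOLIO_DIRTY;	/* raise the dirty hint */
		v |= (orig - 1) | (!!err << ONLINEFOLIO_EIO);	/* drop one part, latch EIO */
	} while (!atomic_compare_exchange_strong(priv, &orig, v));

	if (v & ((1 << ONLINEFOLIO_DIRTY) - 1))
		return;		/* parts still outstanding in bits 0-28 */
	if (v & (1 << ONLINEFOLIO_DIRTY))
		puts("flush_dcache_folio()");	/* stand-in for the real flush */
	printf("folio_end_read(uptodate=%d)\n", !(v & (1 << ONLINEFOLIO_EIO)));
}

int main(void)
{
	atomic_int priv = 2;	/* two submitted parts */

	onlinefolio_end(&priv, 0, true);	/* first part completes */
	onlinefolio_end(&priv, 0, false);	/* last part: flush, then end read */
	return 0;
}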
@@ -258,51 +276,51 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
map.m_la = offset;
map.m_llen = length;
-
ret = erofs_map_blocks(inode, &map);
if (ret < 0)
return ret;
- mdev = (struct erofs_map_dev) {
- .m_deviceid = map.m_deviceid,
- .m_pa = map.m_pa,
- };
- ret = erofs_map_dev(sb, &mdev);
- if (ret)
- return ret;
-
iomap->offset = map.m_la;
- if (flags & IOMAP_DAX)
- iomap->dax_dev = mdev.m_dif->dax_dev;
- else
- iomap->bdev = mdev.m_bdev;
iomap->length = map.m_llen;
iomap->flags = 0;
iomap->private = NULL;
-
+ iomap->addr = IOMAP_NULL_ADDR;
if (!(map.m_flags & EROFS_MAP_MAPPED)) {
iomap->type = IOMAP_HOLE;
- iomap->addr = IOMAP_NULL_ADDR;
- if (!iomap->length)
- iomap->length = length;
return 0;
}
+ if (!(map.m_flags & EROFS_MAP_META) || !erofs_inode_in_metabox(inode)) {
+ mdev = (struct erofs_map_dev) {
+ .m_deviceid = map.m_deviceid,
+ .m_pa = map.m_pa,
+ };
+ ret = erofs_map_dev(sb, &mdev);
+ if (ret)
+ return ret;
+
+ if (flags & IOMAP_DAX)
+ iomap->dax_dev = mdev.m_dif->dax_dev;
+ else
+ iomap->bdev = mdev.m_bdev;
+ iomap->addr = mdev.m_dif->fsoff + mdev.m_pa;
+ if (flags & IOMAP_DAX)
+ iomap->addr += mdev.m_dif->dax_part_off;
+ }
+
if (map.m_flags & EROFS_MAP_META) {
void *ptr;
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
iomap->type = IOMAP_INLINE;
- ptr = erofs_read_metabuf(&buf, sb, mdev.m_pa, true);
+ ptr = erofs_read_metabuf(&buf, sb, map.m_pa,
+ erofs_inode_in_metabox(inode));
if (IS_ERR(ptr))
return PTR_ERR(ptr);
iomap->inline_data = ptr;
iomap->private = buf.base;
} else {
iomap->type = IOMAP_MAPPED;
- iomap->addr = mdev.m_dif->fsoff + mdev.m_pa;
- if (flags & IOMAP_DAX)
- iomap->addr += mdev.m_dif->dax_part_off;
}
return 0;
}
@@ -351,11 +369,16 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
*/
static int erofs_read_folio(struct file *file, struct folio *folio)
{
+ trace_erofs_read_folio(folio, true);
+
return iomap_read_folio(folio, &erofs_iomap_ops);
}
static void erofs_readahead(struct readahead_control *rac)
{
+ trace_erofs_readahead(rac->mapping->host, readahead_index(rac),
+ readahead_count(rac), true);
+
return iomap_readahead(rac, &erofs_iomap_ops);
}
@@ -409,20 +432,20 @@ static const struct vm_operations_struct erofs_dax_vm_ops = {
.huge_fault = erofs_dax_huge_fault,
};
-static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int erofs_file_mmap_prepare(struct vm_area_desc *desc)
{
- if (!IS_DAX(file_inode(file)))
- return generic_file_readonly_mmap(file, vma);
+ if (!IS_DAX(file_inode(desc->file)))
+ return generic_file_readonly_mmap_prepare(desc);
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+ if ((desc->vm_flags & VM_SHARED) && (desc->vm_flags & VM_MAYWRITE))
return -EINVAL;
- vma->vm_ops = &erofs_dax_vm_ops;
- vm_flags_set(vma, VM_HUGEPAGE);
+ desc->vm_ops = &erofs_dax_vm_ops;
+ desc->vm_flags |= VM_HUGEPAGE;
return 0;
}
#else
-#define erofs_file_mmap generic_file_readonly_mmap
+#define erofs_file_mmap_prepare generic_file_readonly_mmap_prepare
#endif
static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence)
@@ -452,7 +475,7 @@ static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence)
const struct file_operations erofs_file_fops = {
.llseek = erofs_file_llseek,
.read_iter = erofs_file_read_iter,
- .mmap = erofs_file_mmap,
+ .mmap_prepare = erofs_file_mmap_prepare,
.get_unmapped_area = thp_get_unmapped_area,
.splice_read = filemap_splice_read,
};
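This is one of several conversions in this series from ->mmap to ->mmap_prepare: the hook runs before the VMA exists and fills a struct vm_area_desc, so vm_flags become plain assignments instead of vm_flags_set() on a live VMA. Restating the pattern as a kernel-context sketch with hypothetical foofs names (foofs_dax_vm_ops is assumed):

static int foofs_file_mmap_prepare(struct vm_area_desc *desc)
{
	if (!IS_DAX(file_inode(desc->file)))
		return generic_file_readonly_mmap_prepare(desc);

	/* writable shared DAX mappings stay unsupported on a read-only fs */
	if ((desc->vm_flags & VM_SHARED) && (desc->vm_flags & VM_MAYWRITE))
		return -EINVAL;

	desc->vm_ops = &foofs_dax_vm_ops;	/* assumed vm_operations_struct */
	desc->vm_flags |= VM_HUGEPAGE;		/* plain assignment, pre-VMA */
	return 0;
}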
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index bf62e2836b60..354762c9723f 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -301,13 +301,11 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
cur = min(cur, rq->outputsize);
if (cur && rq->out[0]) {
kin = kmap_local_page(rq->in[nrpages_in - 1]);
- if (rq->out[0] == rq->in[nrpages_in - 1]) {
+ if (rq->out[0] == rq->in[nrpages_in - 1])
memmove(kin + rq->pageofs_out, kin + pi, cur);
- flush_dcache_page(rq->out[0]);
- } else {
+ else
memcpy_to_page(rq->out[0], rq->pageofs_out,
kin + pi, cur);
- }
kunmap_local(kin);
}
rq->outputsize -= cur;
@@ -325,14 +323,12 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
po = (rq->pageofs_out + cur + pi) & ~PAGE_MASK;
DBG_BUGON(no >= nrpages_out);
cnt = min(insz - pi, PAGE_SIZE - po);
- if (rq->out[no] == rq->in[ni]) {
+ if (rq->out[no] == rq->in[ni])
memmove(kin + po,
kin + rq->pageofs_in + pi, cnt);
- flush_dcache_page(rq->out[no]);
- } else if (rq->out[no]) {
+ else if (rq->out[no])
memcpy_to_page(rq->out[no], po,
kin + rq->pageofs_in + pi, cnt);
- }
pi += cnt;
} while (pi < insz);
kunmap_local(kin);
@@ -471,7 +467,7 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb)
return -EOPNOTSUPP;
}
- erofs_init_metabuf(&buf, sb);
+ (void)erofs_init_metabuf(&buf, sb, false);
offset = EROFS_SUPER_OFFSET + sbi->sb_size;
alg = 0;
for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) {
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
index 2fae209d0274..debf469ad6bd 100644
--- a/fs/erofs/dir.c
+++ b/fs/erofs/dir.c
@@ -34,7 +34,8 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
}
if (!dir_emit(ctx, de_name, de_namelen,
- le64_to_cpu(de->nid), d_type))
+ erofs_nid_to_ino64(EROFS_SB(dir->i_sb),
+ le64_to_cpu(de->nid)), d_type))
return 1;
++de;
ctx->pos += sizeof(struct erofs_dirent);
@@ -47,8 +48,12 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
struct inode *dir = file_inode(f);
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct super_block *sb = dir->i_sb;
+ struct file_ra_state *ra = &f->f_ra;
unsigned long bsz = sb->s_blocksize;
unsigned int ofs = erofs_blkoff(sb, ctx->pos);
+ pgoff_t ra_pages = DIV_ROUND_UP_POW2(
+ EROFS_I_SB(dir)->dir_ra_bytes, PAGE_SIZE);
+ pgoff_t nr_pages = DIV_ROUND_UP_POW2(dir->i_size, PAGE_SIZE);
int err = 0;
bool initial = true;
@@ -58,6 +63,21 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
struct erofs_dirent *de;
unsigned int nameoff, maxsize;
+ if (fatal_signal_pending(current)) {
+ err = -ERESTARTSYS;
+ break;
+ }
+
+ /* readahead blocks to enhance performance for large directories */
+ if (ra_pages) {
+ pgoff_t idx = DIV_ROUND_UP_POW2(ctx->pos, PAGE_SIZE);
+ pgoff_t pages = min(nr_pages - idx, ra_pages);
+
+ if (pages > 1 && !ra_has_index(ra, idx))
+ page_cache_sync_readahead(dir->i_mapping, ra,
+ f, idx, pages);
+ }
+
de = erofs_bread(&buf, dbstart, true);
if (IS_ERR(de)) {
erofs_err(sb, "failed to readdir of logical block %llu of nid %llu",
@@ -88,6 +108,7 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
break;
ctx->pos = dbstart + maxsize;
ofs = 0;
+ cond_resched();
}
erofs_put_metabuf(&buf);
if (EROFS_I(dir)->dot_omitted && ctx->pos == dir->i_size) {
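The readdir loop above batches page-cache reads for large directories: each iteration computes how many pages of the directory remain, caps the window at dir_ra_bytes, and kicks synchronous readahead only when more than one page is left and the window is not already cached. A simplified standalone model of the window arithmetic (values assumed; the kernel additionally checks ra_has_index() first):

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define DIR_RA_BYTES	16384UL		/* default EROFS_DIR_RA_BYTES */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long i_size = 5 * PAGE_SIZE + 123;	/* directory size */
	unsigned long nr_pages = DIV_ROUND_UP(i_size, PAGE_SIZE);
	unsigned long ra_pages = DIV_ROUND_UP(DIR_RA_BYTES, PAGE_SIZE);

	for (unsigned long idx = 0; idx < nr_pages; idx++) {
		unsigned long pages = nr_pages - idx < ra_pages ?
				      nr_pages - idx : ra_pages;

		if (pages > 1)	/* only batch beyond the current block */
			printf("sync readahead: %lu pages from index %lu\n",
			       pages, idx);
	}
	return 0;
}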
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 767fb4acdc93..377ee12b8b96 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -15,6 +15,7 @@
#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004
+#define EROFS_FEATURE_COMPAT_SHARED_EA_IN_METABOX 0x00000008
/*
* Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should
@@ -31,8 +32,9 @@
#define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020
#define EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES 0x00000040
#define EROFS_FEATURE_INCOMPAT_48BIT 0x00000080
+#define EROFS_FEATURE_INCOMPAT_METABOX 0x00000100
#define EROFS_ALL_FEATURE_INCOMPAT \
- ((EROFS_FEATURE_INCOMPAT_48BIT << 1) - 1)
+ ((EROFS_FEATURE_INCOMPAT_METABOX << 1) - 1)
#define EROFS_SB_EXTSLOT_SIZE 16
@@ -46,7 +48,7 @@ struct erofs_deviceslot {
};
#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot)
-/* erofs on-disk super block (currently 128 bytes) */
+/* erofs on-disk super block (currently 144 bytes at maximum) */
struct erofs_super_block {
__le32 magic; /* file system magic number */
__le32 checksum; /* crc32c to avoid unexpected on-disk overlap */
@@ -82,7 +84,9 @@ struct erofs_super_block {
__u8 reserved[3];
__le32 build_time; /* seconds added to epoch for mkfs time */
__le64 rootnid_8b; /* (48BIT on) nid of root directory */
- __u8 reserved2[8];
+ __le64 reserved2;
+ __le64 metabox_nid; /* (METABOX on) nid of the metabox inode */
+ __le64 reserved3; /* [align to extslot 1] */
};
/*
@@ -267,6 +271,9 @@ struct erofs_inode_chunk_index {
__le32 startblk_lo; /* starting block number of this chunk */
};
+#define EROFS_DIRENT_NID_METABOX_BIT 63
+#define EROFS_DIRENT_NID_MASK (BIT_ULL(EROFS_DIRENT_NID_METABOX_BIT) - 1)
+
/* dirent sorts in alphabet order, thus we can do binary search */
struct erofs_dirent {
__le64 nid; /* node number */
@@ -434,7 +441,7 @@ static inline void erofs_check_ondisk_layout_definitions(void)
.h_clusterbits = 1 << Z_EROFS_FRAGMENT_INODE_BIT
};
- BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128);
+ BUILD_BUG_ON(sizeof(struct erofs_super_block) != 144);
BUILD_BUG_ON(sizeof(struct erofs_inode_compact) != 32);
BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64);
BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12);
diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c
index 7d81f504bff0..b7b3432a9882 100644
--- a/fs/erofs/fileio.c
+++ b/fs/erofs/fileio.c
@@ -38,7 +38,7 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret)
} else {
bio_for_each_folio_all(fi, &rq->bio) {
DBG_BUGON(folio_test_uptodate(fi.folio));
- erofs_onlinefolio_end(fi.folio, ret);
+ erofs_onlinefolio_end(fi.folio, ret, false);
}
}
bio_uninit(&rq->bio);
@@ -47,6 +47,7 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret)
static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq)
{
+ const struct cred *old_cred;
struct iov_iter iter;
int ret;
@@ -60,7 +61,9 @@ static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq)
rq->iocb.ki_flags = IOCB_DIRECT;
iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt,
rq->bio.bi_iter.bi_size);
+ old_cred = override_creds(rq->iocb.ki_filp->f_cred);
ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter);
+ revert_creds(old_cred);
if (ret != -EIOCBQUEUED)
erofs_fileio_ki_complete(&rq->iocb, ret);
}
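The override_creds()/revert_creds() pair added above makes the backing-file read be permission-checked against the credentials the file was opened with, rather than those of whichever task happens to drive the I/O. Reduced to a kernel-context fragment (file, iocb, iter and ret are assumed to be in scope):

	const struct cred *old_cred;

	old_cred = override_creds(file->f_cred);	/* opener's credentials */
	ret = vfs_iocb_iter_read(file, iocb, &iter);
	revert_creds(old_cred);				/* restore current creds */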
@@ -93,8 +96,6 @@ static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio)
struct erofs_map_blocks *map = &io->map;
unsigned int cur = 0, end = folio_size(folio), len, attached = 0;
loff_t pos = folio_pos(folio), ofs;
- struct iov_iter iter;
- struct bio_vec bv;
int err = 0;
erofs_onlinefolio_init(folio);
@@ -114,18 +115,12 @@ static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio)
void *src;
src = erofs_read_metabuf(&buf, inode->i_sb,
- map->m_pa + ofs, true);
+ map->m_pa + ofs, erofs_inode_in_metabox(inode));
if (IS_ERR(src)) {
err = PTR_ERR(src);
break;
}
- bvec_set_folio(&bv, folio, len, cur);
- iov_iter_bvec(&iter, ITER_DEST, &bv, 1, len);
- if (copy_to_iter(src, len, &iter) != len) {
- erofs_put_metabuf(&buf);
- err = -EIO;
- break;
- }
+ memcpy_to_folio(folio, cur, src, len);
erofs_put_metabuf(&buf);
} else if (!(map->m_flags & EROFS_MAP_MAPPED)) {
folio_zero_segment(folio, cur, cur + len);
@@ -159,7 +154,7 @@ io_retry:
}
cur += len;
}
- erofs_onlinefolio_end(folio, err);
+ erofs_onlinefolio_end(folio, err, false);
return err;
}
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index 9c9129bca346..362acf828279 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -102,8 +102,7 @@ static void erofs_fscache_req_io_put(struct erofs_fscache_io *io)
erofs_fscache_req_put(req);
}
-static void erofs_fscache_req_end_io(void *priv,
- ssize_t transferred_or_error, bool was_async)
+static void erofs_fscache_req_end_io(void *priv, ssize_t transferred_or_error)
{
struct erofs_fscache_io *io = priv;
struct erofs_fscache_rq *req = io->private;
@@ -180,8 +179,7 @@ struct erofs_fscache_bio {
struct bio_vec bvecs[BIO_MAX_VECS];
};
-static void erofs_fscache_bio_endio(void *priv,
- ssize_t transferred_or_error, bool was_async)
+static void erofs_fscache_bio_endio(void *priv, ssize_t transferred_or_error)
{
struct erofs_fscache_bio *io = priv;
@@ -276,7 +274,8 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req)
size_t size = map.m_llen;
void *src;
- src = erofs_read_metabuf(&buf, sb, map.m_pa, true);
+ src = erofs_read_metabuf(&buf, sb, map.m_pa,
+ erofs_inode_in_metabox(inode));
if (IS_ERR(src))
return PTR_ERR(src);
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index a0ae0b4f7b01..9a2f59721522 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -29,6 +29,7 @@ static int erofs_read_inode(struct inode *inode)
struct super_block *sb = inode->i_sb;
erofs_blk_t blkaddr = erofs_blknr(sb, erofs_iloc(inode));
unsigned int ofs = erofs_blkoff(sb, erofs_iloc(inode));
+ bool in_mbox = erofs_inode_in_metabox(inode);
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_sb_info *sbi = EROFS_SB(sb);
erofs_blk_t addrmask = BIT_ULL(48) - 1;
@@ -39,10 +40,10 @@ static int erofs_read_inode(struct inode *inode)
void *ptr;
int err = 0;
- ptr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), true);
+ ptr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), in_mbox);
if (IS_ERR(ptr)) {
err = PTR_ERR(ptr);
- erofs_err(sb, "failed to get inode (nid: %llu) page, err %d",
+ erofs_err(sb, "failed to read inode meta block (nid: %llu): %d",
vi->nid, err);
goto err_out;
}
@@ -78,10 +79,10 @@ static int erofs_read_inode(struct inode *inode)
memcpy(&copied, dic, gotten);
ptr = erofs_read_metabuf(&buf, sb,
- erofs_pos(sb, blkaddr + 1), true);
+ erofs_pos(sb, blkaddr + 1), in_mbox);
if (IS_ERR(ptr)) {
err = PTR_ERR(ptr);
- erofs_err(sb, "failed to get inode payload block (nid: %llu), err %d",
+ erofs_err(sb, "failed to read inode payload block (nid: %llu): %d",
vi->nid, err);
goto err_out;
}
@@ -264,13 +265,13 @@ static int erofs_fill_inode(struct inode *inode)
* ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down
* so that it will fit.
*/
-static ino_t erofs_squash_ino(erofs_nid_t nid)
+static ino_t erofs_squash_ino(struct super_block *sb, erofs_nid_t nid)
{
- ino_t ino = (ino_t)nid;
+ u64 ino64 = erofs_nid_to_ino64(EROFS_SB(sb), nid);
if (sizeof(ino_t) < sizeof(erofs_nid_t))
- ino ^= nid >> (sizeof(erofs_nid_t) - sizeof(ino_t)) * 8;
- return ino;
+ ino64 ^= ino64 >> (sizeof(erofs_nid_t) - sizeof(ino_t)) * 8;
+ return (ino_t)ino64;
}
static int erofs_iget5_eq(struct inode *inode, void *opaque)
@@ -282,7 +283,7 @@ static int erofs_iget5_set(struct inode *inode, void *opaque)
{
const erofs_nid_t nid = *(erofs_nid_t *)opaque;
- inode->i_ino = erofs_squash_ino(nid);
+ inode->i_ino = erofs_squash_ino(inode->i_sb, nid);
EROFS_I(inode)->nid = nid;
return 0;
}
@@ -291,7 +292,7 @@ struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid)
{
struct inode *inode;
- inode = iget5_locked(sb, erofs_squash_ino(nid), erofs_iget5_eq,
+ inode = iget5_locked(sb, erofs_squash_ino(sb, nid), erofs_iget5_eq,
erofs_iget5_set, &nid);
if (!inode)
return ERR_PTR(-ENOMEM);
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index a32c03a80c70..4ccc5f0ee8df 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -125,6 +125,7 @@ struct erofs_sb_info {
struct erofs_sb_lz4_info lz4;
#endif /* CONFIG_EROFS_FS_ZIP */
struct inode *packed_inode;
+ struct inode *metabox_inode;
struct erofs_dev_context *devs;
u64 total_blocks;
@@ -148,6 +149,7 @@ struct erofs_sb_info {
/* what we really care is nid, rather than ino.. */
erofs_nid_t root_nid;
erofs_nid_t packed_nid;
+ erofs_nid_t metabox_nid;
/* used for statfs, f_files - f_favail */
u64 inos;
@@ -157,6 +159,7 @@ struct erofs_sb_info {
/* sysfs support */
struct kobject s_kobj; /* /sys/fs/erofs/<devname> */
struct completion s_kobj_unregister;
+ erofs_off_t dir_ra_bytes;
/* fscache support */
struct fscache_volume *volume;
@@ -227,8 +230,27 @@ EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS)
EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE)
EROFS_FEATURE_FUNCS(xattr_prefixes, incompat, INCOMPAT_XATTR_PREFIXES)
EROFS_FEATURE_FUNCS(48bit, incompat, INCOMPAT_48BIT)
+EROFS_FEATURE_FUNCS(metabox, incompat, INCOMPAT_METABOX)
EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER)
+EROFS_FEATURE_FUNCS(shared_ea_in_metabox, compat, COMPAT_SHARED_EA_IN_METABOX)
+
+static inline u64 erofs_nid_to_ino64(struct erofs_sb_info *sbi, erofs_nid_t nid)
+{
+ if (!erofs_sb_has_metabox(sbi))
+ return nid;
+
+ /*
+ * When metadata compression is enabled, avoid generating excessively
+ * large inode numbers for metadata-compressed inodes. Shift NIDs in
+ * the 31-62 bit range left by one and move the metabox flag to bit 31.
+ *
+ * Note: on-disk NIDs remain unchanged as they are primarily used for
+ * compatibility with non-LFS 32-bit applications.
+ */
+ return ((nid << 1) & GENMASK_ULL(63, 32)) | (nid & GENMASK(30, 0)) |
+ ((nid >> EROFS_DIRENT_NID_METABOX_BIT) << 31);
+}
/* atomic flag definitions */
#define EROFS_I_EA_INITED_BIT 0
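The remapping above is pure bit surgery: bits 31-62 of the NID shift up by one, the metabox flag drops from bit 63 down to bit 31, and bits 0-30 pass through unchanged, so non-metabox inode numbers stay small. A standalone check of just the remap formula (the early return for filesystems without the metabox feature is omitted):

#include <inttypes.h>
#include <stdio.h>

#define METABOX_BIT		63
#define GENMASK_ULL(h, l)	((~0ULL << (l)) & (~0ULL >> (63 - (h))))

static uint64_t nid_to_ino64(uint64_t nid)
{
	return ((nid << 1) & GENMASK_ULL(63, 32)) |
	       (nid & GENMASK_ULL(30, 0)) |
	       ((nid >> METABOX_BIT) << 31);
}

int main(void)
{
	/* metabox NID: flag bit 63 moves to bit 31 -> 0x80001234 */
	printf("%#" PRIx64 "\n", nid_to_ino64((1ULL << 63) | 0x1234));
	/* small plain NID maps to itself -> 0x1234 */
	printf("%#" PRIx64 "\n", nid_to_ino64(0x1234));
	return 0;
}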
@@ -238,6 +260,9 @@ EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER)
#define EROFS_I_BL_XATTR_BIT (BITS_PER_LONG - 1)
#define EROFS_I_BL_Z_BIT (BITS_PER_LONG - 2)
+/* default readahead size of directories */
+#define EROFS_DIR_RA_BYTES 16384
+
struct erofs_inode {
erofs_nid_t nid;
@@ -279,12 +304,20 @@ struct erofs_inode {
#define EROFS_I(ptr) container_of(ptr, struct erofs_inode, vfs_inode)
+static inline bool erofs_inode_in_metabox(struct inode *inode)
+{
+ return EROFS_I(inode)->nid & BIT_ULL(EROFS_DIRENT_NID_METABOX_BIT);
+}
+
static inline erofs_off_t erofs_iloc(struct inode *inode)
{
struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+ erofs_nid_t nid_lo = EROFS_I(inode)->nid & EROFS_DIRENT_NID_MASK;
+ if (erofs_inode_in_metabox(inode))
+ return nid_lo << sbi->islotbits;
return erofs_pos(inode->i_sb, sbi->meta_blkaddr) +
- (EROFS_I(inode)->nid << sbi->islotbits);
+ (nid_lo << sbi->islotbits);
}
static inline unsigned int erofs_inode_version(unsigned int ifmt)
@@ -315,10 +348,12 @@ static inline struct folio *erofs_grab_folio_nowait(struct address_space *as,
/* The length of extent is full */
#define EROFS_MAP_FULL_MAPPED 0x0008
/* Located in the special packed inode */
-#define EROFS_MAP_FRAGMENT 0x0010
+#define __EROFS_MAP_FRAGMENT 0x0010
/* The extent refers to partial decompressed data */
#define EROFS_MAP_PARTIAL_REF 0x0020
+#define EROFS_MAP_FRAGMENT (EROFS_MAP_MAPPED | __EROFS_MAP_FRAGMENT)
+
struct erofs_map_blocks {
struct erofs_buf buf;
@@ -381,16 +416,17 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
void erofs_unmap_metabuf(struct erofs_buf *buf);
void erofs_put_metabuf(struct erofs_buf *buf);
void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap);
-void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb);
+int erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb,
+ bool in_metabox);
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
- erofs_off_t offset, bool need_kmap);
+ erofs_off_t offset, bool in_metabox);
int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map);
void erofs_onlinefolio_init(struct folio *folio);
void erofs_onlinefolio_split(struct folio *folio);
-void erofs_onlinefolio_end(struct folio *folio, int err);
+void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty);
struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid);
int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask,
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index e1e9f06e8342..e1020aa60771 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -141,7 +141,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
struct erofs_deviceslot *dis;
struct file *file;
- dis = erofs_read_metabuf(buf, sb, *pos, true);
+ dis = erofs_read_metabuf(buf, sb, *pos, false);
if (IS_ERR(dis))
return PTR_ERR(dis);
@@ -258,7 +258,7 @@ static int erofs_read_superblock(struct super_block *sb)
void *data;
int ret;
- data = erofs_read_metabuf(&buf, sb, 0, true);
+ data = erofs_read_metabuf(&buf, sb, 0, false);
if (IS_ERR(data)) {
erofs_err(sb, "cannot read erofs superblock");
return PTR_ERR(data);
@@ -319,6 +319,14 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->root_nid = le16_to_cpu(dsb->rb.rootnid_2b);
}
sbi->packed_nid = le64_to_cpu(dsb->packed_nid);
+ if (erofs_sb_has_metabox(sbi)) {
+ if (sbi->sb_size <= offsetof(struct erofs_super_block,
+ metabox_nid))
+ return -EFSCORRUPTED;
+ sbi->metabox_nid = le64_to_cpu(dsb->metabox_nid);
+ if (sbi->metabox_nid & BIT_ULL(EROFS_DIRENT_NID_METABOX_BIT))
+ return -EFSCORRUPTED; /* self-loop detection */
+ }
sbi->inos = le64_to_cpu(dsb->inos);
sbi->epoch = (s64)le64_to_cpu(dsb->epoch);
@@ -335,6 +343,8 @@ static int erofs_read_superblock(struct super_block *sb)
if (erofs_sb_has_48bit(sbi))
erofs_info(sb, "EXPERIMENTAL 48-bit layout support in use. Use at your own risk!");
+ if (erofs_sb_has_metabox(sbi))
+ erofs_info(sb, "EXPERIMENTAL metadata compression support in use. Use at your own risk!");
if (erofs_is_fscache_mode(sb))
erofs_info(sb, "[deprecated] fscache-based on-demand read feature in use. Use at your own risk!");
out:
@@ -690,6 +700,12 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
return PTR_ERR(inode);
sbi->packed_inode = inode;
}
+ if (erofs_sb_has_metabox(sbi)) {
+ inode = erofs_iget(sb, sbi->metabox_nid);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ sbi->metabox_inode = inode;
+ }
inode = erofs_iget(sb, sbi->root_nid);
if (IS_ERR(inode))
@@ -715,6 +731,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
if (err)
return err;
+ sbi->dir_ra_bytes = EROFS_DIR_RA_BYTES;
erofs_info(sb, "mounted with root inode @ nid %llu.", sbi->root_nid);
return 0;
}
@@ -845,6 +862,8 @@ static void erofs_drop_internal_inodes(struct erofs_sb_info *sbi)
{
iput(sbi->packed_inode);
sbi->packed_inode = NULL;
+ iput(sbi->metabox_inode);
+ sbi->metabox_inode = NULL;
#ifdef CONFIG_EROFS_FS_ZIP
iput(sbi->managed_cache);
sbi->managed_cache = NULL;
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
index eed8797a193f..1e0658a1d95b 100644
--- a/fs/erofs/sysfs.c
+++ b/fs/erofs/sysfs.c
@@ -65,12 +65,14 @@ EROFS_ATTR_FUNC(drop_caches, 0200);
#ifdef CONFIG_EROFS_FS_ZIP_ACCEL
EROFS_ATTR_FUNC(accel, 0644);
#endif
+EROFS_ATTR_RW_UI(dir_ra_bytes, erofs_sb_info);
static struct attribute *erofs_sb_attrs[] = {
#ifdef CONFIG_EROFS_FS_ZIP
ATTR_LIST(sync_decompress),
ATTR_LIST(drop_caches),
#endif
+ ATTR_LIST(dir_ra_bytes),
NULL,
};
ATTRIBUTE_GROUPS(erofs_sb);
@@ -95,6 +97,7 @@ EROFS_ATTR_FEATURE(ztailpacking);
EROFS_ATTR_FEATURE(fragments);
EROFS_ATTR_FEATURE(dedupe);
EROFS_ATTR_FEATURE(48bit);
+EROFS_ATTR_FEATURE(metabox);
static struct attribute *erofs_feat_attrs[] = {
ATTR_LIST(zero_padding),
@@ -108,6 +111,7 @@ static struct attribute *erofs_feat_attrs[] = {
ATTR_LIST(fragments),
ATTR_LIST(dedupe),
ATTR_LIST(48bit),
+ ATTR_LIST(metabox),
NULL,
};
ATTRIBUTE_GROUPS(erofs_feat);
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 9cf84717a92e..eaa9efd766ee 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -72,12 +72,14 @@ static int erofs_init_inode_xattrs(struct inode *inode)
ret = -EFSCORRUPTED;
goto out_unlock; /* xattr ondisk layout error */
}
- ret = -ENOATTR;
+ ret = -ENODATA;
goto out_unlock;
}
it.buf = __EROFS_BUF_INITIALIZER;
- erofs_init_metabuf(&it.buf, sb);
+ ret = erofs_init_metabuf(&it.buf, sb, erofs_inode_in_metabox(inode));
+ if (ret)
+ goto out_unlock;
it.pos = erofs_iloc(inode) + vi->inode_isize;
/* read in shared xattr array (non-atomic, see kmalloc below) */
@@ -266,20 +268,20 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it)
(entry.e_name_index & EROFS_XATTR_LONG_PREFIX_MASK);
if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count)
- return -ENOATTR;
+ return -ENODATA;
if (it->index != pf->prefix->base_index ||
it->name.len != entry.e_name_len + pf->infix_len)
- return -ENOATTR;
+ return -ENODATA;
if (memcmp(it->name.name, pf->prefix->infix, pf->infix_len))
- return -ENOATTR;
+ return -ENODATA;
it->infix_len = pf->infix_len;
} else {
if (it->index != entry.e_name_index ||
it->name.len != entry.e_name_len)
- return -ENOATTR;
+ return -ENODATA;
it->infix_len = 0;
}
@@ -295,7 +297,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it)
entry.e_name_len - processed);
if (memcmp(it->name.name + it->infix_len + processed,
it->kaddr, slice))
- return -ENOATTR;
+ return -ENODATA;
it->pos += slice;
}
@@ -323,9 +325,12 @@ static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it,
sizeof(u32) * vi->xattr_shared_count;
if (xattr_header_sz >= vi->xattr_isize) {
DBG_BUGON(xattr_header_sz > vi->xattr_isize);
- return -ENOATTR;
+ return -ENODATA;
}
+ ret = erofs_init_metabuf(&it->buf, it->sb, erofs_inode_in_metabox(inode));
+ if (ret)
+ return ret;
remaining = vi->xattr_isize - xattr_header_sz;
it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz;
@@ -347,7 +352,7 @@ static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it,
ret = erofs_getxattr_foreach(it);
else
ret = erofs_listxattr_foreach(it);
- if ((getxattr && ret != -ENOATTR) || (!getxattr && ret))
+ if ((getxattr && ret != -ENODATA) || (!getxattr && ret))
break;
it->pos = next_pos;
@@ -361,12 +366,17 @@ static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it,
struct erofs_inode *const vi = EROFS_I(inode);
struct super_block *const sb = it->sb;
struct erofs_sb_info *sbi = EROFS_SB(sb);
- unsigned int i;
- int ret = -ENOATTR;
+ unsigned int i = 0;
+ int ret;
- for (i = 0; i < vi->xattr_shared_count; ++i) {
+ ret = erofs_init_metabuf(&it->buf, sb,
+ erofs_sb_has_shared_ea_in_metabox(sbi));
+ if (ret)
+ return ret;
+
+ while (i < vi->xattr_shared_count) {
it->pos = erofs_pos(sb, sbi->xattr_blkaddr) +
- vi->xattr_shared_xattrs[i] * sizeof(__le32);
+ vi->xattr_shared_xattrs[i++] * sizeof(__le32);
it->kaddr = erofs_bread(&it->buf, it->pos, true);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
@@ -375,10 +385,10 @@ static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it,
ret = erofs_getxattr_foreach(it);
else
ret = erofs_listxattr_foreach(it);
- if ((getxattr && ret != -ENOATTR) || (!getxattr && ret))
+ if ((getxattr && ret != -ENODATA) || (!getxattr && ret))
break;
}
- return ret;
+ return i ? ret : -ENODATA;
}
int erofs_getxattr(struct inode *inode, int index, const char *name,
@@ -403,7 +413,7 @@ int erofs_getxattr(struct inode *inode, int index, const char *name,
EROFS_XATTR_FILTER_SEED + index);
hashbit &= EROFS_XATTR_FILTER_BITS - 1;
if (vi->xattr_name_filter & (1U << hashbit))
- return -ENOATTR;
+ return -ENODATA;
}
it.index = index;
@@ -413,13 +423,12 @@ int erofs_getxattr(struct inode *inode, int index, const char *name,
it.sb = inode->i_sb;
it.buf = __EROFS_BUF_INITIALIZER;
- erofs_init_metabuf(&it.buf, it.sb);
it.buffer = buffer;
it.buffer_size = buffer_size;
it.buffer_ofs = 0;
ret = erofs_xattr_iter_inline(&it, inode, true);
- if (ret == -ENOATTR)
+ if (ret == -ENODATA)
ret = erofs_xattr_iter_shared(&it, inode, true);
erofs_put_metabuf(&it.buf);
return ret ? ret : it.buffer_ofs;
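With the local ENOATTR alias gone, -ENODATA uniformly means "attribute not found", which keeps the two-stage lookup readable: the inline xattr area is scanned first and the shared array is consulted only when the inline pass comes up empty. Condensed from erofs_getxattr() above, as a kernel-context fragment:

	ret = erofs_xattr_iter_inline(&it, inode, true);
	if (ret == -ENODATA)				/* not among inline xattrs */
		ret = erofs_xattr_iter_shared(&it, inode, true);
	erofs_put_metabuf(&it.buf);
	return ret ? ret : it.buffer_ofs;		/* bytes copied on success */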
@@ -432,23 +441,22 @@ ssize_t erofs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
struct inode *inode = d_inode(dentry);
ret = erofs_init_inode_xattrs(inode);
- if (ret == -ENOATTR)
+ if (ret == -ENODATA)
return 0;
if (ret)
return ret;
it.sb = dentry->d_sb;
it.buf = __EROFS_BUF_INITIALIZER;
- erofs_init_metabuf(&it.buf, it.sb);
it.dentry = dentry;
it.buffer = buffer;
it.buffer_size = buffer_size;
it.buffer_ofs = 0;
ret = erofs_xattr_iter_inline(&it, inode, false);
- if (!ret || ret == -ENOATTR)
+ if (!ret || ret == -ENODATA)
ret = erofs_xattr_iter_shared(&it, inode, false);
- if (ret == -ENOATTR)
+ if (ret == -ENODATA)
ret = 0;
erofs_put_metabuf(&it.buf);
return ret ? ret : it.buffer_ofs;
@@ -485,7 +493,7 @@ int erofs_xattr_prefixes_init(struct super_block *sb)
if (sbi->packed_inode)
buf.mapping = sbi->packed_inode->i_mapping;
else
- erofs_init_metabuf(&buf, sb);
+ (void)erofs_init_metabuf(&buf, sb, false);
for (i = 0; i < sbi->xattr_prefix_count; i++) {
void *ptr = erofs_read_metadata(sb, &buf, &pos, &len);
@@ -539,7 +547,7 @@ struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu)
rc = erofs_getxattr(inode, prefix, "", value, rc);
}
- if (rc == -ENOATTR)
+ if (rc == -ENODATA)
acl = NULL;
else if (rc < 0)
acl = ERR_PTR(rc);
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
index b246cd0e135e..6317caa8413e 100644
--- a/fs/erofs/xattr.h
+++ b/fs/erofs/xattr.h
@@ -10,9 +10,6 @@
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>
-/* Attribute not found */
-#define ENOATTR ENODATA
-
#ifdef CONFIG_EROFS_FS_XATTR
extern const struct xattr_handler erofs_xattr_user_handler;
extern const struct xattr_handler erofs_xattr_trusted_handler;
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index fe8071844724..792f20888a8f 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -805,6 +805,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe)
struct erofs_map_blocks *map = &fe->map;
struct super_block *sb = fe->inode->i_sb;
struct z_erofs_pcluster *pcl = NULL;
+ void *ptr;
int ret;
DBG_BUGON(fe->pcl);
@@ -854,15 +855,17 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe)
/* bind cache first when cached decompression is preferred */
z_erofs_bind_cache(fe);
} else {
- void *mptr;
-
- mptr = erofs_read_metabuf(&map->buf, sb, map->m_pa, false);
- if (IS_ERR(mptr)) {
- ret = PTR_ERR(mptr);
- erofs_err(sb, "failed to get inline data %d", ret);
+ ret = erofs_init_metabuf(&map->buf, sb,
+ erofs_inode_in_metabox(fe->inode));
+ if (ret)
+ return ret;
+ ptr = erofs_bread(&map->buf, map->m_pa, false);
+ if (IS_ERR(ptr)) {
+ ret = PTR_ERR(ptr);
+ erofs_err(sb, "failed to get inline folio %d", ret);
return ret;
}
- get_page(map->buf.page);
+ folio_get(page_folio(map->buf.page));
WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page);
fe->pcl->pageofs_in = map->m_pa & ~PAGE_MASK;
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
@@ -1034,7 +1037,7 @@ static int z_erofs_scan_folio(struct z_erofs_frontend *f,
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
folio_zero_segment(folio, cur, end);
tight = false;
- } else if (map->m_flags & EROFS_MAP_FRAGMENT) {
+ } else if (map->m_flags & __EROFS_MAP_FRAGMENT) {
erofs_off_t fpos = offset + cur - map->m_la;
err = z_erofs_read_fragment(inode->i_sb, folio, cur,
@@ -1091,7 +1094,7 @@ static int z_erofs_scan_folio(struct z_erofs_frontend *f,
tight = (bs == PAGE_SIZE);
}
} while ((end = cur) > 0);
- erofs_onlinefolio_end(folio, err);
+ erofs_onlinefolio_end(folio, err, false);
return err;
}
@@ -1196,7 +1199,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_backend *be, int err)
cur += len;
}
kunmap_local(dst);
- erofs_onlinefolio_end(page_folio(bvi->bvec.page), err);
+ erofs_onlinefolio_end(page_folio(bvi->bvec.page), err, true);
list_del(p);
kfree(bvi);
}
@@ -1325,9 +1328,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)
/* must handle all compressed pages before actual file pages */
if (pcl->from_meta) {
- page = pcl->compressed_bvecs[0].page;
+ folio_put(page_folio(pcl->compressed_bvecs[0].page));
WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL);
- put_page(page);
} else {
/* managed folios are still left in compressed_bvecs[] */
for (i = 0; i < pclusterpages; ++i) {
@@ -1355,7 +1357,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)
DBG_BUGON(z_erofs_page_is_invalidated(page));
if (!z_erofs_is_shortlived_page(page)) {
- erofs_onlinefolio_end(page_folio(page), err);
+ erofs_onlinefolio_end(page_folio(page), err, true);
continue;
}
if (pcl->algorithmformat != Z_EROFS_COMPRESSION_LZ4) {
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 14ea47f954f5..a93efd95c555 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -17,7 +17,7 @@ struct z_erofs_maprecorder {
u16 delta[2];
erofs_blk_t pblk, compressedblks;
erofs_off_t nextpackoff;
- bool partialref;
+ bool partialref, in_mbox;
};
static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
@@ -31,7 +31,7 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
struct z_erofs_lcluster_index *di;
unsigned int advise;
- di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, true);
+ di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, m->in_mbox);
if (IS_ERR(di))
return PTR_ERR(di);
m->lcn = lcn;
@@ -146,7 +146,7 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
else
return -EOPNOTSUPP;
- in = erofs_read_metabuf(&m->map->buf, m->inode->i_sb, pos, true);
+ in = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, m->in_mbox);
if (IS_ERR(in))
return PTR_ERR(in);
@@ -240,6 +240,13 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
static int z_erofs_load_lcluster_from_disk(struct z_erofs_maprecorder *m,
unsigned int lcn, bool lookahead)
{
+ if (m->type >= Z_EROFS_LCLUSTER_TYPE_MAX) {
+ erofs_err(m->inode->i_sb, "unknown type %u @ lcn %u of nid %llu",
+ m->type, lcn, EROFS_I(m->inode)->nid);
+ DBG_BUGON(1);
+ return -EOPNOTSUPP;
+ }
+
switch (EROFS_I(m->inode)->datalayout) {
case EROFS_INODE_COMPRESSED_FULL:
return z_erofs_load_full_lcluster(m, lcn);
@@ -265,12 +272,7 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
if (err)
return err;
- if (m->type >= Z_EROFS_LCLUSTER_TYPE_MAX) {
- erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu",
- m->type, lcn, vi->nid);
- DBG_BUGON(1);
- return -EOPNOTSUPP;
- } else if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
+ if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
lookback_distance = m->delta[0];
if (!lookback_distance)
break;
@@ -325,25 +327,18 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
DBG_BUGON(lcn == initial_lcn &&
m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD);
- if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
- if (m->delta[0] != 1) {
- erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
- if (m->compressedblks)
- goto out;
- } else if (m->type < Z_EROFS_LCLUSTER_TYPE_MAX) {
- /*
- * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type
- * rather than CBLKCNT, it's a 1 block-sized pcluster.
- */
- m->compressedblks = 1;
- goto out;
+ if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD && m->delta[0] != 1) {
+ erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
}
- erofs_err(sb, "cannot found CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
+
+ /*
+ * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type rather
+ * than CBLKCNT, it's a 1 block-sized pcluster.
+ */
+ if (m->type != Z_EROFS_LCLUSTER_TYPE_NONHEAD || !m->compressedblks)
+ m->compressedblks = 1;
out:
m->map->m_plen = erofs_pos(sb, m->compressedblks);
return 0;
@@ -379,11 +374,6 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
if (lcn != headlcn)
break; /* ends at the next HEAD lcluster */
m->delta[1] = 1;
- } else {
- erofs_err(inode->i_sb, "unknown type %u @ lcn %llu of nid %llu",
- m->type, lcn, vi->nid);
- DBG_BUGON(1);
- return -EOPNOTSUPP;
}
lcn += m->delta[1];
}
@@ -402,6 +392,7 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
struct z_erofs_maprecorder m = {
.inode = inode,
.map = map,
+ .in_mbox = erofs_inode_in_metabox(inode),
};
int err = 0;
unsigned int endoff, afmt;
@@ -413,8 +404,7 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
!vi->z_tailextent_headlcn) {
map->m_la = 0;
map->m_llen = inode->i_size;
- map->m_flags = EROFS_MAP_MAPPED |
- EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT;
+ map->m_flags = EROFS_MAP_FRAGMENT;
return 0;
}
initial_lcn = ofs >> lclusterbits;
@@ -429,44 +419,33 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED;
end = (m.lcn + 1ULL) << lclusterbits;
- switch (m.type) {
- case Z_EROFS_LCLUSTER_TYPE_PLAIN:
- case Z_EROFS_LCLUSTER_TYPE_HEAD1:
- case Z_EROFS_LCLUSTER_TYPE_HEAD2:
- if (endoff >= m.clusterofs) {
- m.headtype = m.type;
- map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
- /*
- * For ztailpacking files, in order to inline data more
- * effectively, special EOF lclusters are now supported
- * which can have three parts at most.
- */
- if (ztailpacking && end > inode->i_size)
- end = inode->i_size;
- break;
- }
- /* m.lcn should be >= 1 if endoff < m.clusterofs */
- if (!m.lcn) {
- erofs_err(sb, "invalid logical cluster 0 at nid %llu",
- vi->nid);
- err = -EFSCORRUPTED;
- goto unmap_out;
+ if (m.type != Z_EROFS_LCLUSTER_TYPE_NONHEAD && endoff >= m.clusterofs) {
+ m.headtype = m.type;
+ map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
+ /*
+ * For ztailpacking files, in order to inline data more
+ * effectively, special EOF lclusters are now supported
+ * which can have three parts at most.
+ */
+ if (ztailpacking && end > inode->i_size)
+ end = inode->i_size;
+ } else {
+ if (m.type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
+ /* m.lcn should be >= 1 if endoff < m.clusterofs */
+ if (!m.lcn) {
+ erofs_err(sb, "invalid logical cluster 0 at nid %llu",
+ vi->nid);
+ err = -EFSCORRUPTED;
+ goto unmap_out;
+ }
+ end = (m.lcn << lclusterbits) | m.clusterofs;
+ map->m_flags |= EROFS_MAP_FULL_MAPPED;
+ m.delta[0] = 1;
}
- end = (m.lcn << lclusterbits) | m.clusterofs;
- map->m_flags |= EROFS_MAP_FULL_MAPPED;
- m.delta[0] = 1;
- fallthrough;
- case Z_EROFS_LCLUSTER_TYPE_NONHEAD:
/* get the corresponding first chunk */
err = z_erofs_extent_lookback(&m, m.delta[0]);
if (err)
goto unmap_out;
- break;
- default:
- erofs_err(sb, "unknown type %u @ offset %llu of nid %llu",
- m.type, ofs, vi->nid);
- err = -EOPNOTSUPP;
- goto unmap_out;
}
if (m.partialref)
map->m_flags |= EROFS_MAP_PARTIAL_REF;
@@ -489,7 +468,7 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
goto unmap_out;
}
} else if (fragment && m.lcn == vi->z_tailextent_headlcn) {
- map->m_flags |= EROFS_MAP_FRAGMENT;
+ map->m_flags = EROFS_MAP_FRAGMENT;
} else {
map->m_pa = erofs_pos(sb, m.pblk);
err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
@@ -543,6 +522,7 @@ static int z_erofs_map_blocks_ext(struct inode *inode,
unsigned int recsz = z_erofs_extent_recsize(vi->z_advise);
erofs_off_t pos = round_up(Z_EROFS_MAP_HEADER_END(erofs_iloc(inode) +
vi->inode_isize + vi->xattr_isize), recsz);
+ bool in_mbox = erofs_inode_in_metabox(inode);
erofs_off_t lend = inode->i_size;
erofs_off_t l, r, mid, pa, la, lstart;
struct z_erofs_extent *ext;
@@ -552,7 +532,7 @@ static int z_erofs_map_blocks_ext(struct inode *inode,
map->m_flags = 0;
if (recsz <= offsetof(struct z_erofs_extent, pstart_hi)) {
if (recsz <= offsetof(struct z_erofs_extent, pstart_lo)) {
- ext = erofs_read_metabuf(&map->buf, sb, pos, true);
+ ext = erofs_read_metabuf(&map->buf, sb, pos, in_mbox);
if (IS_ERR(ext))
return PTR_ERR(ext);
pa = le64_to_cpu(*(__le64 *)ext);
@@ -565,7 +545,7 @@ static int z_erofs_map_blocks_ext(struct inode *inode,
}
for (; lstart <= map->m_la; lstart += 1 << vi->z_lclusterbits) {
- ext = erofs_read_metabuf(&map->buf, sb, pos, true);
+ ext = erofs_read_metabuf(&map->buf, sb, pos, in_mbox);
if (IS_ERR(ext))
return PTR_ERR(ext);
map->m_plen = le32_to_cpu(ext->plen);
@@ -585,7 +565,7 @@ static int z_erofs_map_blocks_ext(struct inode *inode,
for (l = 0, r = vi->z_extents; l < r; ) {
mid = l + (r - l) / 2;
ext = erofs_read_metabuf(&map->buf, sb,
- pos + mid * recsz, true);
+ pos + mid * recsz, in_mbox);
if (IS_ERR(ext))
return PTR_ERR(ext);
@@ -597,6 +577,10 @@ static int z_erofs_map_blocks_ext(struct inode *inode,
if (la > map->m_la) {
r = mid;
+ if (la > lend) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
lend = la;
} else {
l = mid + 1;
@@ -613,7 +597,7 @@ static int z_erofs_map_blocks_ext(struct inode *inode,
if (lstart < lend) {
map->m_la = lstart;
if (last && (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) {
- map->m_flags |= EROFS_MAP_MAPPED | EROFS_MAP_FRAGMENT;
+ map->m_flags = EROFS_MAP_FRAGMENT;
vi->z_fragmentoff = map->m_plen;
if (recsz > offsetof(struct z_erofs_extent, pstart_lo))
vi->z_fragmentoff |= map->m_pa << 32;
@@ -635,22 +619,15 @@ static int z_erofs_map_blocks_ext(struct inode *inode,
}
}
map->m_llen = lend - map->m_la;
- if (!last && map->m_llen < sb->s_blocksize) {
- erofs_err(sb, "extent too small %llu @ offset %llu of nid %llu",
- map->m_llen, map->m_la, vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
return 0;
}
-static int z_erofs_fill_inode_lazy(struct inode *inode)
+static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map)
{
struct erofs_inode *const vi = EROFS_I(inode);
struct super_block *const sb = inode->i_sb;
int err, headnr;
erofs_off_t pos;
- struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct z_erofs_map_header *h;
if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) {
@@ -670,7 +647,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
goto out_unlock;
pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8);
- h = erofs_read_metabuf(&buf, sb, pos, true);
+ h = erofs_read_metabuf(&map->buf, sb, pos, erofs_inode_in_metabox(inode));
if (IS_ERR(h)) {
err = PTR_ERR(h);
goto out_unlock;
@@ -708,7 +685,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel",
headnr + 1, vi->z_algorithmtype[headnr], vi->nid);
err = -EOPNOTSUPP;
- goto out_put_metabuf;
+ goto out_unlock;
}
if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
@@ -717,7 +694,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu",
vi->nid);
err = -EFSCORRUPTED;
- goto out_put_metabuf;
+ goto out_unlock;
}
if (vi->datalayout == EROFS_INODE_COMPRESSED_COMPACT &&
!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^
@@ -725,27 +702,25 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu",
vi->nid);
err = -EFSCORRUPTED;
- goto out_put_metabuf;
+ goto out_unlock;
}
if (vi->z_idata_size ||
(vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) {
- struct erofs_map_blocks map = {
+ struct erofs_map_blocks tm = {
.buf = __EROFS_BUF_INITIALIZER
};
- err = z_erofs_map_blocks_fo(inode, &map,
+ err = z_erofs_map_blocks_fo(inode, &tm,
EROFS_GET_BLOCKS_FINDTAIL);
- erofs_put_metabuf(&map.buf);
+ erofs_put_metabuf(&tm.buf);
if (err < 0)
- goto out_put_metabuf;
+ goto out_unlock;
}
done:
/* paired with smp_mb() at the beginning of the function */
smp_mb();
set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
-out_put_metabuf:
- erofs_put_metabuf(&buf);
out_unlock:
clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags);
return err;
@@ -763,7 +738,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
map->m_la = inode->i_size;
map->m_flags = 0;
} else {
- err = z_erofs_fill_inode_lazy(inode);
+ err = z_erofs_fill_inode(inode, map);
if (!err) {
if (vi->datalayout == EROFS_INODE_COMPRESSED_FULL &&
(vi->z_advise & Z_EROFS_ADVISE_EXTENTS))
@@ -799,7 +774,7 @@ static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset,
iomap->length = map.m_llen;
if (map.m_flags & EROFS_MAP_MAPPED) {
iomap->type = IOMAP_MAPPED;
- iomap->addr = map.m_flags & EROFS_MAP_FRAGMENT ?
+ iomap->addr = map.m_flags & __EROFS_MAP_FRAGMENT ?
IOMAP_NULL_ADDR : map.m_pa;
} else {
iomap->type = IOMAP_HOLE;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index d4dbffdedd08..b22d6f819f78 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -218,6 +218,7 @@ struct eventpoll {
/* used to optimize loop detection check */
u64 gen;
struct hlist_head refs;
+ u8 loop_check_depth;
/*
* usage count, used together with epitem->dying to
@@ -883,7 +884,7 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
kfree_rcu(epi, rcu);
percpu_counter_dec(&ep->user->epoll_watches);
- return ep_refcount_dec_and_test(ep);
+ return true;
}
/*
@@ -891,14 +892,14 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
*/
static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi)
{
- WARN_ON_ONCE(__ep_remove(ep, epi, false));
+ if (__ep_remove(ep, epi, false))
+ WARN_ON_ONCE(ep_refcount_dec_and_test(ep));
}
static void ep_clear_and_put(struct eventpoll *ep)
{
struct rb_node *rbp, *next;
struct epitem *epi;
- bool dispose;
/* We need to release all tasks waiting for these file */
if (waitqueue_active(&ep->poll_wait))
@@ -931,10 +932,8 @@ static void ep_clear_and_put(struct eventpoll *ep)
cond_resched();
}
- dispose = ep_refcount_dec_and_test(ep);
mutex_unlock(&ep->mtx);
-
- if (dispose)
+ if (ep_refcount_dec_and_test(ep))
ep_free(ep);
}
@@ -1137,7 +1136,7 @@ again:
dispose = __ep_remove(ep, epi, true);
mutex_unlock(&ep->mtx);
- if (dispose)
+ if (dispose && ep_refcount_dec_and_test(ep))
ep_free(ep);
goto again;
}
@@ -2142,23 +2141,24 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
}
/**
- * ep_loop_check_proc - verify that adding an epoll file inside another
- * epoll structure does not violate the constraints, in
- * terms of closed loops, or too deep chains (which can
- * result in excessive stack usage).
+ * ep_loop_check_proc - verify that adding an epoll file @ep inside another
+ * epoll file does not create closed loops, and
+ * determine the depth of the subtree starting at @ep
*
* @ep: the &struct eventpoll to be currently checked.
* @depth: Current depth of the path being checked.
*
- * Return: %zero if adding the epoll @file inside current epoll
- * structure @ep does not violate the constraints, or %-1 otherwise.
+ * Return: depth of the subtree, or INT_MAX if we found a loop or went too deep.
*/
static int ep_loop_check_proc(struct eventpoll *ep, int depth)
{
- int error = 0;
+ int result = 0;
struct rb_node *rbp;
struct epitem *epi;
+ if (ep->gen == loop_check_gen)
+ return ep->loop_check_depth;
+
mutex_lock_nested(&ep->mtx, depth + 1);
ep->gen = loop_check_gen;
for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
@@ -2166,13 +2166,11 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
if (unlikely(is_file_epoll(epi->ffd.file))) {
struct eventpoll *ep_tovisit;
ep_tovisit = epi->ffd.file->private_data;
- if (ep_tovisit->gen == loop_check_gen)
- continue;
if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
- error = -1;
+ result = INT_MAX;
else
- error = ep_loop_check_proc(ep_tovisit, depth + 1);
- if (error != 0)
+ result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1);
+ if (result > EP_MAX_NESTS)
break;
} else {
/*
@@ -2186,9 +2184,25 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
list_file(epi->ffd.file);
}
}
+ ep->loop_check_depth = result;
mutex_unlock(&ep->mtx);
- return error;
+ return result;
+}
+
+/* ep_get_upwards_depth_proc - determine depth of @ep when traversed upwards */
+static int ep_get_upwards_depth_proc(struct eventpoll *ep, int depth)
+{
+ int result = 0;
+ struct epitem *epi;
+
+ if (ep->gen == loop_check_gen)
+ return ep->loop_check_depth;
+ hlist_for_each_entry_rcu(epi, &ep->refs, fllink)
+ result = max(result, ep_get_upwards_depth_proc(epi->ep, depth + 1) + 1);
+ ep->gen = loop_check_gen;
+ ep->loop_check_depth = result;
+ return result;
}
/**
@@ -2204,8 +2218,22 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
*/
static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
{
+ int depth, upwards_depth;
+
inserting_into = ep;
- return ep_loop_check_proc(to, 0);
+ /*
+ * Check how deep down we can get from @to, and whether it is possible
+ * to loop up to @ep.
+ */
+ depth = ep_loop_check_proc(to, 0);
+ if (depth > EP_MAX_NESTS)
+ return -1;
+ /* Check how far up we can go from @ep. */
+ rcu_read_lock();
+ upwards_depth = ep_get_upwards_depth_proc(ep, 0);
+ rcu_read_unlock();
+
+ return (depth+1+upwards_depth > EP_MAX_NESTS) ? -1 : 0;
}
static void clear_tfile_check_list(void)
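The rewritten check memoizes per-ep depths via gen/loop_check_depth and combines two measurements: the deepest epoll chain below the target of the insert and the longest watcher chain above the file being inserted, refusing the insert when the joined path would exceed EP_MAX_NESTS. The acceptance rule in isolation, as a standalone model (EP_MAX_NESTS is 4 upstream):

#include <stdio.h>

#define EP_MAX_NESTS 4

/* down: subtree depth under the epoll being inserted into
 * up:   depth of epoll files already watching the inserted one */
static int loop_check(int down, int up)
{
	if (down > EP_MAX_NESTS)
		return -1;			/* loop detected or too deep */
	return (down + 1 + up > EP_MAX_NESTS) ? -1 : 0;
}

int main(void)
{
	printf("%d\n", loop_check(2, 1));	/* 2+1+1 = 4 -> accepted */
	printf("%d\n", loop_check(2, 2));	/* 2+1+2 = 5 -> rejected */
	return 0;
}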
diff --git a/fs/exec.c b/fs/exec.c
index cfbb2b9ee3c9..2a1e5e4042a1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -78,6 +78,9 @@
#include <trace/events/sched.h>
+/* For vma exec functions. */
+#include "../mm/internal.h"
+
static int bprm_creds_from_file(struct linux_binprm *bprm);
int suid_dumpable = 0;
@@ -111,6 +114,9 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
bool path_noexec(const struct path *path)
{
+ /* If it's an anonymous inode make sure that we catch any shenanigans. */
+ VFS_WARN_ON_ONCE(IS_ANON_FILE(d_inode(path->dentry)) &&
+ !(path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC));
return (path->mnt->mnt_flags & MNT_NOEXEC) ||
(path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
}
@@ -182,60 +188,6 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
flush_cache_page(bprm->vma, pos, page_to_pfn(page));
}
-static int __bprm_mm_init(struct linux_binprm *bprm)
-{
- int err;
- struct vm_area_struct *vma = NULL;
- struct mm_struct *mm = bprm->mm;
-
- bprm->vma = vma = vm_area_alloc(mm);
- if (!vma)
- return -ENOMEM;
- vma_set_anonymous(vma);
-
- if (mmap_write_lock_killable(mm)) {
- err = -EINTR;
- goto err_free;
- }
-
- /*
- * Need to be called with mmap write lock
- * held, to avoid race with ksmd.
- */
- err = ksm_execve(mm);
- if (err)
- goto err_ksm;
-
- /*
- * Place the stack at the largest stack address the architecture
- * supports. Later, we'll move this to an appropriate place. We don't
- * use STACK_TOP because that can depend on attributes which aren't
- * configured yet.
- */
- BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
- vma->vm_end = STACK_TOP_MAX;
- vma->vm_start = vma->vm_end - PAGE_SIZE;
- vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);
- vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
-
- err = insert_vm_struct(mm, vma);
- if (err)
- goto err;
-
- mm->stack_vm = mm->total_vm = 1;
- mmap_write_unlock(mm);
- bprm->p = vma->vm_end - sizeof(void *);
- return 0;
-err:
- ksm_exit(mm);
-err_ksm:
- mmap_write_unlock(mm);
-err_free:
- bprm->vma = NULL;
- vm_area_free(vma);
- return err;
-}
-
static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
return len <= MAX_ARG_STRLEN;
@@ -288,12 +240,6 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
{
}
-static int __bprm_mm_init(struct linux_binprm *bprm)
-{
- bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
- return 0;
-}
-
static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
return len <= bprm->p;
@@ -322,9 +268,13 @@ static int bprm_mm_init(struct linux_binprm *bprm)
bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
task_unlock(current->group_leader);
- err = __bprm_mm_init(bprm);
+#ifndef CONFIG_MMU
+ bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
+#else
+ err = create_init_stack_vma(bprm->mm, &bprm->vma, &bprm->p);
if (err)
goto err;
+#endif
return 0;
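The open-coded stack setup deleted above now lives in mm/ as create_init_stack_vma(), leaving exec.c with only the nommu fallback. Judging from the removed code, the helper is expected to install a one-page stack VMA ending at STACK_TOP_MAX and point bprm->p just below its top; as a kernel-context fragment:

	/* CONFIG_MMU path only */
	err = create_init_stack_vma(bprm->mm, &bprm->vma, &bprm->p);
	if (err)
		goto err;
	/* on success, bprm->p == bprm->vma->vm_end - sizeof(void *) */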
@@ -654,7 +604,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = bprm->vma;
struct vm_area_struct *prev = NULL;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
unsigned long stack_base;
unsigned long stack_size;
unsigned long stack_expand;
@@ -834,13 +784,15 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
if (IS_ERR(file))
return file;
+ if (path_noexec(&file->f_path))
+ return ERR_PTR(-EACCES);
+
/*
* In the past the regular type check was here. It moved to may_open() in
* 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is
* an invariant that all non-regular files error out before we get here.
*/
- if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) ||
- path_noexec(&file->f_path))
+ if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)))
return ERR_PTR(-EACCES);
err = exe_file_deny_write_access(file);
@@ -1563,7 +1515,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
* state is protected by cred_guard_mutex we hold.
*/
n_fs = 1;
- spin_lock(&p->fs->lock);
+ read_seqlock_excl(&p->fs->seq);
rcu_read_lock();
for_other_threads(p, t) {
if (t->fs == p->fs)
@@ -1576,7 +1528,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
bprm->unsafe |= LSM_UNSAFE_SHARE;
else
p->fs->in_exec = 1;
- spin_unlock(&p->fs->lock);
+ read_sequnlock_excl(&p->fs->seq);
}
static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 841a5b18e3df..6b82497572b4 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -532,11 +532,10 @@ int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
return blkdev_issue_flush(inode->i_sb->s_bdev);
}
-static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size)
+static int exfat_extend_valid_size(struct inode *inode, loff_t new_valid_size)
{
int err;
loff_t pos;
- struct inode *inode = file_inode(file);
struct exfat_inode_info *ei = EXFAT_I(inode);
struct address_space *mapping = inode->i_mapping;
const struct address_space_operations *ops = mapping->a_ops;
@@ -551,14 +550,14 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size)
if (pos + len > new_valid_size)
len = new_valid_size - pos;
- err = ops->write_begin(file, mapping, pos, len, &folio, NULL);
+ err = ops->write_begin(NULL, mapping, pos, len, &folio, NULL);
if (err)
goto out;
off = offset_in_folio(folio, pos);
folio_zero_new_buffers(folio, off, off + len);
- err = ops->write_end(file, mapping, pos, len, len, folio, NULL);
+ err = ops->write_end(NULL, mapping, pos, len, len, folio, NULL);
if (err < 0)
goto out;
pos += len;
@@ -604,7 +603,7 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
}
if (pos > valid_size) {
- ret = exfat_extend_valid_size(file, pos);
+ ret = exfat_extend_valid_size(inode, pos);
if (ret < 0 && ret != -ENOSPC) {
exfat_err(inode->i_sb,
"write: fail to zero from %llu to %llu(%zd)",
@@ -665,7 +664,7 @@ static vm_fault_t exfat_page_mkwrite(struct vm_fault *vmf)
start + vma->vm_end - vma->vm_start);
if (ei->valid_size < end) {
- err = exfat_extend_valid_size(file, end);
+ err = exfat_extend_valid_size(inode, end);
if (err < 0) {
inode_unlock(inode);
return vmf_fs_error(err);
@@ -683,13 +682,15 @@ static const struct vm_operations_struct exfat_file_vm_ops = {
.page_mkwrite = exfat_page_mkwrite,
};
-static int exfat_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int exfat_file_mmap_prepare(struct vm_area_desc *desc)
{
- if (unlikely(exfat_forced_shutdown(file_inode(file)->i_sb)))
+ struct file *file = desc->file;
+
+ if (unlikely(exfat_forced_shutdown(file_inode(desc->file)->i_sb)))
return -EIO;
file_accessed(file);
- vma->vm_ops = &exfat_file_vm_ops;
+ desc->vm_ops = &exfat_file_vm_ops;
return 0;
}
@@ -710,7 +711,7 @@ const struct file_operations exfat_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = exfat_compat_ioctl,
#endif
- .mmap = exfat_file_mmap,
+ .mmap_prepare = exfat_file_mmap_prepare,
.fsync = exfat_file_fsync,
.splice_read = exfat_splice_read,
.splice_write = iter_file_splice_write,
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index b22c02d6000f..c10844e1e16c 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -446,9 +446,10 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to)
}
}
-static int exfat_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned int len,
- struct folio **foliop, void **fsdata)
+static int exfat_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned int len,
+ struct folio **foliop, void **fsdata)
{
int ret;
@@ -463,15 +464,16 @@ static int exfat_write_begin(struct file *file, struct address_space *mapping,
return ret;
}
-static int exfat_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned int len, unsigned int copied,
- struct folio *folio, void *fsdata)
+static int exfat_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned int len, unsigned int copied,
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
struct exfat_inode_info *ei = EXFAT_I(inode);
int err;
- err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ err = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
if (err < len)
exfat_write_failed(mapping, pos+len);
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 7ed858937d45..ea5c1334a214 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -667,9 +667,9 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
}
if (sbi->options.utf8)
- sb->s_d_op = &exfat_utf8_dentry_ops;
+ set_default_d_op(sb, &exfat_utf8_dentry_ops);
else
- sb->s_d_op = &exfat_dentry_ops;
+ set_default_d_op(sb, &exfat_dentry_ops);
root_inode = new_inode(sb);
if (!root_inode) {
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index cdefea17986a..d3e55de4a2a2 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -549,15 +549,13 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len,
goto err_result;
}
- inode_lock(target_dir->d_inode);
- nresult = lookup_one(mnt_idmap(mnt), &QSTR(nbuf), target_dir);
+ nresult = lookup_one_unlocked(mnt_idmap(mnt), &QSTR(nbuf), target_dir);
if (!IS_ERR(nresult)) {
if (unlikely(nresult->d_inode != result->d_inode)) {
dput(nresult);
nresult = ERR_PTR(-ESTALE);
}
}
- inode_unlock(target_dir->d_inode);
/*
* At this point we are done with the parent, but it's pinned
* by the child dentry anyway.
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 402fecf90a44..b07b3b369710 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -87,7 +87,7 @@ static void ext2_commit_chunk(struct folio *folio, loff_t pos, unsigned len)
struct inode *dir = mapping->host;
inode_inc_iversion(dir);
- block_write_end(NULL, mapping, pos, len, len, folio, NULL);
+ block_write_end(pos, len, len, folio);
if (pos+len > dir->i_size) {
i_size_write(dir, pos+len);
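Note on the new block_write_end() signature: this series drops the unused file/mapping/fsdata parameters, so callers now pass only pos, len, copied, and the folio, as seen here and again in the ext4 write_end hunks further down.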
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 4025f875252a..cf97b76e9fd3 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -750,9 +750,9 @@ extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
/* ioctl.c */
-extern int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+extern int ext2_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
extern int ext2_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
extern long ext2_ioctl(struct file *, unsigned int, unsigned long);
extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 10b061ac5bc0..76bddce462fc 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -122,17 +122,19 @@ static const struct vm_operations_struct ext2_dax_vm_ops = {
.pfn_mkwrite = ext2_dax_fault,
};
-static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int ext2_file_mmap_prepare(struct vm_area_desc *desc)
{
+ struct file *file = desc->file;
+
if (!IS_DAX(file_inode(file)))
- return generic_file_mmap(file, vma);
+ return generic_file_mmap_prepare(desc);
file_accessed(file);
- vma->vm_ops = &ext2_dax_vm_ops;
+ desc->vm_ops = &ext2_dax_vm_ops;
return 0;
}
#else
-#define ext2_file_mmap generic_file_mmap
+#define ext2_file_mmap_prepare generic_file_mmap_prepare
#endif
/*
@@ -316,7 +318,7 @@ const struct file_operations ext2_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ext2_compat_ioctl,
#endif
- .mmap = ext2_file_mmap,
+ .mmap_prepare = ext2_file_mmap_prepare,
.open = ext2_file_open,
.release = ext2_release_file,
.fsync = ext2_fsync,
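The same ->mmap to ->mmap_prepare conversion recurs across exfat, ext2, and ext4 in this series. A minimal sketch of the pattern, assuming a hypothetical filesystem "myfs" (the vm_ops name is illustrative, not from this patch): the hook receives a struct vm_area_desc describing the mapping-to-be and records vm_ops and flag changes in the descriptor instead of mutating a live VMA.

static int myfs_file_mmap_prepare(struct vm_area_desc *desc)
{
	struct file *file = desc->file;

	file_accessed(file);
	/* Flag tweaks go through the descriptor, not vm_flags_set() */
	desc->vm_flags |= VM_HUGEPAGE;
	desc->vm_ops = &myfs_file_vm_ops;	/* hypothetical vm_operations_struct */
	return 0;
}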
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 30f8201c155f..e10c376843d7 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -895,9 +895,19 @@ int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
int ret;
+ loff_t i_size;
inode_lock(inode);
- len = min_t(u64, len, i_size_read(inode));
+ i_size = i_size_read(inode);
+ /*
+ * iomap_fiemap() returns EINVAL for 0 length. Make sure we don't trim
+ * length to 0 but still trim the range as much as possible since
+ * ext2_get_blocks() iterates unmapped space block by block which is
+ * slow.
+ */
+ if (i_size == 0)
+ i_size = 1;
+ len = min_t(u64, len, i_size);
ret = iomap_fiemap(inode, fieinfo, start, len, &ext2_iomap_ops);
inode_unlock(inode);
@@ -915,7 +925,7 @@ static void ext2_readahead(struct readahead_control *rac)
}
static int
-ext2_write_begin(struct file *file, struct address_space *mapping,
+ext2_write_begin(const struct kiocb *iocb, struct address_space *mapping,
loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
{
int ret;
@@ -926,13 +936,14 @@ ext2_write_begin(struct file *file, struct address_space *mapping,
return ret;
}
-static int ext2_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+static int ext2_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
int ret;
- ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
if (ret < len)
ext2_write_failed(mapping, pos + len);
return ret;
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 44e04484e570..c3fea55b8efa 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -18,7 +18,7 @@
#include <linux/uaccess.h>
#include <linux/fileattr.h>
-int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int ext2_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct ext2_inode_info *ei = EXT2_I(d_inode(dentry));
@@ -28,7 +28,7 @@ int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
int ext2_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct ext2_inode_info *ei = EXT2_I(inode);
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index c48fd36b2d74..c9329ed5c094 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -703,7 +703,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
* possible we just missed a transaction commit that did so
*/
smp_mb();
- if (sbi->s_mb_free_pending == 0) {
+ if (atomic_read(&sbi->s_mb_free_pending) == 0) {
if (test_opt(sb, DISCARD)) {
atomic_inc(&sbi->s_retry_alloc_pending);
flush_work(&sbi->s_discard_work);
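With s_mb_free_pending converted to an atomic_t (see the ext4.h hunk below), this read no longer needs any external serialization. A hedged sketch of the updater side, assuming the usual ext4_free_data bookkeeping at the free/commit call sites, which lie outside this excerpt:

/* when queueing freed blocks (sketch; entry->efd_count is assumed) */
atomic_add(entry->efd_count, &sbi->s_mb_free_pending);
/* once the committing transaction actually releases them */
atomic_sub(entry->efd_count, &sbi->s_mb_free_pending);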
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 18373de980f2..01a6e2de7fc3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -157,7 +157,7 @@ enum criteria {
/*
* Reads each block group sequentially, performing disk IO if
- * necessary, to find find_suitable block group. Tries to
+ * necessary, to find a suitable block group. Tries to
* allocate goal length but might trim the request if nothing
* is found after enough tries.
*/
@@ -185,14 +185,8 @@ enum criteria {
/* prefer goal again. length */
#define EXT4_MB_HINT_MERGE 0x0001
-/* blocks already reserved */
-#define EXT4_MB_HINT_RESERVED 0x0002
-/* metadata is being allocated */
-#define EXT4_MB_HINT_METADATA 0x0004
/* first blocks in the file */
#define EXT4_MB_HINT_FIRST 0x0008
-/* search for the best chunk */
-#define EXT4_MB_HINT_BEST 0x0010
/* data is being allocated */
#define EXT4_MB_HINT_DATA 0x0020
/* don't preallocate (for tails) */
@@ -213,15 +207,6 @@ enum criteria {
#define EXT4_MB_USE_RESERVED 0x2000
/* Do strict check for free blocks while retrying block allocation */
#define EXT4_MB_STRICT_CHECK 0x4000
-/* Large fragment size list lookup succeeded at least once for
- * CR_POWER2_ALIGNED */
-#define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000
-/* Avg fragment size rb tree lookup succeeded at least once for
- * CR_GOAL_LEN_FAST */
-#define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000
-/* Avg fragment size rb tree lookup succeeded at least once for
- * CR_BEST_AVAIL_LEN */
-#define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000
struct ext4_allocation_request {
/* target inode for block we're allocating */
@@ -1608,16 +1593,14 @@ struct ext4_sb_info {
unsigned short *s_mb_offsets;
unsigned int *s_mb_maxs;
unsigned int s_group_info_size;
- unsigned int s_mb_free_pending;
+ atomic_t s_mb_free_pending;
struct list_head s_freed_data_list[2]; /* List of blocks to be freed
after commit completed */
struct list_head s_discard_list;
struct work_struct s_discard_work;
atomic_t s_retry_alloc_pending;
- struct list_head *s_mb_avg_fragment_size;
- rwlock_t *s_mb_avg_fragment_size_locks;
- struct list_head *s_mb_largest_free_orders;
- rwlock_t *s_mb_largest_free_orders_locks;
+ struct xarray *s_mb_avg_fragment_size;
+ struct xarray *s_mb_largest_free_orders;
/* tunables */
unsigned long s_stripe;
@@ -1629,15 +1612,16 @@ struct ext4_sb_info {
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
unsigned int s_max_dir_size_kb;
- /* where last allocation was done - for stream allocation */
- unsigned long s_mb_last_group;
- unsigned long s_mb_last_start;
unsigned int s_mb_prefetch;
unsigned int s_mb_prefetch_limit;
unsigned int s_mb_best_avail_max_trim_order;
unsigned int s_sb_update_sec;
unsigned int s_sb_update_kb;
+ /* where last allocation was done - for stream allocation */
+ ext4_group_t *s_mb_last_groups;
+ unsigned int s_mb_nr_global_goals;
+
/* stats for buddy allocator */
atomic_t s_bal_reqs; /* number of reqs with len > 1 */
atomic_t s_bal_success; /* we found long enough chunks */
@@ -1646,12 +1630,10 @@ struct ext4_sb_info {
atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS]; /* total extents scanned */
atomic_t s_bal_groups_scanned; /* number of groups scanned */
atomic_t s_bal_goals; /* goal hits */
+ atomic_t s_bal_stream_goals; /* stream allocation global goal hits */
atomic_t s_bal_len_goals; /* len goal hits */
atomic_t s_bal_breaks; /* too long searches */
atomic_t s_bal_2orders; /* 2^order hits */
- atomic_t s_bal_p2_aligned_bad_suggestions;
- atomic_t s_bal_goal_fast_bad_suggestions;
- atomic_t s_bal_best_avail_bad_suggestions;
atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS];
atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS];
atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */
@@ -3020,7 +3002,7 @@ int ext4_walk_page_buffers(handle_t *handle,
struct buffer_head *bh));
int do_journal_get_write_access(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
-bool ext4_should_enable_large_folio(struct inode *inode);
+void ext4_set_inode_mapping_order(struct inode *inode);
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA 2
@@ -3064,9 +3046,9 @@ extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
-extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
+extern int ext4_chunk_trans_extent(struct inode *inode, int nrblocks);
extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
int pextents);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
@@ -3103,8 +3085,8 @@ extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
int ext4_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
-int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
+int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
extern void ext4_reset_inode_seed(struct inode *inode);
int ext4_update_overhead(struct super_block *sb, bool force);
int ext4_force_shutdown(struct super_block *sb, u32 flags);
@@ -3489,8 +3471,6 @@ struct ext4_group_info {
void *bb_bitmap;
#endif
struct rw_semaphore alloc_sem;
- struct list_head bb_avg_fragment_size_node;
- struct list_head bb_largest_free_order_node;
ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block
* regions, index is order.
* bb_counters[3] = 5 means
@@ -3541,23 +3521,28 @@ static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD);
}
+static inline bool ext4_try_lock_group(struct super_block *sb, ext4_group_t group)
+{
+ if (!spin_trylock(ext4_group_lock_ptr(sb, group)))
+ return false;
+ /*
+ * We're able to grab the lock right away, so drop the lock
+ * contention counter.
+ */
+ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
+ return true;
+}
+
static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
{
- spinlock_t *lock = ext4_group_lock_ptr(sb, group);
- if (spin_trylock(lock))
- /*
- * We're able to grab the lock right away, so drop the
- * lock contention counter.
- */
- atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
- else {
+ if (!ext4_try_lock_group(sb, group)) {
/*
* The lock is busy, so bump the contention counter,
* and then wait on the spin lock.
*/
atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
EXT4_MAX_CONTENTION);
- spin_lock(lock);
+ spin_lock(ext4_group_lock_ptr(sb, group));
}
}
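The new ext4_try_lock_group() lets scanning paths skip a contended group rather than spin on it. A hedged usage sketch (the loop itself is illustrative, not part of this hunk):

ext4_group_t group;

for (group = 0; group < ngroups; group++) {
	if (!ext4_try_lock_group(sb, group))
		continue;	/* lock busy; move on to the next group */
	/* ... scan the buddy bitmap under the group lock ... */
	ext4_unlock_group(sb, group);
}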
@@ -3612,6 +3597,7 @@ extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
extern int ext4_get_max_inline_size(struct inode *inode);
extern int ext4_find_inline_data_nolock(struct inode *inode);
extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
+extern void ext4_update_final_de(void *de_buf, int old_size, int new_size);
int ext4_readpage_inline(struct inode *inode, struct folio *folio);
extern int ext4_try_to_write_inline_data(struct address_space *mapping,
@@ -3671,10 +3657,10 @@ static inline int ext4_has_inline_data(struct inode *inode)
extern const struct inode_operations ext4_dir_inode_operations;
extern const struct inode_operations ext4_special_inode_operations;
extern struct dentry *ext4_get_parent(struct dentry *child);
-extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
- struct ext4_dir_entry_2 *de,
- int blocksize, int csum_size,
- unsigned int parent_ino, int dotdot_real_len);
+extern int ext4_init_dirblock(handle_t *handle, struct inode *inode,
+ struct buffer_head *dir_block,
+ unsigned int parent_ino, void *inline_buf,
+ int inline_size);
extern void ext4_initialize_dirent_tail(struct buffer_head *bh,
unsigned int blocksize);
extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 26435f3a3094..c484125d963f 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -31,13 +31,6 @@
#define CHECK_BINSEARCH__
/*
- * If EXT_STATS is defined then stats numbers are collected.
- * These number will be displayed at umount time.
- */
-#define EXT_STATS_
-
-
-/*
* ext4_inode has i_block array (60 bytes total).
* The first 12 bytes store ext4_extent_header;
* the remainder stores an array of ext4_extent.
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index b543a46fc809..ca5499e9412b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4501,6 +4501,8 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
struct ext4_map_blocks map;
unsigned int credits;
loff_t epos, old_size = i_size_read(inode);
+ unsigned int blkbits = inode->i_blkbits;
+ bool alloc_zero = false;
BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
map.m_lblk = offset;
@@ -4514,6 +4516,17 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
/*
+ * Doing the actual zero-out during a running journal transaction
+ * costs a lot. First allocate an unwritten extent and then
+ * convert it to written after zeroing it out.
+ */
+ if (flags & EXT4_GET_BLOCKS_ZERO) {
+ flags &= ~EXT4_GET_BLOCKS_ZERO;
+ flags |= EXT4_GET_BLOCKS_UNWRIT_EXT;
+ alloc_zero = true;
+ }
+
+ /*
* credits to insert 1 extent into extent tree
*/
credits = ext4_chunk_trans_blocks(inode, len);
@@ -4549,9 +4562,7 @@ retry:
* allow a full retry cycle for any remaining allocations
*/
retries = 0;
- map.m_lblk += ret;
- map.m_len = len = len - ret;
- epos = (loff_t)map.m_lblk << inode->i_blkbits;
+ epos = (loff_t)(map.m_lblk + ret) << blkbits;
inode_set_ctime_current(inode);
if (new_size) {
if (epos > new_size)
@@ -4571,6 +4582,21 @@ retry:
ret2 = ret3 ? ret3 : ret2;
if (unlikely(ret2))
break;
+
+ if (alloc_zero &&
+ (map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) {
+ ret2 = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk,
+ map.m_len);
+ if (likely(!ret2))
+ ret2 = ext4_convert_unwritten_extents(NULL,
+ inode, (loff_t)map.m_lblk << blkbits,
+ (loff_t)map.m_len << blkbits);
+ if (ret2)
+ break;
+ }
+
+ map.m_lblk += ret;
+ map.m_len = len = len - ret;
}
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
@@ -4636,7 +4662,11 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (end_lblk > start_lblk) {
ext4_lblk_t zero_blks = end_lblk - start_lblk;
- flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE);
+ if (mode & FALLOC_FL_WRITE_ZEROES)
+ flags = EXT4_GET_BLOCKS_CREATE_ZERO | EXT4_EX_NOCACHE;
+ else
+ flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
+ EXT4_EX_NOCACHE);
ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks,
new_size, flags);
if (ret)
@@ -4745,11 +4775,18 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (IS_ENCRYPTED(inode) &&
(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
return -EOPNOTSUPP;
+ /*
+ * Don't allow writing zeroes if the underlying device does not
+ * support the unmap write zeroes operation.
+ */
+ if ((mode & FALLOC_FL_WRITE_ZEROES) &&
+ !bdev_write_zeroes_unmap_sectors(inode->i_sb->s_bdev))
+ return -EOPNOTSUPP;
/* Return error if mode is not supported */
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
- FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
- FALLOC_FL_INSERT_RANGE))
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_COLLAPSE_RANGE |
+ FALLOC_FL_INSERT_RANGE | FALLOC_FL_WRITE_ZEROES))
return -EOPNOTSUPP;
inode_lock(inode);
@@ -4780,16 +4817,23 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (ret)
goto out_invalidate_lock;
- if (mode & FALLOC_FL_PUNCH_HOLE)
+ switch (mode & FALLOC_FL_MODE_MASK) {
+ case FALLOC_FL_PUNCH_HOLE:
ret = ext4_punch_hole(file, offset, len);
- else if (mode & FALLOC_FL_COLLAPSE_RANGE)
+ break;
+ case FALLOC_FL_COLLAPSE_RANGE:
ret = ext4_collapse_range(file, offset, len);
- else if (mode & FALLOC_FL_INSERT_RANGE)
+ break;
+ case FALLOC_FL_INSERT_RANGE:
ret = ext4_insert_range(file, offset, len);
- else if (mode & FALLOC_FL_ZERO_RANGE)
+ break;
+ case FALLOC_FL_ZERO_RANGE:
+ case FALLOC_FL_WRITE_ZEROES:
ret = ext4_zero_range(file, offset, len, mode);
- else
+ break;
+ default:
ret = -EOPNOTSUPP;
+ }
out_invalidate_lock:
filemap_invalidate_unlock(mapping);
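From user space the new mode is requested via fallocate(2). A hedged sketch of a caller that falls back to an ordinary zero range when the device lacks unmap write-zeroes support (FALLOC_FL_WRITE_ZEROES is assumed to be visible through <linux/falloc.h>):

int ret = fallocate(fd, FALLOC_FL_WRITE_ZEROES, offset, length);
if (ret < 0 && errno == EOPNOTSUPP)	/* device cannot unmap-write zeroes */
	ret = fallocate(fd, FALLOC_FL_ZERO_RANGE, offset, length);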
@@ -5171,7 +5215,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
credits = depth + 2;
}
- restart_credits = ext4_writepage_trans_blocks(inode);
+ restart_credits = ext4_chunk_trans_extent(inode, 0);
err = ext4_datasem_ensure_credits(handle, inode, credits,
restart_credits, 0);
if (err) {
@@ -5431,7 +5475,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
truncate_pagecache(inode, start);
- credits = ext4_writepage_trans_blocks(inode);
+ credits = ext4_chunk_trans_extent(inode, 0);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -5527,7 +5571,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
truncate_pagecache(inode, start);
- credits = ext4_writepage_trans_blocks(inode);
+ credits = ext4_chunk_trans_extent(inode, 0);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 21df81347147..93240e35ee36 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -747,7 +747,7 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order)
bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
(vmf->vma->vm_flags & VM_SHARED);
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
- pfn_t pfn;
+ unsigned long pfn;
if (write) {
sb_start_pagefault(sb);
@@ -804,9 +804,10 @@ static const struct vm_operations_struct ext4_file_vm_ops = {
.page_mkwrite = ext4_page_mkwrite,
};
-static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int ext4_file_mmap_prepare(struct vm_area_desc *desc)
{
int ret;
+ struct file *file = desc->file;
struct inode *inode = file->f_mapping->host;
struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
@@ -821,15 +822,15 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
* We don't support synchronous mappings for non-DAX files and
* for DAX files if underneath dax_device is not synchronous.
*/
- if (!daxdev_mapping_supported(vma, dax_dev))
+ if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file), dax_dev))
return -EOPNOTSUPP;
file_accessed(file);
if (IS_DAX(file_inode(file))) {
- vma->vm_ops = &ext4_dax_vm_ops;
- vm_flags_set(vma, VM_HUGEPAGE);
+ desc->vm_ops = &ext4_dax_vm_ops;
+ desc->vm_flags |= VM_HUGEPAGE;
} else {
- vma->vm_ops = &ext4_file_vm_ops;
+ desc->vm_ops = &ext4_file_vm_ops;
}
return 0;
}
@@ -968,7 +969,7 @@ const struct file_operations ext4_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
#endif
- .mmap = ext4_file_mmap,
+ .mmap_prepare = ext4_file_mmap_prepare,
.open = ext4_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
@@ -977,7 +978,8 @@ const struct file_operations ext4_file_operations = {
.splice_write = iter_file_splice_write,
.fallocate = ext4_fallocate,
.fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
- FOP_DIO_PARALLEL_WRITE,
+ FOP_DIO_PARALLEL_WRITE |
+ FOP_DONTCACHE,
};
const struct inode_operations ext4_file_inode_operations = {
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 79aa3df8d019..df4051613b29 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1335,8 +1335,7 @@ got:
}
}
- if (ext4_should_enable_large_folio(inode))
- mapping_set_large_folios(inode->i_mapping);
+ ext4_set_inode_mapping_order(inode);
ext4_update_inode_fsync_trans(handle, inode, 1);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index a1bbcdf40824..1b094a4f3866 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -303,7 +303,11 @@ static int ext4_create_inline_data(handle_t *handle,
if (error)
goto out;
- BUG_ON(!is.s.not_found);
+ if (!is.s.not_found) {
+ EXT4_ERROR_INODE(inode, "unexpected inline data xattr");
+ error = -EFSCORRUPTED;
+ goto out;
+ }
error = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (error) {
@@ -354,7 +358,11 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
if (error)
goto out;
- BUG_ON(is.s.not_found);
+ if (is.s.not_found) {
+ EXT4_ERROR_INODE(inode, "missing inline data xattr");
+ error = -EFSCORRUPTED;
+ goto out;
+ }
len -= EXT4_MIN_INLINE_DATA_SIZE;
value = kzalloc(len, GFP_NOFS);
@@ -562,7 +570,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
return 0;
}
- needed_blocks = ext4_writepage_trans_blocks(inode);
+ needed_blocks = ext4_chunk_trans_extent(inode, 1);
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
@@ -612,6 +620,7 @@ retry:
} else
ret = ext4_block_write_begin(handle, folio, from, to,
ext4_get_block);
+ clear_buffer_new(folio_buffers(folio));
if (!ret && ext4_should_journal_data(inode)) {
ret = ext4_walk_page_buffers(handle, inode,
@@ -891,6 +900,7 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
return ret;
}
+ clear_buffer_new(folio_buffers(folio));
folio_mark_dirty(folio);
folio_mark_uptodate(folio);
ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
@@ -995,7 +1005,7 @@ static void *ext4_get_inline_xattr_pos(struct inode *inode,
}
/* Set the final de to cover the whole block. */
-static void ext4_update_final_de(void *de_buf, int old_size, int new_size)
+void ext4_update_final_de(void *de_buf, int old_size, int new_size)
{
struct ext4_dir_entry_2 *de, *prev_de;
void *limit;
@@ -1059,51 +1069,6 @@ static void ext4_restore_inline_data(handle_t *handle, struct inode *inode,
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
}
-static int ext4_finish_convert_inline_dir(handle_t *handle,
- struct inode *inode,
- struct buffer_head *dir_block,
- void *buf,
- int inline_size)
-{
- int err, csum_size = 0, header_size = 0;
- struct ext4_dir_entry_2 *de;
- void *target = dir_block->b_data;
-
- /*
- * First create "." and ".." and then copy the dir information
- * back to the block.
- */
- de = target;
- de = ext4_init_dot_dotdot(inode, de,
- inode->i_sb->s_blocksize, csum_size,
- le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1);
- header_size = (void *)de - target;
-
- memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE,
- inline_size - EXT4_INLINE_DOTDOT_SIZE);
-
- if (ext4_has_feature_metadata_csum(inode->i_sb))
- csum_size = sizeof(struct ext4_dir_entry_tail);
-
- inode->i_size = inode->i_sb->s_blocksize;
- i_size_write(inode, inode->i_sb->s_blocksize);
- EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
- ext4_update_final_de(dir_block->b_data,
- inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size,
- inode->i_sb->s_blocksize - csum_size);
-
- if (csum_size)
- ext4_initialize_dirent_tail(dir_block,
- inode->i_sb->s_blocksize);
- set_buffer_uptodate(dir_block);
- unlock_buffer(dir_block);
- err = ext4_handle_dirty_dirblock(handle, inode, dir_block);
- if (err)
- return err;
- set_buffer_verified(dir_block);
- return ext4_mark_inode_dirty(handle, inode);
-}
-
static int ext4_convert_inline_data_nolock(handle_t *handle,
struct inode *inode,
struct ext4_iloc *iloc)
@@ -1175,8 +1140,17 @@ static int ext4_convert_inline_data_nolock(handle_t *handle,
error = ext4_handle_dirty_metadata(handle,
inode, data_bh);
} else {
- error = ext4_finish_convert_inline_dir(handle, inode, data_bh,
- buf, inline_size);
+ unlock_buffer(data_bh);
+ inode->i_size = inode->i_sb->s_blocksize;
+ i_size_write(inode, inode->i_sb->s_blocksize);
+ EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+
+ error = ext4_init_dirblock(handle, inode, data_bh,
+ le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode),
+ buf + EXT4_INLINE_DOTDOT_SIZE,
+ inline_size - EXT4_INLINE_DOTDOT_SIZE);
+ if (!error)
+ error = ext4_mark_inode_dirty(handle, inode);
}
out_restore:
@@ -1315,7 +1289,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
if (pos == 0) {
fake.inode = cpu_to_le32(inode->i_ino);
fake.name_len = 1;
- strcpy(fake.name, ".");
+ memcpy(fake.name, ".", 2);
fake.rec_len = ext4_rec_len_to_disk(
ext4_dir_rec_len(fake.name_len, NULL),
inline_size);
@@ -1325,7 +1299,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
} else if (pos == EXT4_INLINE_DOTDOT_OFFSET) {
fake.inode = cpu_to_le32(parent_ino);
fake.name_len = 2;
- strcpy(fake.name, "..");
+ memcpy(fake.name, "..", 3);
fake.rec_len = ext4_rec_len_to_disk(
ext4_dir_rec_len(fake.name_len, NULL),
inline_size);
@@ -1864,7 +1838,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
};
- needed_blocks = ext4_writepage_trans_blocks(inode);
+ needed_blocks = ext4_chunk_trans_extent(inode, 1);
handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -1903,7 +1877,12 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0)
goto out_error;
- BUG_ON(is.s.not_found);
+ if (is.s.not_found) {
+ EXT4_ERROR_INODE(inode,
+ "missing inline data xattr");
+ err = -EFSCORRUPTED;
+ goto out_error;
+ }
value_len = le32_to_cpu(is.s.here->e_value_size);
value = kmalloc(value_len, GFP_NOFS);
@@ -1979,7 +1958,7 @@ int ext4_convert_inline_data(struct inode *inode)
return 0;
}
- needed_blocks = ext4_writepage_trans_blocks(inode);
+ needed_blocks = ext4_chunk_trans_extent(inode, 1);
iloc.bh = NULL;
error = ext4_get_inode_loc(inode, &iloc);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index be9a4cba35fd..ed54c4d0f2f9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -723,8 +723,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
ext4_check_map_extents_env(inode);
/* Lookup extent status tree firstly */
- if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
- ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
map->m_pblk = ext4_es_pblock(&es) +
map->m_lblk - es.es_lblk;
@@ -757,8 +756,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
orig_mlen == map->m_len)
goto found;
- if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF)
- map->m_len = orig_mlen;
+ map->m_len = orig_mlen;
}
/*
* In the query cache no-wait mode, nothing we can do more if we
@@ -877,6 +875,26 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
} while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state)));
}
+/*
+ * Make sure that the current journal transaction has enough credits to map
+ * one extent. Return -EAGAIN if it cannot extend the current running
+ * transaction.
+ */
+static inline int ext4_journal_ensure_extent_credits(handle_t *handle,
+ struct inode *inode)
+{
+ int credits;
+ int ret;
+
+ /* Called from ext4_da_write_begin() which has no handle started? */
+ if (!handle)
+ return 0;
+
+ credits = ext4_chunk_trans_blocks(inode, 1);
+ ret = __ext4_journal_ensure_credits(handle, credits, credits, 0);
+ return ret <= 0 ? ret : -EAGAIN;
+}
+
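Callers treat -EAGAIN from this helper as "stop the handle and restart with a fresh transaction". A hedged sketch of the retry shape used by the write paths in this patch (the label is illustrative):

err = ext4_journal_ensure_extent_credits(handle, inode);
if (!err)
	err = get_block(inode, block, bh, 1);	/* map one more extent */
if (err == -EAGAIN) {
	ext4_journal_stop(handle);
	goto retry_journal;	/* hypothetical label: begin a new handle */
}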
static int _ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int flags)
{
@@ -1171,11 +1189,13 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
}
continue;
}
- if (buffer_new(bh))
+ if (WARN_ON_ONCE(buffer_new(bh)))
clear_buffer_new(bh);
if (!buffer_mapped(bh)) {
WARN_ON(bh->b_size != blocksize);
- err = get_block(inode, block, bh, 1);
+ err = ext4_journal_ensure_extent_credits(handle, inode);
+ if (!err)
+ err = get_block(inode, block, bh, 1);
if (err)
break;
if (buffer_new(bh)) {
@@ -1252,7 +1272,8 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
* and the ext4_write_end(). So doing the jbd2_journal_start at the start of
* ext4_write_begin() is the right place.
*/
-static int ext4_write_begin(struct file *file, struct address_space *mapping,
+static int ext4_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
@@ -1263,7 +1284,6 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
struct folio *folio;
pgoff_t index;
unsigned from, to;
- fgf_t fgp = FGP_WRITEBEGIN;
ret = ext4_emergency_state(inode->i_sb);
if (unlikely(ret))
@@ -1274,7 +1294,8 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
* Reserve one block more for addition to orphan list in case
* we allocate blocks but write fails for some reason
*/
- needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
+ needed_blocks = ext4_chunk_trans_extent(inode,
+ ext4_journal_blocks_per_folio(inode)) + 1;
index = pos >> PAGE_SHIFT;
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
@@ -1287,16 +1308,14 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
}
/*
- * __filemap_get_folio() can take a long time if the
+ * write_begin_get_folio() can take a long time if the
* system is thrashing due to memory pressure, or if the folio
* is being written back. So grab it first before we start
* the transaction handle. This also allows us to allocate
* the folio (if needed) without using GFP_NOFS.
*/
retry_grab:
- fgp |= fgf_set_order(len);
- folio = __filemap_get_folio(mapping, index, fgp,
- mapping_gfp_mask(mapping));
+ folio = write_begin_get_folio(iocb, mapping, index, len);
if (IS_ERR(folio))
return PTR_ERR(folio);
@@ -1374,8 +1393,9 @@ retry_journal:
ext4_orphan_del(NULL, inode);
}
- if (ret == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries))
+ if (ret == -EAGAIN ||
+ (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries)))
goto retry_journal;
folio_put(folio);
return ret;
@@ -1395,17 +1415,18 @@ static int write_end_fn(handle_t *handle, struct inode *inode,
ret = ext4_dirty_journalled_data(handle, bh);
clear_buffer_meta(bh);
clear_buffer_prio(bh);
+ clear_buffer_new(bh);
return ret;
}
/*
* We need to pick up the new inode size which generic_commit_write gave us
- * `file' can be NULL - eg, when called from page_symlink().
+ * `iocb` can be NULL - eg, when called from page_symlink().
*
* ext4 never places buffers on inode->i_mapping->i_private_list. metadata
* buffers are managed internally.
*/
-static int ext4_write_end(struct file *file,
+static int ext4_write_end(const struct kiocb *iocb,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
@@ -1424,7 +1445,7 @@ static int ext4_write_end(struct file *file,
return ext4_write_inline_data_end(inode, pos, len, copied,
folio);
- copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ copied = block_write_end(pos, len, copied, folio);
/*
* it's important to update i_size while still holding folio lock:
* page writeout could otherwise come in and zero beyond i_size.
@@ -1510,7 +1531,7 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle,
} while (bh != head);
}
-static int ext4_journalled_write_end(struct file *file,
+static int ext4_journalled_write_end(const struct kiocb *iocb,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
@@ -1667,11 +1688,12 @@ struct mpage_da_data {
unsigned int can_map:1; /* Can writepages call map blocks? */
/* These are internal state of ext4_do_writepages() */
- pgoff_t first_page; /* The first page to write */
- pgoff_t next_page; /* Current page to examine */
- pgoff_t last_page; /* Last page to examine */
+ loff_t start_pos; /* The start pos to write */
+ loff_t next_pos; /* Current pos to examine */
+ loff_t end_pos; /* Last pos to examine */
+
/*
- * Extent to map - this can be after first_page because that can be
+ * Extent to map - this can be after start_pos because that can be
* fully mapped. We somewhat abuse m_flags to store whether the extent
* is delalloc or unwritten.
*/
@@ -1691,38 +1713,38 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
struct inode *inode = mpd->inode;
struct address_space *mapping = inode->i_mapping;
- /* This is necessary when next_page == 0. */
- if (mpd->first_page >= mpd->next_page)
+ /* This is necessary when next_pos == 0. */
+ if (mpd->start_pos >= mpd->next_pos)
return;
mpd->scanned_until_end = 0;
- index = mpd->first_page;
- end = mpd->next_page - 1;
if (invalidate) {
ext4_lblk_t start, last;
- start = index << (PAGE_SHIFT - inode->i_blkbits);
- last = end << (PAGE_SHIFT - inode->i_blkbits);
+ start = EXT4_B_TO_LBLK(inode, mpd->start_pos);
+ last = mpd->next_pos >> inode->i_blkbits;
/*
* avoid racing with extent status tree scans made by
* ext4_insert_delayed_block()
*/
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_es_remove_extent(inode, start, last - start + 1);
+ ext4_es_remove_extent(inode, start, last - start);
up_write(&EXT4_I(inode)->i_data_sem);
}
folio_batch_init(&fbatch);
- while (index <= end) {
- nr = filemap_get_folios(mapping, &index, end, &fbatch);
+ index = mpd->start_pos >> PAGE_SHIFT;
+ end = mpd->next_pos >> PAGE_SHIFT;
+ while (index < end) {
+ nr = filemap_get_folios(mapping, &index, end - 1, &fbatch);
if (nr == 0)
break;
for (i = 0; i < nr; i++) {
struct folio *folio = fbatch.folios[i];
- if (folio->index < mpd->first_page)
+ if (folio_pos(folio) < mpd->start_pos)
continue;
- if (folio_next_index(folio) - 1 > end)
+ if (folio_next_index(folio) > end)
continue;
BUG_ON(!folio_test_locked(folio));
BUG_ON(folio_test_writeback(folio));
@@ -2024,7 +2046,8 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio)
{
- mpd->first_page += folio_nr_pages(folio);
+ mpd->start_pos += folio_size(folio);
+ mpd->wbc->nr_to_write -= folio_nr_pages(folio);
folio_unlock(folio);
}
@@ -2034,7 +2057,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
loff_t size;
int err;
- BUG_ON(folio->index != mpd->first_page);
+ WARN_ON_ONCE(folio_pos(folio) != mpd->start_pos);
folio_clear_dirty_for_io(folio);
/*
* We have to be very careful here! Nothing protects writeback path
@@ -2055,8 +2078,6 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
!ext4_verity_in_progress(mpd->inode))
len = size & (len - 1);
err = ext4_bio_write_folio(&mpd->io_submit, folio, len);
- if (!err)
- mpd->wbc->nr_to_write -= folio_nr_pages(folio);
return err;
}
@@ -2323,6 +2344,11 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
int get_blocks_flags;
int err, dioread_nolock;
+ /* Make sure transaction has enough credits for this extent */
+ err = ext4_journal_ensure_extent_credits(handle, inode);
+ if (err < 0)
+ return err;
+
trace_ext4_da_write_pages_extent(inode, map);
/*
* Call ext4_map_blocks() to allocate any delayed allocation blocks, or
@@ -2362,6 +2388,47 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
}
/*
+ * This is used to submit mapped buffers in a single folio that is not fully
+ * mapped for various reasons, such as insufficient space or journal credits.
+ */
+static int mpage_submit_partial_folio(struct mpage_da_data *mpd)
+{
+ struct inode *inode = mpd->inode;
+ struct folio *folio;
+ loff_t pos;
+ int ret;
+
+ folio = filemap_get_folio(inode->i_mapping,
+ mpd->start_pos >> PAGE_SHIFT);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ /*
+ * The mapped position should fall within the folio currently being
+ * processed, but must not be the folio's start position.
+ */
+ pos = ((loff_t)mpd->map.m_lblk) << inode->i_blkbits;
+ if (WARN_ON_ONCE((folio_pos(folio) == pos) ||
+ !folio_contains(folio, pos >> PAGE_SHIFT)))
+ return -EINVAL;
+
+ ret = mpage_submit_folio(mpd, folio);
+ if (ret)
+ goto out;
+ /*
+ * Update start_pos to prevent this folio from being released in
+ * mpage_release_unused_pages(); it will be reset to the aligned folio
+ * pos when this folio is written again in the next round. Additionally,
+ * do not update wbc->nr_to_write here, as it will be updated once the
+ * entire folio has finished processing.
+ */
+ mpd->start_pos = pos;
+out:
+ folio_unlock(folio);
+ folio_put(folio);
+ return ret;
+}
+
+/*
* mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
* mpd->len and submit pages underlying it for IO
*
@@ -2409,10 +2476,18 @@ static int mpage_map_and_submit_extent(handle_t *handle,
* In the case of ENOSPC, if ext4_count_free_blocks()
* is non-zero, a commit should free up blocks.
*/
- if ((err == -ENOMEM) ||
+ if ((err == -ENOMEM) || (err == -EAGAIN) ||
(err == -ENOSPC && ext4_count_free_clusters(sb))) {
- if (progress)
+ /*
+ * We may have already allocated extents for
+ * some bhs inside the folio, issue the
+ * corresponding data to prevent stale data.
+ */
+ if (progress) {
+ if (mpage_submit_partial_folio(mpd))
+ goto invalidate_dirty_pages;
goto update_disksize;
+ }
return err;
}
ext4_msg(sb, KERN_CRIT,
@@ -2446,7 +2521,7 @@ update_disksize:
* Update on-disk size after IO is submitted. Races with
* truncate are avoided by checking i_size under i_data_sem.
*/
- disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT;
+ disksize = mpd->start_pos;
if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
int err2;
loff_t i_size;
@@ -2470,21 +2545,6 @@ update_disksize:
return err;
}
-/*
- * Calculate the total number of credits to reserve for one writepages
- * iteration. This is called from ext4_writepages(). We map an extent of
- * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
- * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
- * bpp - 1 blocks in bpp different extents.
- */
-static int ext4_da_writepages_trans_blocks(struct inode *inode)
-{
- int bpp = ext4_journal_blocks_per_folio(inode);
-
- return ext4_meta_trans_blocks(inode,
- MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
-}
-
static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio,
size_t len)
{
@@ -2549,8 +2609,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
struct address_space *mapping = mpd->inode->i_mapping;
struct folio_batch fbatch;
unsigned int nr_folios;
- pgoff_t index = mpd->first_page;
- pgoff_t end = mpd->last_page;
+ pgoff_t index = mpd->start_pos >> PAGE_SHIFT;
+ pgoff_t end = mpd->end_pos >> PAGE_SHIFT;
xa_mark_t tag;
int i, err = 0;
int blkbits = mpd->inode->i_blkbits;
@@ -2565,7 +2625,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
tag = PAGECACHE_TAG_DIRTY;
mpd->map.m_len = 0;
- mpd->next_page = index;
+ mpd->next_pos = mpd->start_pos;
if (ext4_should_journal_data(mpd->inode)) {
handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE,
bpp);
@@ -2596,7 +2656,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
goto out;
/* If we can't merge this page, we are done. */
- if (mpd->map.m_len > 0 && mpd->next_page != folio->index)
+ if (mpd->map.m_len > 0 &&
+ mpd->next_pos != folio_pos(folio))
goto out;
if (handle) {
@@ -2642,8 +2703,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
}
if (mpd->map.m_len == 0)
- mpd->first_page = folio->index;
- mpd->next_page = folio_next_index(folio);
+ mpd->start_pos = folio_pos(folio);
+ mpd->next_pos = folio_pos(folio) + folio_size(folio);
/*
* Writeout when we cannot modify metadata is simple.
* Just submit the page. For data=journal mode we
@@ -2771,12 +2832,12 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
mpd->journalled_more_data = 0;
if (ext4_should_dioread_nolock(inode)) {
+ int bpf = ext4_journal_blocks_per_folio(inode);
/*
* We may need to convert up to one extent per block in
- * the page and we may dirty the inode.
+ * the folio and we may dirty the inode.
*/
- rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
- PAGE_SIZE >> inode->i_blkbits);
+ rsv_blocks = 1 + ext4_ext_index_trans_blocks(inode, bpf);
}
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
@@ -2786,18 +2847,18 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
writeback_index = mapping->writeback_index;
if (writeback_index)
cycled = 0;
- mpd->first_page = writeback_index;
- mpd->last_page = -1;
+ mpd->start_pos = writeback_index << PAGE_SHIFT;
+ mpd->end_pos = LLONG_MAX;
} else {
- mpd->first_page = wbc->range_start >> PAGE_SHIFT;
- mpd->last_page = wbc->range_end >> PAGE_SHIFT;
+ mpd->start_pos = wbc->range_start;
+ mpd->end_pos = wbc->range_end;
}
ext4_io_submit_init(&mpd->io_submit, wbc);
retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag_pages_for_writeback(mapping, mpd->first_page,
- mpd->last_page);
+ tag_pages_for_writeback(mapping, mpd->start_pos >> PAGE_SHIFT,
+ mpd->end_pos >> PAGE_SHIFT);
blk_start_plug(&plug);
/*
@@ -2840,8 +2901,14 @@ retry:
* not supported by delalloc.
*/
BUG_ON(ext4_should_journal_data(inode));
- needed_blocks = ext4_da_writepages_trans_blocks(inode);
-
+ /*
+ * Calculate the number of credits to reserve for one
+ * extent of up to MAX_WRITEPAGES_EXTENT_LEN blocks. It will
+ * attempt to extend the transaction or start a new iteration
+ * if the reserved credits are insufficient.
+ */
+ needed_blocks = ext4_chunk_trans_blocks(inode,
+ MAX_WRITEPAGES_EXTENT_LEN);
/* start a new transaction */
handle = ext4_journal_start_with_reserve(inode,
EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
@@ -2857,7 +2924,8 @@ retry:
}
mpd->do_map = 1;
- trace_ext4_da_write_pages(inode, mpd->first_page, wbc);
+ trace_ext4_da_write_folios_start(inode, mpd->start_pos,
+ mpd->next_pos, wbc);
ret = mpage_prepare_extent_to_map(mpd);
if (!ret && mpd->map.m_len)
ret = mpage_map_and_submit_extent(handle, mpd,
@@ -2895,6 +2963,8 @@ retry:
} else
ext4_put_io_end(mpd->io_submit.io_end);
mpd->io_submit.io_end = NULL;
+ trace_ext4_da_write_folios_end(inode, mpd->start_pos,
+ mpd->next_pos, wbc, ret);
if (ret == -ENOSPC && sbi->s_journal) {
/*
@@ -2906,6 +2976,8 @@ retry:
ret = 0;
continue;
}
+ if (ret == -EAGAIN)
+ ret = 0;
/* Fatal error - ENOMEM, EIO... */
if (ret)
break;
@@ -2914,8 +2986,8 @@ unplug:
blk_finish_plug(&plug);
if (!ret && !cycled && wbc->nr_to_write > 0) {
cycled = 1;
- mpd->last_page = writeback_index - 1;
- mpd->first_page = 0;
+ mpd->end_pos = (writeback_index << PAGE_SHIFT) - 1;
+ mpd->start_pos = 0;
goto retry;
}
@@ -2925,7 +2997,7 @@ unplug:
* Set the writeback_index so that range_cyclic
* mode will write it back later
*/
- mapping->writeback_index = mpd->first_page;
+ mapping->writeback_index = mpd->start_pos >> PAGE_SHIFT;
out_writepages:
trace_ext4_writepages_result(inode, wbc, ret,
@@ -3036,7 +3108,8 @@ static int ext4_nonda_switch(struct super_block *sb)
return 0;
}
-static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
+static int ext4_da_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
@@ -3044,7 +3117,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
struct folio *folio;
pgoff_t index;
struct inode *inode = mapping->host;
- fgf_t fgp = FGP_WRITEBEGIN;
ret = ext4_emergency_state(inode->i_sb);
if (unlikely(ret))
@@ -3054,7 +3126,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) {
*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
- return ext4_write_begin(file, mapping, pos,
+ return ext4_write_begin(iocb, mapping, pos,
len, foliop, fsdata);
}
*fsdata = (void *)0;
@@ -3070,9 +3142,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
}
retry:
- fgp |= fgf_set_order(len);
- folio = __filemap_get_folio(mapping, index, fgp,
- mapping_gfp_mask(mapping));
+ folio = write_begin_get_folio(iocb, mapping, index, len);
if (IS_ERR(folio))
return PTR_ERR(folio);
@@ -3144,8 +3214,7 @@ static int ext4_da_do_write_end(struct address_space *mapping,
* block_write_end() will mark the inode as dirty with I_DIRTY_PAGES
* flag, which all that's needed to trigger page writeback.
*/
- copied = block_write_end(NULL, mapping, pos, len, copied,
- folio, NULL);
+ copied = block_write_end(pos, len, copied, folio);
new_i_size = pos + copied;
/*
@@ -3196,7 +3265,7 @@ static int ext4_da_do_write_end(struct address_space *mapping,
return copied;
}
-static int ext4_da_write_end(struct file *file,
+static int ext4_da_write_end(const struct kiocb *iocb,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
@@ -3205,7 +3274,7 @@ static int ext4_da_write_end(struct file *file,
int write_mode = (int)(unsigned long)fsdata;
if (write_mode == FALL_BACK_TO_NONDELALLOC)
- return ext4_write_end(file, mapping, pos,
+ return ext4_write_end(iocb, mapping, pos,
len, copied, folio, fsdata);
trace_ext4_da_write_end(inode, pos, len, copied);
@@ -4389,7 +4458,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
return ret;
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- credits = ext4_writepage_trans_blocks(inode);
+ credits = ext4_chunk_trans_extent(inode, 2);
else
credits = ext4_blocks_for_truncate(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
@@ -4538,7 +4607,7 @@ int ext4_truncate(struct inode *inode)
}
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- credits = ext4_writepage_trans_blocks(inode);
+ credits = ext4_chunk_trans_extent(inode, 1);
else
credits = ext4_blocks_for_truncate(inode);
@@ -5106,7 +5175,7 @@ error:
return -EFSCORRUPTED;
}
-bool ext4_should_enable_large_folio(struct inode *inode)
+static bool ext4_should_enable_large_folio(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
@@ -5123,6 +5192,22 @@ bool ext4_should_enable_large_folio(struct inode *inode)
return true;
}
+/*
+ * Limit the maximum folio size to 2048 blocks to prevent overestimation
+ * of reserve handle credits during folio writeback in environments
+ * where PAGE_SIZE exceeds 4KB.
+ */
+#define EXT4_MAX_PAGECACHE_ORDER(i) \
+ umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT))
+void ext4_set_inode_mapping_order(struct inode *inode)
+{
+ if (!ext4_should_enable_large_folio(inode))
+ return;
+
+ mapping_set_folio_order_range(inode->i_mapping, 0,
+ EXT4_MAX_PAGECACHE_ORDER(inode));
+}
+
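Worked example of the cap: the order limit is 11 + i_blkbits - PAGE_SHIFT, so the largest folio spans 2^(11 + i_blkbits - PAGE_SHIFT) pages = 2^(11 + i_blkbits) bytes = 2048 blocks regardless of page size (subject to the MAX_PAGECACHE_ORDER clamp). With 4KB blocks and 4KB pages that is order 11 (8MB); with 4KB blocks and 64KB pages it drops to order 7 (still 8MB, i.e. 2048 blocks).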
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ext4_iget_flags flags, const char *function,
unsigned int line)
@@ -5440,8 +5525,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ret = -EFSCORRUPTED;
goto bad_inode;
}
- if (ext4_should_enable_large_folio(inode))
- mapping_set_large_folios(inode->i_mapping);
+
+ ext4_set_inode_mapping_order(inode);
ret = check_igot_inode(inode, flags, function, line);
/*
@@ -6139,7 +6224,7 @@ int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents)
int ret;
/*
- * How many index and lead blocks need to touch to map @lblocks
+ * How many index and leaf blocks do we need to touch to map @lblocks
* logical blocks to @pextents physical extents?
*/
idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
@@ -6148,7 +6233,7 @@ int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents)
* Now let's see how many group bitmaps and group descriptors need
* to account
*/
- groups = idxblocks;
+ groups = idxblocks + pextents;
gdpblocks = groups;
if (groups > ngroups)
groups = ngroups;
@@ -6165,25 +6250,19 @@ int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents)
}
/*
- * Calculate the total number of credits to reserve to fit
- * the modification of a single pages into a single transaction,
- * which may include multiple chunks of block allocations.
- *
- * This could be called via ext4_write_begin()
- *
- * We need to consider the worse case, when
- * one new block per extent.
+ * Calculate the journal credits needed to modify 'nrblocks' blocks
+ * within a single extent in one transaction. 'nrblocks' is used only
+ * for non-extent inodes. For extent type inodes, 'nrblocks' can be
+ * zero if the exact number of blocks is unknown.
*/
-int ext4_writepage_trans_blocks(struct inode *inode)
+int ext4_chunk_trans_extent(struct inode *inode, int nrblocks)
{
- int bpp = ext4_journal_blocks_per_folio(inode);
int ret;
- ret = ext4_meta_trans_blocks(inode, bpp, bpp);
-
+ ret = ext4_meta_trans_blocks(inode, nrblocks, 1);
/* Account for data blocks for journalled mode */
if (ext4_should_journal_data(inode))
- ret += bpp;
+ ret += nrblocks;
return ret;
}
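For scale: with the updated ext4_meta_trans_blocks() above, a single-extent request (pextents = 1) charges the index/leaf blocks needed for 'nrblocks' logical blocks plus the bitmap/descriptor/superblock credits, and data=journal mode adds 'nrblocks' more for the data blocks themselves. ext4_block_page_mkwrite() below, for example, reserves for a full folio's worth of blocks via ext4_chunk_trans_extent(inode, ext4_journal_blocks_per_folio(inode)).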
@@ -6555,6 +6634,55 @@ static int ext4_bh_unmapped(handle_t *handle, struct inode *inode,
return !buffer_mapped(bh);
}
+static int ext4_block_page_mkwrite(struct inode *inode, struct folio *folio,
+ get_block_t get_block)
+{
+ handle_t *handle;
+ loff_t size;
+ unsigned long len;
+ int credits;
+ int ret;
+
+ credits = ext4_chunk_trans_extent(inode,
+ ext4_journal_blocks_per_folio(inode));
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ folio_lock(folio);
+ size = i_size_read(inode);
+ /* Page got truncated from under us? */
+ if (folio->mapping != inode->i_mapping || folio_pos(folio) > size) {
+ ret = -EFAULT;
+ goto out_error;
+ }
+
+ len = folio_size(folio);
+ if (folio_pos(folio) + len > size)
+ len = size - folio_pos(folio);
+
+ ret = ext4_block_write_begin(handle, folio, 0, len, get_block);
+ if (ret)
+ goto out_error;
+
+ if (!ext4_should_journal_data(inode)) {
+ block_commit_write(folio, 0, len);
+ folio_mark_dirty(folio);
+ } else {
+ ret = ext4_journal_folio_buffers(handle, folio, len);
+ if (ret)
+ goto out_error;
+ }
+ ext4_journal_stop(handle);
+ folio_wait_stable(folio);
+ return ret;
+
+out_error:
+ folio_unlock(folio);
+ ext4_journal_stop(handle);
+ return ret;
+}
+
vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
@@ -6566,8 +6694,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
struct file *file = vma->vm_file;
struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
- handle_t *handle;
- get_block_t *get_block;
+ get_block_t *get_block = ext4_get_block;
int retries = 0;
if (unlikely(IS_IMMUTABLE(inode)))
@@ -6635,47 +6762,11 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
/* OK, we need to fill the hole... */
if (ext4_should_dioread_nolock(inode))
get_block = ext4_get_block_unwritten;
- else
- get_block = ext4_get_block;
retry_alloc:
- handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
- ext4_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = VM_FAULT_SIGBUS;
- goto out;
- }
- /*
- * Data journalling can't use block_page_mkwrite() because it
- * will set_buffer_dirty() before do_journal_get_write_access()
- * thus might hit warning messages for dirty metadata buffers.
- */
- if (!ext4_should_journal_data(inode)) {
- err = block_page_mkwrite(vma, vmf, get_block);
- } else {
- folio_lock(folio);
- size = i_size_read(inode);
- /* Page got truncated from under us? */
- if (folio->mapping != mapping || folio_pos(folio) > size) {
- ret = VM_FAULT_NOPAGE;
- goto out_error;
- }
-
- len = folio_size(folio);
- if (folio_pos(folio) + len > size)
- len = size - folio_pos(folio);
-
- err = ext4_block_write_begin(handle, folio, 0, len,
- ext4_get_block);
- if (!err) {
- ret = VM_FAULT_SIGBUS;
- if (ext4_journal_folio_buffers(handle, folio, len))
- goto out_error;
- } else {
- folio_unlock(folio);
- }
- }
- ext4_journal_stop(handle);
- if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ /* Start journal and allocate blocks */
+ err = ext4_block_page_mkwrite(inode, folio, get_block);
+ if (err == -EAGAIN ||
+ (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)))
goto retry_alloc;
out_ret:
ret = vmf_fs_error(err);
@@ -6683,8 +6774,4 @@ out:
filemap_invalidate_unlock_shared(mapping);
sb_end_pagefault(inode->i_sb);
return ret;
-out_error:
- folio_unlock(folio);
- ext4_journal_stop(handle);
- goto out;
}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 5668a17458ae..84e3c73952d7 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -980,7 +980,7 @@ group_add_out:
return err;
}
-int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -997,7 +997,7 @@ int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
int ext4_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
u32 flags = fa->flags;
diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c
index d634c12f1984..a9416b20ff64 100644
--- a/fs/ext4/mballoc-test.c
+++ b/fs/ext4/mballoc-test.c
@@ -155,6 +155,7 @@ static struct super_block *mbt_ext4_alloc_super_block(void)
bgl_lock_init(sbi->s_blockgroup_lock);
sbi->s_es = &fsb->es;
+ sbi->s_sb = sb;
sb->s_fs_info = sbi;
up_write(&sb->s_umount);
@@ -802,6 +803,8 @@ static void test_mb_mark_used(struct kunit *test)
KUNIT_ASSERT_EQ(test, ret, 0);
grp->bb_free = EXT4_CLUSTERS_PER_GROUP(sb);
+ grp->bb_largest_free_order = -1;
+ grp->bb_avg_fragment_size_order = -1;
mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
for (i = 0; i < TEST_RANGE_COUNT; i++)
test_mb_mark_used_range(test, &e4b, ranges[i].start,
@@ -875,6 +878,8 @@ static void test_mb_free_blocks(struct kunit *test)
ext4_unlock_group(sb, TEST_GOAL_GROUP);
grp->bb_free = 0;
+ grp->bb_largest_free_order = -1;
+ grp->bb_avg_fragment_size_order = -1;
memset(bitmap, 0xff, sb->s_blocksize);
mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 1e98c5be4e0a..5898d92ba19f 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -132,25 +132,30 @@
* If "mb_optimize_scan" mount option is set, we maintain in memory group info
* structures in two data structures:
*
- * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders)
+ * 1) Array of largest free order xarrays (sbi->s_mb_largest_free_orders)
*
- * Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks)
+ * Locking: Writers use xa_lock, readers use rcu_read_lock.
*
- * This is an array of lists where the index in the array represents the
+ * This is an array of xarrays where the index in the array represents the
* largest free order in the buddy bitmap of the participating group infos of
- * that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total
- * number of buddy bitmap orders possible) number of lists. Group-infos are
- * placed in appropriate lists.
+ * that xarray. So, there are exactly MB_NUM_ORDERS(sb) (which means total
+ * number of buddy bitmap orders possible) number of xarrays. Group-infos are
+ * placed in appropriate xarrays.
*
- * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size)
+ * 2) Average fragment size xarrays (sbi->s_mb_avg_fragment_size)
*
- * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks)
+ * Locking: Writers use xa_lock, readers use rcu_read_lock.
*
- * This is an array of lists where in the i-th list there are groups with
+ * This is an array of xarrays where in the i-th xarray there are groups with
* average fragment size >= 2^i and < 2^(i+1). The average fragment size
* is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments.
- * Note that we don't bother with a special list for completely empty groups
- * so we only have MB_NUM_ORDERS(sb) lists.
+ * Note that we don't bother with a special xarray for completely empty
+ * groups, so we only have MB_NUM_ORDERS(sb) xarrays.
+ *
+ * In each xarray, the index is the block group number and the value is the
+ * block group information; a non-empty value indicates that the block group
+ * is present in the current xarray.
*
* When "mb_optimize_scan" mount option is set, mballoc consults the above data
* structures to decide the order in which groups are to be traversed for
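As an informal aside, the scheme described above can be modeled in a few lines of userspace C: one lookup table per order, indexed by block group number, where a non-NULL slot means the group is currently filed under that order. Plain arrays stand in for the kernel xarrays; all names and sizes below are illustrative, not the kernel API.

#include <stdio.h>

#define NUM_ORDERS 4
#define NUM_GROUPS 8

struct group_info { int group; int largest_free_order; };

/* orders[o][g] plays the role of xa_load(&s_mb_largest_free_orders[o], g) */
static struct group_info *orders[NUM_ORDERS][NUM_GROUPS];

static void move_group(struct group_info *g, int new_order)
{
	if (g->largest_free_order >= 0)		/* xa_erase() */
		orders[g->largest_free_order][g->group] = NULL;
	g->largest_free_order = new_order;
	if (new_order >= 0)			/* xa_insert() */
		orders[new_order][g->group] = g;
}

int main(void)
{
	struct group_info g = { .group = 3, .largest_free_order = -1 };

	move_group(&g, 2);	/* group 3 is now findable under order 2 */
	move_group(&g, 1);	/* free space shrank: re-file under order 1 */
	printf("order 1 holds group 3? %s\n", orders[1][3] ? "yes" : "no");
	return 0;
}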
@@ -420,8 +425,8 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
-static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
- ext4_group_t group, enum criteria cr);
+static int ext4_mb_scan_group(struct ext4_allocation_context *ac,
+ ext4_group_t group);
static int ext4_try_to_trim_range(struct super_block *sb,
struct ext4_buddy *e4b, ext4_grpblk_t start,
@@ -841,132 +846,161 @@ static void
mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- int new_order;
+ int new, old;
- if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_fragments == 0)
+ if (!test_opt2(sb, MB_OPTIMIZE_SCAN))
return;
- new_order = mb_avg_fragment_size_order(sb,
- grp->bb_free / grp->bb_fragments);
- if (new_order == grp->bb_avg_fragment_size_order)
+ old = grp->bb_avg_fragment_size_order;
+ new = grp->bb_fragments == 0 ? -1 :
+ mb_avg_fragment_size_order(sb, grp->bb_free / grp->bb_fragments);
+ if (new == old)
return;
- if (grp->bb_avg_fragment_size_order != -1) {
- write_lock(&sbi->s_mb_avg_fragment_size_locks[
- grp->bb_avg_fragment_size_order]);
- list_del(&grp->bb_avg_fragment_size_node);
- write_unlock(&sbi->s_mb_avg_fragment_size_locks[
- grp->bb_avg_fragment_size_order]);
+ if (old >= 0)
+ xa_erase(&sbi->s_mb_avg_fragment_size[old], grp->bb_group);
+
+ grp->bb_avg_fragment_size_order = new;
+ if (new >= 0) {
+ /*
+ * Cannot use __GFP_NOFAIL because we hold the group lock.
+ * Although allocation for insertion may fail, it's not fatal
+ * as we have linear traversal to fall back on.
+ */
+ int err = xa_insert(&sbi->s_mb_avg_fragment_size[new],
+ grp->bb_group, grp, GFP_ATOMIC);
+ if (err)
+ mb_debug(sb, "insert group: %u to s_mb_avg_fragment_size[%d] failed, err %d",
+ grp->bb_group, new, err);
+ }
+}
+
+static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac,
+ struct xarray *xa,
+ ext4_group_t start, ext4_group_t end)
+{
+ struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ enum criteria cr = ac->ac_criteria;
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
+ unsigned long group = start;
+ struct ext4_group_info *grp;
+
+ if (WARN_ON_ONCE(end > ngroups || start >= end))
+ return 0;
+
+ xa_for_each_range(xa, group, grp, start, end - 1) {
+ int err;
+
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]);
+
+ err = ext4_mb_scan_group(ac, grp->bb_group);
+ if (err || ac->ac_status != AC_STATUS_CONTINUE)
+ return err;
+
+ cond_resched();
}
- grp->bb_avg_fragment_size_order = new_order;
- write_lock(&sbi->s_mb_avg_fragment_size_locks[
- grp->bb_avg_fragment_size_order]);
- list_add_tail(&grp->bb_avg_fragment_size_node,
- &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]);
- write_unlock(&sbi->s_mb_avg_fragment_size_locks[
- grp->bb_avg_fragment_size_order]);
+
+ return 0;
+}
+
+/*
+ * Find a suitable group of given order from the largest free orders xarray.
+ */
+static inline int
+ext4_mb_scan_groups_largest_free_order_range(struct ext4_allocation_context *ac,
+ int order, ext4_group_t start,
+ ext4_group_t end)
+{
+ struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_largest_free_orders[order];
+
+ if (xa_empty(xa))
+ return 0;
+
+ return ext4_mb_scan_groups_xa_range(ac, xa, start, end);
}
/*
* Choose next group by traversing largest_free_order lists. Updates *new_cr if
* cr level needs an update.
*/
-static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac,
- enum criteria *new_cr, ext4_group_t *group)
+static int ext4_mb_scan_groups_p2_aligned(struct ext4_allocation_context *ac,
+ ext4_group_t group)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- struct ext4_group_info *iter;
int i;
+ int ret = 0;
+ ext4_group_t start, end;
- if (ac->ac_status == AC_STATUS_FOUND)
- return;
-
- if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED))
- atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions);
-
+ start = group;
+ end = ext4_get_groups_count(ac->ac_sb);
+wrap_around:
for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
- if (list_empty(&sbi->s_mb_largest_free_orders[i]))
- continue;
- read_lock(&sbi->s_mb_largest_free_orders_locks[i]);
- if (list_empty(&sbi->s_mb_largest_free_orders[i])) {
- read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
- continue;
- }
- list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i],
- bb_largest_free_order_node) {
- if (sbi->s_mb_stats)
- atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]);
- if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) {
- *group = iter->bb_group;
- ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED;
- read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
- return;
- }
- }
- read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
+ ret = ext4_mb_scan_groups_largest_free_order_range(ac, i,
+ start, end);
+ if (ret || ac->ac_status != AC_STATUS_CONTINUE)
+ return ret;
+ }
+ if (start) {
+ end = start;
+ start = 0;
+ goto wrap_around;
}
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
+
/* Increment cr and search again if no group is found */
- *new_cr = CR_GOAL_LEN_FAST;
+ ac->ac_criteria = CR_GOAL_LEN_FAST;
+ return ret;
}
/*
- * Find a suitable group of given order from the average fragments list.
+ * Find a suitable group of given order from the average fragments xarray.
*/
-static struct ext4_group_info *
-ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int order)
+static int
+ext4_mb_scan_groups_avg_frag_order_range(struct ext4_allocation_context *ac,
+ int order, ext4_group_t start,
+ ext4_group_t end)
{
- struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- struct list_head *frag_list = &sbi->s_mb_avg_fragment_size[order];
- rwlock_t *frag_list_lock = &sbi->s_mb_avg_fragment_size_locks[order];
- struct ext4_group_info *grp = NULL, *iter;
- enum criteria cr = ac->ac_criteria;
+ struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_avg_fragment_size[order];
- if (list_empty(frag_list))
- return NULL;
- read_lock(frag_list_lock);
- if (list_empty(frag_list)) {
- read_unlock(frag_list_lock);
- return NULL;
- }
- list_for_each_entry(iter, frag_list, bb_avg_fragment_size_node) {
- if (sbi->s_mb_stats)
- atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]);
- if (likely(ext4_mb_good_group(ac, iter->bb_group, cr))) {
- grp = iter;
- break;
- }
- }
- read_unlock(frag_list_lock);
- return grp;
+ if (xa_empty(xa))
+ return 0;
+
+ return ext4_mb_scan_groups_xa_range(ac, xa, start, end);
}
/*
* Choose next group by traversing average fragment size list of suitable
* order. Updates *new_cr if cr level needs an update.
*/
-static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac,
- enum criteria *new_cr, ext4_group_t *group)
+static int ext4_mb_scan_groups_goal_fast(struct ext4_allocation_context *ac,
+ ext4_group_t group)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- struct ext4_group_info *grp = NULL;
- int i;
-
- if (unlikely(ac->ac_flags & EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED)) {
- if (sbi->s_mb_stats)
- atomic_inc(&sbi->s_bal_goal_fast_bad_suggestions);
+ int i, ret = 0;
+ ext4_group_t start, end;
+
+ start = group;
+ end = ext4_get_groups_count(ac->ac_sb);
+wrap_around:
+ i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
+ for (; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
+ ret = ext4_mb_scan_groups_avg_frag_order_range(ac, i,
+ start, end);
+ if (ret || ac->ac_status != AC_STATUS_CONTINUE)
+ return ret;
}
-
- for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
- i < MB_NUM_ORDERS(ac->ac_sb); i++) {
- grp = ext4_mb_find_good_group_avg_frag_lists(ac, i);
- if (grp) {
- *group = grp->bb_group;
- ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED;
- return;
- }
+ if (start) {
+ end = start;
+ start = 0;
+ goto wrap_around;
}
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
/*
* CR_BEST_AVAIL_LEN works based on the concept that we have
* a larger normalized goal len request which can be trimmed to
@@ -976,9 +1010,11 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *
* See function ext4_mb_normalize_request() (EXT4_MB_HINT_DATA).
*/
if (ac->ac_flags & EXT4_MB_HINT_DATA)
- *new_cr = CR_BEST_AVAIL_LEN;
+ ac->ac_criteria = CR_BEST_AVAIL_LEN;
else
- *new_cr = CR_GOAL_LEN_SLOW;
+ ac->ac_criteria = CR_GOAL_LEN_SLOW;
+
+ return ret;
}
/*
@@ -990,18 +1026,14 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *
* preallocations. However, we make sure that we don't trim the request too
* much and fall to CR_GOAL_LEN_SLOW in that case.
*/
-static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac,
- enum criteria *new_cr, ext4_group_t *group)
+static int ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac,
+ ext4_group_t group)
{
+ int ret = 0;
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- struct ext4_group_info *grp = NULL;
int i, order, min_order;
unsigned long num_stripe_clusters = 0;
-
- if (unlikely(ac->ac_flags & EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED)) {
- if (sbi->s_mb_stats)
- atomic_inc(&sbi->s_bal_best_avail_bad_suggestions);
- }
+ ext4_group_t start, end;
/*
* mb_avg_fragment_size_order() returns order in a way that makes
@@ -1033,6 +1065,9 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
if (1 << min_order < ac->ac_o_ex.fe_len)
min_order = fls(ac->ac_o_ex.fe_len);
+ start = group;
+ end = ext4_get_groups_count(ac->ac_sb);
+wrap_around:
for (i = order; i >= min_order; i--) {
int frag_order;
/*
@@ -1055,17 +1090,24 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
frag_order = mb_avg_fragment_size_order(ac->ac_sb,
ac->ac_g_ex.fe_len);
- grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order);
- if (grp) {
- *group = grp->bb_group;
- ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED;
- return;
- }
+ ret = ext4_mb_scan_groups_avg_frag_order_range(ac, frag_order,
+ start, end);
+ if (ret || ac->ac_status != AC_STATUS_CONTINUE)
+ return ret;
+ }
+ if (start) {
+ end = start;
+ start = 0;
+ goto wrap_around;
}
/* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */
ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
- *new_cr = CR_GOAL_LEN_SLOW;
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
+ ac->ac_criteria = CR_GOAL_LEN_SLOW;
+
+ return ret;
}
static inline int should_optimize_scan(struct ext4_allocation_context *ac)
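To make the trimming above concrete, here is a tiny standalone sketch of the order walk: start from the normalized goal's order and step down, but never below an order that still covers the original request. The lengths are hypothetical and fls_u32() is a local helper, not the kernel's fls().

#include <stdio.h>

static int fls_u32(unsigned int x)	/* position of the highest set bit */
{
	return x ? 32 - __builtin_clz(x) : 0;
}

int main(void)
{
	unsigned int orig_len = 1000;	/* clusters the caller asked for */
	unsigned int goal_len = 2048;	/* normalized power-of-2 goal */
	int order = fls_u32(goal_len) - 1;	/* 11 */
	int min_order = fls_u32(orig_len);	/* 10, since 1 << 10 >= 1000 */

	/* try progressively smaller goals, stopping before we would
	 * hand back less than the original request */
	for (int i = order; i >= min_order; i--)
		printf("try a goal of %u clusters (order %d)\n", 1u << i, i);
	return 0;
}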
@@ -1080,59 +1122,82 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac)
}
/*
- * Return next linear group for allocation.
+ * Advance to the next linear group for allocation.
*/
-static ext4_group_t
-next_linear_group(ext4_group_t group, ext4_group_t ngroups)
+static void next_linear_group(ext4_group_t *group, ext4_group_t ngroups)
{
/*
* Artificially restricted ngroups for non-extent
* files makes group > ngroups possible on first loop.
*/
- return group + 1 >= ngroups ? 0 : group + 1;
+ *group = *group + 1 >= ngroups ? 0 : *group + 1;
}
-/*
- * ext4_mb_choose_next_group: choose next group for allocation.
- *
- * @ac Allocation Context
- * @new_cr This is an output parameter. If the there is no good group
- * available at current CR level, this field is updated to indicate
- * the new cr level that should be used.
- * @group This is an input / output parameter. As an input it indicates the
- * next group that the allocator intends to use for allocation. As
- * output, this field indicates the next group that should be used as
- * determined by the optimization functions.
- * @ngroups Total number of groups
- */
-static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
- enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+static int ext4_mb_scan_groups_linear(struct ext4_allocation_context *ac,
+ ext4_group_t ngroups, ext4_group_t *start, ext4_group_t count)
{
- *new_cr = ac->ac_criteria;
+ int ret, i;
+ enum criteria cr = ac->ac_criteria;
+ struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_group_t group = *start;
- if (!should_optimize_scan(ac)) {
- *group = next_linear_group(*group, ngroups);
- return;
+ for (i = 0; i < count; i++, next_linear_group(&group, ngroups)) {
+ ret = ext4_mb_scan_group(ac, group);
+ if (ret || ac->ac_status != AC_STATUS_CONTINUE)
+ return ret;
+ cond_resched();
}
+ *start = group;
+ if (count == ngroups)
+ ac->ac_criteria++;
+
+ /* Processed all groups and haven't found blocks */
+ if (sbi->s_mb_stats && i == ngroups)
+ atomic64_inc(&sbi->s_bal_cX_failed[cr]);
+
+ return 0;
+}
+
+static int ext4_mb_scan_groups(struct ext4_allocation_context *ac)
+{
+ int ret = 0;
+ ext4_group_t start;
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb);
+
+ /* non-extent files are limited to low blocks/groups */
+ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
+ ngroups = sbi->s_blockfile_groups;
+
+ /* start searching for the right group from the specified goal value */
+ start = ac->ac_g_ex.fe_group;
+ ac->ac_prefetch_grp = start;
+ ac->ac_prefetch_nr = 0;
+
+ if (!should_optimize_scan(ac))
+ return ext4_mb_scan_groups_linear(ac, ngroups, &start, ngroups);
+
/*
* Optimized scanning can return non adjacent groups which can cause
* seek overhead for rotational disks. So try few linear groups before
* trying optimized scan.
*/
- if (ac->ac_groups_linear_remaining) {
- *group = next_linear_group(*group, ngroups);
- ac->ac_groups_linear_remaining--;
- return;
- }
+ if (sbi->s_mb_max_linear_groups)
+ ret = ext4_mb_scan_groups_linear(ac, ngroups, &start,
+ sbi->s_mb_max_linear_groups);
+ if (ret || ac->ac_status != AC_STATUS_CONTINUE)
+ return ret;
- if (*new_cr == CR_POWER2_ALIGNED) {
- ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group);
- } else if (*new_cr == CR_GOAL_LEN_FAST) {
- ext4_mb_choose_next_group_goal_fast(ac, new_cr, group);
- } else if (*new_cr == CR_BEST_AVAIL_LEN) {
- ext4_mb_choose_next_group_best_avail(ac, new_cr, group);
- } else {
+ switch (ac->ac_criteria) {
+ case CR_POWER2_ALIGNED:
+ return ext4_mb_scan_groups_p2_aligned(ac, start);
+ case CR_GOAL_LEN_FAST:
+ return ext4_mb_scan_groups_goal_fast(ac, start);
+ case CR_BEST_AVAIL_LEN:
+ return ext4_mb_scan_groups_best_avail(ac, start);
+ default:
/*
* TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an
* rb tree sorted by bb_free. But until that happens, we should
@@ -1140,6 +1205,8 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
*/
WARN_ON(1);
}
+
+ return 0;
}
/*
@@ -1150,33 +1217,35 @@ static void
mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- int i;
+ int new, old = grp->bb_largest_free_order;
- for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--)
- if (grp->bb_counters[i] > 0)
+ for (new = MB_NUM_ORDERS(sb) - 1; new >= 0; new--)
+ if (grp->bb_counters[new] > 0)
break;
+
/* No need to move between order lists? */
- if (!test_opt2(sb, MB_OPTIMIZE_SCAN) ||
- i == grp->bb_largest_free_order) {
- grp->bb_largest_free_order = i;
+ if (new == old)
return;
- }
- if (grp->bb_largest_free_order >= 0) {
- write_lock(&sbi->s_mb_largest_free_orders_locks[
- grp->bb_largest_free_order]);
- list_del_init(&grp->bb_largest_free_order_node);
- write_unlock(&sbi->s_mb_largest_free_orders_locks[
- grp->bb_largest_free_order]);
+ if (old >= 0) {
+ struct xarray *xa = &sbi->s_mb_largest_free_orders[old];
+
+ if (!xa_empty(xa) && xa_load(xa, grp->bb_group))
+ xa_erase(xa, grp->bb_group);
}
- grp->bb_largest_free_order = i;
- if (grp->bb_largest_free_order >= 0 && grp->bb_free) {
- write_lock(&sbi->s_mb_largest_free_orders_locks[
- grp->bb_largest_free_order]);
- list_add_tail(&grp->bb_largest_free_order_node,
- &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]);
- write_unlock(&sbi->s_mb_largest_free_orders_locks[
- grp->bb_largest_free_order]);
+
+ grp->bb_largest_free_order = new;
+ if (test_opt2(sb, MB_OPTIMIZE_SCAN) && new >= 0 && grp->bb_free) {
+ /*
+ * Cannot use __GFP_NOFAIL because we hold the group lock.
+ * Although allocation for insertion may fail, it's not fatal
+ * as we have linear traversal to fall back on.
+ */
+ int err = xa_insert(&sbi->s_mb_largest_free_orders[new],
+ grp->bb_group, grp, GFP_ATOMIC);
+ if (err)
+ mb_debug(sb, "insert group: %u to s_mb_largest_free_orders[%d] failed, err %d",
+ grp->bb_group, new, err);
}
}
@@ -2167,11 +2236,11 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
folio_get(ac->ac_buddy_folio);
/* store last allocated for subsequent stream allocation */
if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
- spin_lock(&sbi->s_md_lock);
- sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
- sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
- spin_unlock(&sbi->s_md_lock);
+ int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals;
+
+ WRITE_ONCE(sbi->s_mb_last_groups[hash], ac->ac_f_ex.fe_group);
}
+
/*
* As we've just preallocated more space than
* user requested originally, we store allocated
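The hashed goals above trade the old s_md_lock-protected global pair for an array of racy-but-harmless hints, with the slot picked by inode number so unrelated streams rarely collide. A userspace sketch of the idea, with plain loads and stores standing in for READ_ONCE()/WRITE_ONCE() and a made-up array size:

#include <stdio.h>

#define NR_GOALS 4
static unsigned int last_groups[NR_GOALS];

static unsigned int goal_for(unsigned long ino)
{
	return last_groups[ino % NR_GOALS];	/* READ_ONCE() in the kernel */
}

static void record_goal(unsigned long ino, unsigned int group)
{
	last_groups[ino % NR_GOALS] = group;	/* WRITE_ONCE() in the kernel */
}

int main(void)
{
	record_goal(42, 17);	/* inode 42 last allocated from group 17 */
	printf("next stream allocation for inode 42 starts at group %u\n",
	       goal_for(42));
	return 0;
}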
@@ -2571,6 +2640,30 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
}
}
+static void __ext4_mb_scan_group(struct ext4_allocation_context *ac)
+{
+ bool is_stripe_aligned;
+ struct ext4_sb_info *sbi;
+ enum criteria cr = ac->ac_criteria;
+
+ ac->ac_groups_scanned++;
+ if (cr == CR_POWER2_ALIGNED)
+ return ext4_mb_simple_scan_group(ac, ac->ac_e4b);
+
+ sbi = EXT4_SB(ac->ac_sb);
+ is_stripe_aligned = false;
+ if ((sbi->s_stripe >= sbi->s_cluster_ratio) &&
+ !(ac->ac_g_ex.fe_len % EXT4_NUM_B2C(sbi, sbi->s_stripe)))
+ is_stripe_aligned = true;
+
+ if ((cr == CR_GOAL_LEN_FAST || cr == CR_BEST_AVAIL_LEN) &&
+ is_stripe_aligned)
+ ext4_mb_scan_aligned(ac, ac->ac_e4b);
+
+ if (ac->ac_status == AC_STATUS_CONTINUE)
+ ext4_mb_complex_scan_group(ac, ac->ac_e4b);
+}
+
/*
* This is also called BEFORE we load the buddy bitmap.
* Returns either 1 or 0 indicating that the group is either suitable
@@ -2761,6 +2854,37 @@ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
}
/*
+ * Batch reads of the block allocation bitmaps to get
+ * multiple READs in flight; limit prefetching at inexpensive
+ * CR, otherwise mballoc can spend a lot of time loading
+ * imperfect groups
+ */
+static void ext4_mb_might_prefetch(struct ext4_allocation_context *ac,
+ ext4_group_t group)
+{
+ struct ext4_sb_info *sbi;
+
+ if (ac->ac_prefetch_grp != group)
+ return;
+
+ sbi = EXT4_SB(ac->ac_sb);
+ if (ext4_mb_cr_expensive(ac->ac_criteria) ||
+ ac->ac_prefetch_ios < sbi->s_mb_prefetch_limit) {
+ unsigned int nr = sbi->s_mb_prefetch;
+
+ if (ext4_has_feature_flex_bg(ac->ac_sb)) {
+ nr = 1 << sbi->s_log_groups_per_flex;
+ nr -= group & (nr - 1);
+ nr = umin(nr, sbi->s_mb_prefetch);
+ }
+
+ ac->ac_prefetch_nr = nr;
+ ac->ac_prefetch_grp = ext4_mb_prefetch(ac->ac_sb, group, nr,
+ &ac->ac_prefetch_ios);
+ }
+}
+
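The flex_bg clamp in ext4_mb_might_prefetch() keeps one prefetch batch from crossing a flex group boundary. A worked example with hypothetical numbers:

#include <stdio.h>

int main(void)
{
	unsigned int log_groups_per_flex = 4;	/* 16 groups per flex group */
	unsigned int s_mb_prefetch = 32;	/* tunable prefetch limit */
	unsigned int group = 21;		/* current group */

	unsigned int nr = 1u << log_groups_per_flex;	/* 16 */
	nr -= group & (nr - 1);		/* 21 & 15 = 5, so nr = 11 */
	if (nr > s_mb_prefetch)		/* the umin() clamp */
		nr = s_mb_prefetch;

	printf("prefetch %u groups starting at group %u\n", nr, group);
	return 0;
}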
+/*
* Prefetching reads the block bitmap into the buffer cache; but we
* need to make sure that the buddy bitmap in the page cache has been
* initialized. Note that ext4_mb_init_group() will block if the I/O
@@ -2793,24 +2917,58 @@ void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
}
}
+static int ext4_mb_scan_group(struct ext4_allocation_context *ac,
+ ext4_group_t group)
+{
+ int ret;
+ struct super_block *sb = ac->ac_sb;
+ enum criteria cr = ac->ac_criteria;
+
+ ext4_mb_might_prefetch(ac, group);
+
+ /* prevent unnecessary buddy loading. */
+ if (cr < CR_ANY_FREE && spin_is_locked(ext4_group_lock_ptr(sb, group)))
+ return 0;
+
+ /* This now checks without needing the buddy page */
+ ret = ext4_mb_good_group_nolock(ac, group, cr);
+ if (ret <= 0) {
+ if (!ac->ac_first_err)
+ ac->ac_first_err = ret;
+ return 0;
+ }
+
+ ret = ext4_mb_load_buddy(sb, group, ac->ac_e4b);
+ if (ret)
+ return ret;
+
+ /* skip busy group */
+ if (cr >= CR_ANY_FREE)
+ ext4_lock_group(sb, group);
+ else if (!ext4_try_lock_group(sb, group))
+ goto out_unload;
+
+ /* We need to check again after locking the block group. */
+ if (unlikely(!ext4_mb_good_group(ac, group, cr)))
+ goto out_unlock;
+
+ __ext4_mb_scan_group(ac);
+
+out_unlock:
+ ext4_unlock_group(sb, group);
+out_unload:
+ ext4_mb_unload_buddy(ac->ac_e4b);
+ return ret;
+}
+
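The busy-group skip above is an opportunistic trylock: at inexpensive criteria it is cheaper to try the next group than to wait on a contended group lock. A minimal pthread sketch of the pattern (illustrative only; compile with -lpthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER;

static int scan_group_nonblocking(int group)
{
	if (pthread_mutex_trylock(&group_lock) != 0)
		return 0;	/* busy: skip, caller moves to the next group */
	printf("scanning group %d\n", group);
	pthread_mutex_unlock(&group_lock);
	return 1;
}

int main(void)
{
	scan_group_nonblocking(7);
	return 0;
}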
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
- ext4_group_t prefetch_grp = 0, ngroups, group, i;
- enum criteria new_cr, cr = CR_GOAL_LEN_FAST;
- int err = 0, first_err = 0;
- unsigned int nr = 0, prefetch_ios = 0;
- struct ext4_sb_info *sbi;
- struct super_block *sb;
+ ext4_group_t i;
+ int err = 0;
+ struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_buddy e4b;
- int lost;
-
- sb = ac->ac_sb;
- sbi = EXT4_SB(sb);
- ngroups = ext4_get_groups_count(sb);
- /* non-extent files are limited to low blocks/groups */
- if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
- ngroups = sbi->s_blockfile_groups;
BUG_ON(ac->ac_status == AC_STATUS_FOUND);
@@ -2844,11 +3002,11 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
/* if stream allocation is enabled, use global goal */
if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
- /* TBD: may be hot point */
- spin_lock(&sbi->s_md_lock);
- ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
- ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
- spin_unlock(&sbi->s_md_lock);
+ int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals;
+
+ ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]);
+ ac->ac_g_ex.fe_start = -1;
+ ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL;
}
/*
@@ -2856,107 +3014,21 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
* start with CR_GOAL_LEN_FAST, unless it is power of 2
* aligned, in which case let's do that faster approach first.
*/
+ ac->ac_criteria = CR_GOAL_LEN_FAST;
if (ac->ac_2order)
- cr = CR_POWER2_ALIGNED;
-repeat:
- for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
- ac->ac_criteria = cr;
- /*
- * searching for the right group start
- * from the goal value specified
- */
- group = ac->ac_g_ex.fe_group;
- ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
- prefetch_grp = group;
- nr = 0;
-
- for (i = 0, new_cr = cr; i < ngroups; i++,
- ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) {
- int ret = 0;
-
- cond_resched();
- if (new_cr != cr) {
- cr = new_cr;
- goto repeat;
- }
-
- /*
- * Batch reads of the block allocation bitmaps
- * to get multiple READs in flight; limit
- * prefetching at inexpensive CR, otherwise mballoc
- * can spend a lot of time loading imperfect groups
- */
- if ((prefetch_grp == group) &&
- (ext4_mb_cr_expensive(cr) ||
- prefetch_ios < sbi->s_mb_prefetch_limit)) {
- nr = sbi->s_mb_prefetch;
- if (ext4_has_feature_flex_bg(sb)) {
- nr = 1 << sbi->s_log_groups_per_flex;
- nr -= group & (nr - 1);
- nr = min(nr, sbi->s_mb_prefetch);
- }
- prefetch_grp = ext4_mb_prefetch(sb, group,
- nr, &prefetch_ios);
- }
-
- /* This now checks without needing the buddy page */
- ret = ext4_mb_good_group_nolock(ac, group, cr);
- if (ret <= 0) {
- if (!first_err)
- first_err = ret;
- continue;
- }
-
- err = ext4_mb_load_buddy(sb, group, &e4b);
- if (err)
- goto out;
-
- ext4_lock_group(sb, group);
-
- /*
- * We need to check again after locking the
- * block group
- */
- ret = ext4_mb_good_group(ac, group, cr);
- if (ret == 0) {
- ext4_unlock_group(sb, group);
- ext4_mb_unload_buddy(&e4b);
- continue;
- }
+ ac->ac_criteria = CR_POWER2_ALIGNED;
- ac->ac_groups_scanned++;
- if (cr == CR_POWER2_ALIGNED)
- ext4_mb_simple_scan_group(ac, &e4b);
- else {
- bool is_stripe_aligned =
- (sbi->s_stripe >=
- sbi->s_cluster_ratio) &&
- !(ac->ac_g_ex.fe_len %
- EXT4_NUM_B2C(sbi, sbi->s_stripe));
-
- if ((cr == CR_GOAL_LEN_FAST ||
- cr == CR_BEST_AVAIL_LEN) &&
- is_stripe_aligned)
- ext4_mb_scan_aligned(ac, &e4b);
-
- if (ac->ac_status == AC_STATUS_CONTINUE)
- ext4_mb_complex_scan_group(ac, &e4b);
- }
-
- ext4_unlock_group(sb, group);
- ext4_mb_unload_buddy(&e4b);
-
- if (ac->ac_status != AC_STATUS_CONTINUE)
- break;
- }
- /* Processed all groups and haven't found blocks */
- if (sbi->s_mb_stats && i == ngroups)
- atomic64_inc(&sbi->s_bal_cX_failed[cr]);
+ ac->ac_e4b = &e4b;
+ ac->ac_prefetch_ios = 0;
+ ac->ac_first_err = 0;
+repeat:
+ while (ac->ac_criteria < EXT4_MB_NUM_CRS) {
+ err = ext4_mb_scan_groups(ac);
+ if (err)
+ goto out;
- if (i == ngroups && ac->ac_criteria == CR_BEST_AVAIL_LEN)
- /* Reset goal length to original goal length before
- * falling into CR_GOAL_LEN_SLOW */
- ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
+ if (ac->ac_status != AC_STATUS_CONTINUE)
+ break;
}
if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
@@ -2967,6 +3039,8 @@ repeat:
*/
ext4_mb_try_best_found(ac, &e4b);
if (ac->ac_status != AC_STATUS_FOUND) {
+ int lost;
+
/*
* Someone more lucky has already allocated it.
* The only thing we can do is just take first
@@ -2982,23 +3056,27 @@ repeat:
ac->ac_b_ex.fe_len = 0;
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_flags |= EXT4_MB_HINT_FIRST;
- cr = CR_ANY_FREE;
+ ac->ac_criteria = CR_ANY_FREE;
goto repeat;
}
}
- if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND)
+ if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) {
atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
+ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC &&
+ ac->ac_b_ex.fe_group == ac->ac_g_ex.fe_group)
+ atomic_inc(&sbi->s_bal_stream_goals);
+ }
out:
- if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
- err = first_err;
+ if (!err && ac->ac_status != AC_STATUS_FOUND && ac->ac_first_err)
+ err = ac->ac_first_err;
mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
- ac->ac_flags, cr, err);
+ ac->ac_flags, ac->ac_criteria, err);
- if (nr)
- ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
+ if (ac->ac_prefetch_nr)
+ ext4_mb_prefetch_fini(sb, ac->ac_prefetch_grp, ac->ac_prefetch_nr);
return err;
}
@@ -3121,8 +3199,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED]));
seq_printf(seq, "\t\tuseless_loops: %llu\n",
atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED]));
- seq_printf(seq, "\t\tbad_suggestions: %u\n",
- atomic_read(&sbi->s_bal_p2_aligned_bad_suggestions));
/* CR_GOAL_LEN_FAST stats */
seq_puts(seq, "\tcr_goal_fast_stats:\n");
@@ -3135,8 +3211,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST]));
seq_printf(seq, "\t\tuseless_loops: %llu\n",
atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST]));
- seq_printf(seq, "\t\tbad_suggestions: %u\n",
- atomic_read(&sbi->s_bal_goal_fast_bad_suggestions));
/* CR_BEST_AVAIL_LEN stats */
seq_puts(seq, "\tcr_best_avail_stats:\n");
@@ -3150,8 +3224,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN]));
seq_printf(seq, "\t\tuseless_loops: %llu\n",
atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN]));
- seq_printf(seq, "\t\tbad_suggestions: %u\n",
- atomic_read(&sbi->s_bal_best_avail_bad_suggestions));
/* CR_GOAL_LEN_SLOW stats */
seq_puts(seq, "\tcr_goal_slow_stats:\n");
@@ -3181,6 +3253,8 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
seq_printf(seq, "\textents_scanned: %u\n",
atomic_read(&sbi->s_bal_ex_scanned));
seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
+ seq_printf(seq, "\t\tstream_goal_hits: %u\n",
+ atomic_read(&sbi->s_bal_stream_goals));
seq_printf(seq, "\t\tlen_goal_hits: %u\n",
atomic_read(&sbi->s_bal_len_goals));
seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
@@ -3227,6 +3301,7 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
unsigned long position = ((unsigned long) v);
struct ext4_group_info *grp;
unsigned int count;
+ unsigned long idx;
position--;
if (position >= MB_NUM_ORDERS(sb)) {
@@ -3235,11 +3310,8 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
seq_puts(seq, "avg_fragment_size_lists:\n");
count = 0;
- read_lock(&sbi->s_mb_avg_fragment_size_locks[position]);
- list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position],
- bb_avg_fragment_size_node)
+ xa_for_each(&sbi->s_mb_avg_fragment_size[position], idx, grp)
count++;
- read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]);
seq_printf(seq, "\tlist_order_%u_groups: %u\n",
(unsigned int)position, count);
return 0;
@@ -3251,11 +3323,8 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
seq_puts(seq, "max_free_order_lists:\n");
}
count = 0;
- read_lock(&sbi->s_mb_largest_free_orders_locks[position]);
- list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position],
- bb_largest_free_order_node)
+ xa_for_each(&sbi->s_mb_largest_free_orders[position], idx, grp)
count++;
- read_unlock(&sbi->s_mb_largest_free_orders_locks[position]);
seq_printf(seq, "\tlist_order_%u_groups: %u\n",
(unsigned int)position, count);
@@ -3375,8 +3444,6 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
init_rwsem(&meta_group_info[i]->alloc_sem);
meta_group_info[i]->bb_free_root = RB_ROOT;
- INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node);
- INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node);
meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */
meta_group_info[i]->bb_group = group;
@@ -3586,6 +3653,20 @@ static void ext4_discard_work(struct work_struct *work)
ext4_mb_unload_buddy(&e4b);
}
+static inline void ext4_mb_avg_fragment_size_destroy(struct ext4_sb_info *sbi)
+{
+ for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++)
+ xa_destroy(&sbi->s_mb_avg_fragment_size[i]);
+ kfree(sbi->s_mb_avg_fragment_size);
+}
+
+static inline void ext4_mb_largest_free_orders_destroy(struct ext4_sb_info *sbi)
+{
+ for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++)
+ xa_destroy(&sbi->s_mb_largest_free_orders[i]);
+ kfree(sbi->s_mb_largest_free_orders);
+}
+
int ext4_mb_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3631,44 +3712,27 @@ int ext4_mb_init(struct super_block *sb)
} while (i < MB_NUM_ORDERS(sb));
sbi->s_mb_avg_fragment_size =
- kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
+ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct xarray),
GFP_KERNEL);
if (!sbi->s_mb_avg_fragment_size) {
ret = -ENOMEM;
goto out;
}
- sbi->s_mb_avg_fragment_size_locks =
- kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
- GFP_KERNEL);
- if (!sbi->s_mb_avg_fragment_size_locks) {
- ret = -ENOMEM;
- goto out;
- }
- for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
- INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]);
- rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]);
- }
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+ xa_init(&sbi->s_mb_avg_fragment_size[i]);
+
sbi->s_mb_largest_free_orders =
- kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
+ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct xarray),
GFP_KERNEL);
if (!sbi->s_mb_largest_free_orders) {
ret = -ENOMEM;
goto out;
}
- sbi->s_mb_largest_free_orders_locks =
- kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
- GFP_KERNEL);
- if (!sbi->s_mb_largest_free_orders_locks) {
- ret = -ENOMEM;
- goto out;
- }
- for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
- INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]);
- rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]);
- }
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+ xa_init(&sbi->s_mb_largest_free_orders[i]);
spin_lock_init(&sbi->s_md_lock);
- sbi->s_mb_free_pending = 0;
+ atomic_set(&sbi->s_mb_free_pending, 0);
INIT_LIST_HEAD(&sbi->s_freed_data_list[0]);
INIT_LIST_HEAD(&sbi->s_freed_data_list[1]);
INIT_LIST_HEAD(&sbi->s_discard_list);
@@ -3709,10 +3773,19 @@ int ext4_mb_init(struct super_block *sb)
sbi->s_mb_group_prealloc, EXT4_NUM_B2C(sbi, sbi->s_stripe));
}
+ sbi->s_mb_nr_global_goals = umin(num_possible_cpus(),
+ DIV_ROUND_UP(sbi->s_groups_count, 4));
+ sbi->s_mb_last_groups = kcalloc(sbi->s_mb_nr_global_goals,
+ sizeof(ext4_group_t), GFP_KERNEL);
+ if (sbi->s_mb_last_groups == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
if (sbi->s_locality_groups == NULL) {
ret = -ENOMEM;
- goto out;
+ goto out_free_last_groups;
}
for_each_possible_cpu(i) {
struct ext4_locality_group *lg;
@@ -3737,11 +3810,12 @@ int ext4_mb_init(struct super_block *sb)
out_free_locality_groups:
free_percpu(sbi->s_locality_groups);
sbi->s_locality_groups = NULL;
+out_free_last_groups:
+ kfree(sbi->s_mb_last_groups);
+ sbi->s_mb_last_groups = NULL;
out:
- kfree(sbi->s_mb_avg_fragment_size);
- kfree(sbi->s_mb_avg_fragment_size_locks);
- kfree(sbi->s_mb_largest_free_orders);
- kfree(sbi->s_mb_largest_free_orders_locks);
+ ext4_mb_avg_fragment_size_destroy(sbi);
+ ext4_mb_largest_free_orders_destroy(sbi);
kfree(sbi->s_mb_offsets);
sbi->s_mb_offsets = NULL;
kfree(sbi->s_mb_maxs);
@@ -3808,10 +3882,8 @@ void ext4_mb_release(struct super_block *sb)
kvfree(group_info);
rcu_read_unlock();
}
- kfree(sbi->s_mb_avg_fragment_size);
- kfree(sbi->s_mb_avg_fragment_size_locks);
- kfree(sbi->s_mb_largest_free_orders);
- kfree(sbi->s_mb_largest_free_orders_locks);
+ ext4_mb_avg_fragment_size_destroy(sbi);
+ ext4_mb_largest_free_orders_destroy(sbi);
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
iput(sbi->s_buddy_cache);
@@ -3841,6 +3913,7 @@ void ext4_mb_release(struct super_block *sb)
}
free_percpu(sbi->s_locality_groups);
+ kfree(sbi->s_mb_last_groups);
}
static inline int ext4_issue_discard(struct super_block *sb,
@@ -3871,10 +3944,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
/* we expect to find existing buddy because it's pinned */
BUG_ON(err != 0);
- spin_lock(&EXT4_SB(sb)->s_md_lock);
- EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count;
- spin_unlock(&EXT4_SB(sb)->s_md_lock);
-
+ atomic_sub(entry->efd_count, &EXT4_SB(sb)->s_mb_free_pending);
db = e4b.bd_info;
/* there are blocks to put in buddy to make them really free */
count += entry->efd_count;
@@ -6278,28 +6348,63 @@ out:
* are contiguous, AND the extents were freed by the same transaction,
* AND the blocks are associated with the same group.
*/
-static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
- struct ext4_free_data *entry,
- struct ext4_free_data *new_entry,
- struct rb_root *entry_rb_root)
+static inline bool
+ext4_freed_extents_can_be_merged(struct ext4_free_data *entry1,
+ struct ext4_free_data *entry2)
{
- if ((entry->efd_tid != new_entry->efd_tid) ||
- (entry->efd_group != new_entry->efd_group))
- return;
- if (entry->efd_start_cluster + entry->efd_count ==
- new_entry->efd_start_cluster) {
- new_entry->efd_start_cluster = entry->efd_start_cluster;
- new_entry->efd_count += entry->efd_count;
- } else if (new_entry->efd_start_cluster + new_entry->efd_count ==
- entry->efd_start_cluster) {
- new_entry->efd_count += entry->efd_count;
- } else
- return;
+ if (entry1->efd_tid != entry2->efd_tid)
+ return false;
+ if (entry1->efd_start_cluster + entry1->efd_count !=
+ entry2->efd_start_cluster)
+ return false;
+ if (WARN_ON_ONCE(entry1->efd_group != entry2->efd_group))
+ return false;
+ return true;
+}
+
+static inline void
+ext4_merge_freed_extents(struct ext4_sb_info *sbi, struct rb_root *root,
+ struct ext4_free_data *entry1,
+ struct ext4_free_data *entry2)
+{
+ entry1->efd_count += entry2->efd_count;
spin_lock(&sbi->s_md_lock);
- list_del(&entry->efd_list);
+ list_del(&entry2->efd_list);
spin_unlock(&sbi->s_md_lock);
- rb_erase(&entry->efd_node, entry_rb_root);
- kmem_cache_free(ext4_free_data_cachep, entry);
+ rb_erase(&entry2->efd_node, root);
+ kmem_cache_free(ext4_free_data_cachep, entry2);
+}
+
+static inline void
+ext4_try_merge_freed_extent_prev(struct ext4_sb_info *sbi, struct rb_root *root,
+ struct ext4_free_data *entry)
+{
+ struct ext4_free_data *prev;
+ struct rb_node *node;
+
+ node = rb_prev(&entry->efd_node);
+ if (!node)
+ return;
+
+ prev = rb_entry(node, struct ext4_free_data, efd_node);
+ if (ext4_freed_extents_can_be_merged(prev, entry))
+ ext4_merge_freed_extents(sbi, root, prev, entry);
+}
+
+static inline void
+ext4_try_merge_freed_extent_next(struct ext4_sb_info *sbi, struct rb_root *root,
+ struct ext4_free_data *entry)
+{
+ struct ext4_free_data *next;
+ struct rb_node *node;
+
+ node = rb_next(&entry->efd_node);
+ if (!node)
+ return;
+
+ next = rb_entry(node, struct ext4_free_data, efd_node);
+ if (ext4_freed_extents_can_be_merged(entry, next))
+ ext4_merge_freed_extents(sbi, root, entry, next);
}
static noinline_for_stack void
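The merge predicate introduced above can be checked by hand: two freed extents coalesce only when they are adjacent, belong to the same block group, and were freed under the same transaction. A self-contained sketch with hypothetical values:

#include <stdbool.h>
#include <stdio.h>

struct freed { unsigned long tid; int group, start, count; };

static bool can_merge(const struct freed *a, const struct freed *b)
{
	return a->tid == b->tid && a->group == b->group &&
	       a->start + a->count == b->start;
}

int main(void)
{
	struct freed a = { .tid = 7, .group = 1, .start = 100, .count = 50 };
	struct freed b = { .tid = 7, .group = 1, .start = 150, .count = 8 };

	if (can_merge(&a, &b)) {
		a.count += b.count;	/* b is absorbed and freed */
		printf("merged: start %d count %d\n", a.start, a.count);
	}
	return 0;
}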
@@ -6309,11 +6414,12 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
ext4_group_t group = e4b->bd_group;
ext4_grpblk_t cluster;
ext4_grpblk_t clusters = new_entry->efd_count;
- struct ext4_free_data *entry;
+ struct ext4_free_data *entry = NULL;
struct ext4_group_info *db = e4b->bd_info;
struct super_block *sb = e4b->bd_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct rb_node **n = &db->bb_free_root.rb_node, *node;
+ struct rb_root *root = &db->bb_free_root;
+ struct rb_node **n = &root->rb_node;
struct rb_node *parent = NULL, *new_node;
BUG_ON(!ext4_handle_valid(handle));
@@ -6349,27 +6455,30 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
}
}
- rb_link_node(new_node, parent, n);
- rb_insert_color(new_node, &db->bb_free_root);
+ atomic_add(clusters, &sbi->s_mb_free_pending);
+ if (!entry)
+ goto insert;
- /* Now try to see the extent can be merged to left and right */
- node = rb_prev(new_node);
- if (node) {
- entry = rb_entry(node, struct ext4_free_data, efd_node);
- ext4_try_merge_freed_extent(sbi, entry, new_entry,
- &(db->bb_free_root));
+ /* Now see whether the extent can be merged with prev and next */
+ if (ext4_freed_extents_can_be_merged(new_entry, entry)) {
+ entry->efd_start_cluster = cluster;
+ entry->efd_count += new_entry->efd_count;
+ kmem_cache_free(ext4_free_data_cachep, new_entry);
+ ext4_try_merge_freed_extent_prev(sbi, root, entry);
+ return;
}
-
- node = rb_next(new_node);
- if (node) {
- entry = rb_entry(node, struct ext4_free_data, efd_node);
- ext4_try_merge_freed_extent(sbi, entry, new_entry,
- &(db->bb_free_root));
+ if (ext4_freed_extents_can_be_merged(entry, new_entry)) {
+ entry->efd_count += new_entry->efd_count;
+ kmem_cache_free(ext4_free_data_cachep, new_entry);
+ ext4_try_merge_freed_extent_next(sbi, root, entry);
+ return;
}
+insert:
+ rb_link_node(new_node, parent, n);
+ rb_insert_color(new_node, root);
spin_lock(&sbi->s_md_lock);
list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]);
- sbi->s_mb_free_pending += clusters;
spin_unlock(&sbi->s_md_lock);
}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index f8280de3e882..15a049f05d04 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -192,8 +192,13 @@ struct ext4_allocation_context {
*/
ext4_grpblk_t ac_orig_goal_len;
+ ext4_group_t ac_prefetch_grp;
+ unsigned int ac_prefetch_ios;
+ unsigned int ac_prefetch_nr;
+
+ int ac_first_err;
+
__u32 ac_flags; /* allocation hints */
- __u32 ac_groups_linear_remaining;
__u16 ac_groups_scanned;
__u16 ac_found;
__u16 ac_cX_found[EXT4_MB_NUM_CRS];
@@ -204,6 +209,8 @@ struct ext4_allocation_context {
__u8 ac_2order; /* if request is to allocate 2^N blocks and
* N > 0, the field stores N, otherwise 0 */
__u8 ac_op; /* operation, for history only */
+
+ struct ext4_buddy *ac_e4b;
struct folio *ac_bitmap_folio;
struct folio *ac_buddy_folio;
struct ext4_prealloc_space *ac_pa;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 1f8493a56e8f..adae3caf175a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -280,7 +280,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
*/
again:
*err = 0;
- jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
+ jblocks = ext4_meta_trans_blocks(orig_inode, block_len_in_page,
+ block_len_in_page) * 2;
handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks);
if (IS_ERR(handle)) {
*err = PTR_ERR(handle);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index a178ac229489..d83f91b62317 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2915,33 +2915,50 @@ err_unlock_inode:
return err;
}
-struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
- struct ext4_dir_entry_2 *de,
- int blocksize, int csum_size,
- unsigned int parent_ino, int dotdot_real_len)
+int ext4_init_dirblock(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, unsigned int parent_ino,
+ void *inline_buf, int inline_size)
{
+ struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) bh->b_data;
+ size_t blocksize = bh->b_size;
+ int csum_size = 0, header_size;
+
+ if (ext4_has_feature_metadata_csum(inode->i_sb))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
+
de->inode = cpu_to_le32(inode->i_ino);
de->name_len = 1;
de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL),
blocksize);
- strcpy(de->name, ".");
+ memcpy(de->name, ".", 2);
ext4_set_de_type(inode->i_sb, de, S_IFDIR);
de = ext4_next_entry(de, blocksize);
de->inode = cpu_to_le32(parent_ino);
de->name_len = 2;
- if (!dotdot_real_len)
- de->rec_len = ext4_rec_len_to_disk(blocksize -
- (csum_size + ext4_dir_rec_len(1, NULL)),
- blocksize);
- else
+ memcpy(de->name, "..", 3);
+ ext4_set_de_type(inode->i_sb, de, S_IFDIR);
+ if (inline_buf) {
de->rec_len = ext4_rec_len_to_disk(
ext4_dir_rec_len(de->name_len, NULL),
blocksize);
- strcpy(de->name, "..");
- ext4_set_de_type(inode->i_sb, de, S_IFDIR);
+ de = ext4_next_entry(de, blocksize);
+ header_size = (char *)de - bh->b_data;
+ memcpy((void *)de, inline_buf, inline_size);
+ ext4_update_final_de(bh->b_data, inline_size + header_size,
+ blocksize - csum_size);
+ } else {
+ de->rec_len = ext4_rec_len_to_disk(blocksize -
+ (csum_size + ext4_dir_rec_len(1, NULL)),
+ blocksize);
+ }
- return ext4_next_entry(de, blocksize);
+ if (csum_size)
+ ext4_initialize_dirent_tail(bh, blocksize);
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_dirblock");
+ set_buffer_uptodate(bh);
+ set_buffer_verified(bh);
+ return ext4_handle_dirty_dirblock(handle, inode, bh);
}
int ext4_init_new_dir(handle_t *handle, struct inode *dir,
@@ -2950,13 +2967,8 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir,
struct buffer_head *dir_block = NULL;
struct ext4_dir_entry_2 *de;
ext4_lblk_t block = 0;
- unsigned int blocksize = dir->i_sb->s_blocksize;
- int csum_size = 0;
int err;
- if (ext4_has_feature_metadata_csum(dir->i_sb))
- csum_size = sizeof(struct ext4_dir_entry_tail);
-
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
err = ext4_try_create_inline_dir(handle, dir, inode);
if (err < 0 && err != -ENOSPC)
@@ -2965,21 +2977,15 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir,
goto out;
}
+ set_nlink(inode, 2);
inode->i_size = 0;
dir_block = ext4_append(handle, inode, &block);
if (IS_ERR(dir_block))
return PTR_ERR(dir_block);
de = (struct ext4_dir_entry_2 *)dir_block->b_data;
- ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0);
- set_nlink(inode, 2);
- if (csum_size)
- ext4_initialize_dirent_tail(dir_block, blocksize);
-
- BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_dirblock(handle, inode, dir_block);
+ err = ext4_init_dirblock(handle, inode, dir_block, dir->i_ino, NULL, 0);
if (err)
goto out;
- set_buffer_verified(dir_block);
out:
brelse(dir_block);
return err;
@@ -3082,7 +3088,8 @@ bool ext4_empty_dir(struct inode *inode)
de = (struct ext4_dir_entry_2 *) bh->b_data;
if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
0) ||
- le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) {
+ le32_to_cpu(de->inode) != inode->i_ino || de->name_len != 1 ||
+ de->name[0] != '.') {
ext4_warning_inode(inode, "directory missing '.'");
brelse(bh);
return false;
@@ -3091,7 +3098,8 @@ bool ext4_empty_dir(struct inode *inode)
de = ext4_next_entry(de, sb->s_blocksize);
if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
offset) ||
- le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
+ le32_to_cpu(de->inode) == 0 || de->name_len != 2 ||
+ de->name[0] != '.' || de->name[1] != '.') {
ext4_warning_inode(inode, "directory missing '..'");
brelse(bh);
return false;
@@ -3532,7 +3540,7 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data,
bh->b_size, 0) ||
le32_to_cpu(de->inode) != inode->i_ino ||
- strcmp(".", de->name)) {
+ de->name_len != 1 || de->name[0] != '.') {
EXT4_ERROR_INODE(inode, "directory missing '.'");
brelse(bh);
*retval = -EFSCORRUPTED;
@@ -3543,7 +3551,8 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
de = ext4_next_entry(de, inode->i_sb->s_blocksize);
if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data,
bh->b_size, offset) ||
- le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
+ le32_to_cpu(de->inode) == 0 || de->name_len != 2 ||
+ de->name[0] != '.' || de->name[1] != '.') {
EXT4_ERROR_INODE(inode, "directory missing '..'");
brelse(bh);
*retval = -EFSCORRUPTED;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 179e54f3a3b6..3d8b0f6d2dea 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -236,10 +236,12 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head)
static bool ext4_io_end_defer_completion(ext4_io_end_t *io_end)
{
- if (io_end->flag & EXT4_IO_END_UNWRITTEN)
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN &&
+ !list_empty(&io_end->list_vec))
return true;
if (test_opt(io_end->inode->i_sb, DATA_ERR_ABORT) &&
- io_end->flag & EXT4_IO_END_FAILED)
+ io_end->flag & EXT4_IO_END_FAILED &&
+ !ext4_emergency_state(io_end->inode->i_sb))
return true;
return false;
}
@@ -256,6 +258,7 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
WARN_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION));
WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN &&
!io_end->handle && sbi->s_journal);
+ WARN_ON(!io_end->bio);
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
wq = sbi->rsv_conversion_wq;
@@ -318,12 +321,9 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
if (refcount_dec_and_test(&io_end->count)) {
- if (io_end->flag & EXT4_IO_END_FAILED ||
- (io_end->flag & EXT4_IO_END_UNWRITTEN &&
- !list_empty(&io_end->list_vec))) {
- ext4_add_complete_io(io_end);
- return;
- }
+ if (ext4_io_end_defer_completion(io_end))
+ return ext4_add_complete_io(io_end);
+
ext4_release_io_end(io_end);
}
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a7f80ca01174..c7d39da7e733 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3627,7 +3627,7 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly)
*/
static void print_daily_error_info(struct timer_list *t)
{
- struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report);
+ struct ext4_sb_info *sbi = timer_container_of(sbi, t, s_err_report);
struct super_block *sb = sbi->s_sb;
struct ext4_super_block *es = sbi->s_es;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 8d15acbacc20..5a6fe1513fd2 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -338,7 +338,7 @@ xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry,
cmp = name_len - entry->e_name_len;
if (!cmp)
cmp = memcmp(name, entry->e_name, name_len);
- if (cmp <= 0 && (sorted || cmp == 0))
+ if (!cmp || (cmp < 0 && sorted))
break;
}
*pentry = entry;
@@ -962,7 +962,7 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
* so we need to reserve credits for this eventuality
*/
if (inode && ext4_has_inline_data(inode))
- credits += ext4_writepage_trans_blocks(inode) + 1;
+ credits += ext4_chunk_trans_extent(inode, 1) + 1;
/* We are done if ea_inode feature is not enabled. */
if (!ext4_has_feature_ea_inode(sb))
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 31e892842625..711ad80b38d0 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -3519,8 +3519,10 @@ reserve_block:
return 0;
}
-static int f2fs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
+static int f2fs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, struct folio **foliop,
+ void **fsdata)
{
struct inode *inode = mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -3656,7 +3658,7 @@ fail:
return err;
}
-static int f2fs_write_end(struct file *file,
+static int f2fs_write_end(const struct kiocb *iocb,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 9333a22b9a01..c78464792ceb 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -3615,9 +3615,9 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count);
int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag,
bool readonly, bool need_lock);
int f2fs_precache_extents(struct inode *inode);
-int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int f2fs_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
int f2fs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 6bd3de64f2a8..c677230699fd 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -35,6 +35,17 @@
#include <trace/events/f2fs.h>
#include <uapi/linux/f2fs.h>
+static void f2fs_zero_post_eof_page(struct inode *inode, loff_t new_size)
+{
+ loff_t old_size = i_size_read(inode);
+
+ if (old_size >= new_size)
+ return;
+
+ /* zero or drop pages only in the range [old_size, new_size] */
+ truncate_pagecache(inode, old_size);
+}
+
static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
@@ -103,8 +114,13 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
+ filemap_invalidate_lock(inode->i_mapping);
+ f2fs_zero_post_eof_page(inode, (folio->index + 1) << PAGE_SHIFT);
+ filemap_invalidate_unlock(inode->i_mapping);
+
file_update_time(vmf->vma->vm_file);
filemap_invalidate_lock_shared(inode->i_mapping);
+
folio_lock(folio);
if (unlikely(folio->mapping != inode->i_mapping ||
folio_pos(folio) > i_size_read(inode) ||
@@ -532,8 +548,9 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
return -EINVAL;
}
-static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int f2fs_file_mmap_prepare(struct vm_area_desc *desc)
{
+ struct file *file = desc->file;
struct inode *inode = file_inode(file);
if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
@@ -543,7 +560,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
return -EOPNOTSUPP;
file_accessed(file);
- vma->vm_ops = &f2fs_file_vm_ops;
+ desc->vm_ops = &f2fs_file_vm_ops;
f2fs_down_read(&F2FS_I(inode)->i_sem);
set_inode_flag(inode, FI_MMAP_FILE);
@@ -1109,6 +1126,8 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
f2fs_down_write(&fi->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
+ if (attr->ia_size > old_size)
+ f2fs_zero_post_eof_page(inode, attr->ia_size);
truncate_setsize(inode, attr->ia_size);
if (attr->ia_size <= old_size)
@@ -1227,6 +1246,10 @@ static int f2fs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (ret)
return ret;
+ filemap_invalidate_lock(inode->i_mapping);
+ f2fs_zero_post_eof_page(inode, offset + len);
+ filemap_invalidate_unlock(inode->i_mapping);
+
pg_start = ((unsigned long long) offset) >> PAGE_SHIFT;
pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT;
@@ -1510,6 +1533,8 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len)
f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
+ f2fs_zero_post_eof_page(inode, offset + len);
+
f2fs_lock_op(sbi);
f2fs_drop_extent_tree(inode);
truncate_pagecache(inode, offset);
@@ -1631,6 +1656,10 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
if (ret)
return ret;
+ filemap_invalidate_lock(mapping);
+ f2fs_zero_post_eof_page(inode, offset + len);
+ filemap_invalidate_unlock(mapping);
+
pg_start = ((unsigned long long) offset) >> PAGE_SHIFT;
pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT;
@@ -1762,6 +1791,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
/* avoid gc operation during block exchange */
f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(mapping);
+
+ f2fs_zero_post_eof_page(inode, offset + len);
truncate_pagecache(inode, offset);
while (!ret && idx > pg_start) {
@@ -1819,6 +1850,10 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset,
if (err)
return err;
+ filemap_invalidate_lock(inode->i_mapping);
+ f2fs_zero_post_eof_page(inode, offset + len);
+ filemap_invalidate_unlock(inode->i_mapping);
+
f2fs_balance_fs(sbi, true);
pg_start = ((unsigned long long)offset) >> PAGE_SHIFT;
@@ -3356,7 +3391,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid)
}
#endif
-int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int f2fs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -3380,7 +3415,7 @@ int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
int f2fs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
u32 fsflags = fa->flags, mask = F2FS_SETTABLE_FS_FL;
@@ -4860,6 +4895,10 @@ static ssize_t f2fs_write_checks(struct kiocb *iocb, struct iov_iter *from)
err = file_modified(file);
if (err)
return err;
+
+ filemap_invalidate_lock(inode->i_mapping);
+ f2fs_zero_post_eof_page(inode, iocb->ki_pos + iov_iter_count(from));
+ filemap_invalidate_unlock(inode->i_mapping);
return count;
}
@@ -5376,7 +5415,7 @@ const struct file_operations f2fs_file_operations = {
.iopoll = iocb_bio_iopoll,
.open = f2fs_file_open,
.release = f2fs_release_file,
- .mmap = f2fs_file_mmap,
+ .mmap_prepare = f2fs_file_mmap_prepare,
.flush = f2fs_file_flush,
.fsync = f2fs_sync_file,
.fallocate = f2fs_fallocate,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 1cb4cba7f961..bfe104db284e 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -2078,7 +2078,6 @@ write_node:
if (!__write_node_folio(folio, false, &submitted,
wbc, do_balance, io_type, NULL)) {
- folio_unlock(folio);
folio_batch_release(&fbatch);
ret = -EIO;
goto out;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index e887e9ab7472..4fc49a614fb8 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -204,7 +204,7 @@ const struct file_operations fat_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.release = fat_file_release,
.unlocked_ioctl = fat_generic_ioctl,
.compat_ioctl = compat_ptr_ioctl,
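[Editorial note] The .mmap to .mmap_prepare conversions here and in f2fs above move filesystems to the newer VFS entry point that runs before the VMA is inserted. A rough sketch of the hook's shape, assuming the vm_area_desc fields shown (see include/linux/fs.h for the authoritative definition):

/* Sketch only: ->mmap_prepare() validates the request and fills in
 * vm_ops on the descriptor; the core then constructs the VMA, so the
 * filesystem never sees a half-built vm_area_struct.
 * example_vm_ops is hypothetical. */
static int example_mmap_prepare(struct vm_area_desc *desc)
{
	desc->vm_ops = &example_vm_ops;
	return 0;
}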
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 3852bb66358c..9648ed097816 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -219,13 +219,14 @@ static void fat_write_failed(struct address_space *mapping, loff_t to)
}
}
-static int fat_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int fat_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
int err;
- err = cont_write_begin(file, mapping, pos, len,
+ err = cont_write_begin(iocb, mapping, pos, len,
foliop, fsdata, fat_get_block,
&MSDOS_I(mapping->host)->mmu_private);
if (err < 0)
@@ -233,13 +234,14 @@ static int fat_write_begin(struct file *file, struct address_space *mapping,
return err;
}
-static int fat_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+static int fat_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
int err;
- err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ err = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
if (err < len)
fat_write_failed(mapping, pos + len);
if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 23e9b9371ec3..0b920ee40a7f 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -646,7 +646,7 @@ static const struct inode_operations msdos_dir_inode_operations = {
static void setup(struct super_block *sb)
{
MSDOS_SB(sb)->dir_ops = &msdos_dir_inode_operations;
- sb->s_d_op = &msdos_dentry_operations;
+ set_default_d_op(sb, &msdos_dentry_operations);
sb->s_flags |= SB_NOATIME;
}
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index dd910edd2404..5dbc4cbb8fce 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -1187,9 +1187,9 @@ static void setup(struct super_block *sb)
{
MSDOS_SB(sb)->dir_ops = &vfat_dir_inode_operations;
if (MSDOS_SB(sb)->options.name_check != 's')
- sb->s_d_op = &vfat_ci_dentry_ops;
+ set_default_d_op(sb, &vfat_ci_dentry_ops);
else
- sb->s_d_op = &vfat_dentry_ops;
+ set_default_d_op(sb, &vfat_dentry_ops);
}
static int vfat_fill_super(struct super_block *sb, struct fs_context *fc)
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 3e092ae6d142..7c236f64cdea 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -88,7 +88,7 @@ static long do_sys_name_to_handle(const struct path *path,
if (fh_flags & EXPORT_FH_CONNECTABLE) {
handle->handle_type |= FILEID_IS_CONNECTABLE;
if (d_is_dir(path->dentry))
- fh_flags |= FILEID_IS_DIR;
+ handle->handle_type |= FILEID_IS_DIR;
}
retval = 0;
}
@@ -168,23 +168,28 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
return err;
}
-static int get_path_from_fd(int fd, struct path *root)
+static int get_path_anchor(int fd, struct path *root)
{
- if (fd == AT_FDCWD) {
- struct fs_struct *fs = current->fs;
- spin_lock(&fs->lock);
- *root = fs->pwd;
- path_get(root);
- spin_unlock(&fs->lock);
- } else {
+ if (fd >= 0) {
CLASS(fd, f)(fd);
if (fd_empty(f))
return -EBADF;
*root = fd_file(f)->f_path;
path_get(root);
+ return 0;
}
- return 0;
+ if (fd == AT_FDCWD) {
+ get_fs_pwd(current->fs, root);
+ return 0;
+ }
+
+ if (fd == FD_PIDFS_ROOT) {
+ pidfs_get_root(root);
+ return 0;
+ }
+
+ return -EBADF;
}
static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
@@ -323,13 +328,24 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
{
int retval = 0;
struct file_handle f_handle;
- struct file_handle *handle = NULL;
+ struct file_handle *handle __free(kfree) = NULL;
struct handle_to_path_ctx ctx = {};
const struct export_operations *eops;
- retval = get_path_from_fd(mountdirfd, &ctx.root);
+ if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
+ return -EFAULT;
+
+ if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
+ (f_handle.handle_bytes == 0))
+ return -EINVAL;
+
+ if (f_handle.handle_type < 0 ||
+ FILEID_USER_FLAGS(f_handle.handle_type) & ~FILEID_VALID_USER_FLAGS)
+ return -EINVAL;
+
+ retval = get_path_anchor(mountdirfd, &ctx.root);
if (retval)
- goto out_err;
+ return retval;
eops = ctx.root.mnt->mnt_sb->s_export_op;
if (eops && eops->permission)
@@ -339,21 +355,6 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
if (retval)
goto out_path;
- if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
- retval = -EFAULT;
- goto out_path;
- }
- if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
- (f_handle.handle_bytes == 0)) {
- retval = -EINVAL;
- goto out_path;
- }
- if (f_handle.handle_type < 0 ||
- FILEID_USER_FLAGS(f_handle.handle_type) & ~FILEID_VALID_USER_FLAGS) {
- retval = -EINVAL;
- goto out_path;
- }
-
handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes),
GFP_KERNEL);
if (!handle) {
@@ -366,7 +367,7 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
&ufh->f_handle,
f_handle.handle_bytes)) {
retval = -EFAULT;
- goto out_handle;
+ goto out_path;
}
/*
@@ -384,11 +385,8 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
handle->handle_type &= ~FILEID_USER_FLAGS_MASK;
retval = do_handle_to_path(handle, path, &ctx);
-out_handle:
- kfree(handle);
out_path:
path_put(&ctx.root);
-out_err:
return retval;
}
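[Editorial note] Two things change in handle_to_path(): the user copy and validation of the handle header now happen before any path reference is taken, and the handle buffer uses the scope-based __free(kfree) cleanup from linux/cleanup.h, which is what lets the out_handle label disappear. A minimal standalone sketch of that cleanup idiom:

/* Sketch only: kfree() runs automatically when buf leaves scope, so
 * every early return doubles as the error path. */
#include <linux/cleanup.h>
#include <linux/slab.h>
#include <linux/string.h>

static int parse_blob(const void *src, size_t len)
{
	void *buf __free(kfree) = kmalloc(len, GFP_KERNEL);

	if (!buf)
		return -ENOMEM;
	memcpy(buf, src, len);
	if (len < 8)
		return -EINVAL;	/* buf freed here automatically */
	return 0;		/* and here, since ownership stays local */
}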
diff --git a/fs/file.c b/fs/file.c
index 3a3146664cf3..6d2275c3be9c 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -197,6 +197,21 @@ static struct fdtable *alloc_fdtable(unsigned int slots_wanted)
return ERR_PTR(-EMFILE);
}
+ /*
+ * Check if the allocation size would exceed INT_MAX. kvmalloc_array()
+ * and kvmalloc() will warn if the allocation size is greater than
+ * INT_MAX, as filp_cachep objects are not __GFP_NOWARN.
+ *
+ * This can happen when sysctl_nr_open is set to a very high value and
+ * a process tries to use a file descriptor near that limit. For example,
+ * if sysctl_nr_open is set to 1073741816 (0x3ffffff8) - which is what
+ * systemd typically sets it to - then trying to use a file descriptor
+ * close to that value will require allocating a file descriptor table
+ * that exceeds 8GB in size.
+ */
+ if (unlikely(nr > INT_MAX / sizeof(struct file *)))
+ return ERR_PTR(-EMFILE);
+
fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
if (!fdt)
goto out;
@@ -1198,8 +1213,12 @@ bool file_seek_cur_needs_f_lock(struct file *file)
if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared)
return false;
- VFS_WARN_ON_ONCE((file_count(file) > 1) &&
- !mutex_is_locked(&file->f_pos_lock));
+ /*
+ * Note that we are not guaranteed to be called after fdget_pos() on
+ * this file object, in which case the caller is expected to provide the
+ * appropriate locking.
+ */
+
return true;
}
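[Editorial note] A worked instance of the guard added above, on a 64-bit kernel where sizeof(struct file *) == 8:

/* nr = 1073741816 (systemd's usual sysctl_nr_open)
 * nr * 8 = 8589934528 bytes (~8 GiB), which exceeds INT_MAX
 * (2147483647), so alloc_fdtable() now fails fast: */
if (unlikely(nr > INT_MAX / sizeof(struct file *)))
	return ERR_PTR(-EMFILE);	/* instead of a kvmalloc warning */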
diff --git a/fs/file_attr.c b/fs/file_attr.c
new file mode 100644
index 000000000000..12424d4945d0
--- /dev/null
+++ b/fs/file_attr.c
@@ -0,0 +1,498 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fs.h>
+#include <linux/security.h>
+#include <linux/fscrypt.h>
+#include <linux/fileattr.h>
+#include <linux/export.h>
+#include <linux/syscalls.h>
+#include <linux/namei.h>
+
+#include "internal.h"
+
+/**
+ * fileattr_fill_xflags - initialize fileattr with xflags
+ * @fa: fileattr pointer
+ * @xflags: FS_XFLAG_* flags
+ *
+ * Set ->fsx_xflags, ->fsx_valid and ->flags (translated xflags). All
+ * other fields are zeroed.
+ */
+void fileattr_fill_xflags(struct file_kattr *fa, u32 xflags)
+{
+ memset(fa, 0, sizeof(*fa));
+ fa->fsx_valid = true;
+ fa->fsx_xflags = xflags;
+ if (fa->fsx_xflags & FS_XFLAG_IMMUTABLE)
+ fa->flags |= FS_IMMUTABLE_FL;
+ if (fa->fsx_xflags & FS_XFLAG_APPEND)
+ fa->flags |= FS_APPEND_FL;
+ if (fa->fsx_xflags & FS_XFLAG_SYNC)
+ fa->flags |= FS_SYNC_FL;
+ if (fa->fsx_xflags & FS_XFLAG_NOATIME)
+ fa->flags |= FS_NOATIME_FL;
+ if (fa->fsx_xflags & FS_XFLAG_NODUMP)
+ fa->flags |= FS_NODUMP_FL;
+ if (fa->fsx_xflags & FS_XFLAG_DAX)
+ fa->flags |= FS_DAX_FL;
+ if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
+ fa->flags |= FS_PROJINHERIT_FL;
+}
+EXPORT_SYMBOL(fileattr_fill_xflags);
+
+/**
+ * fileattr_fill_flags - initialize fileattr with flags
+ * @fa: fileattr pointer
+ * @flags: FS_*_FL flags
+ *
+ * Set ->flags, ->flags_valid and ->fsx_xflags (translated flags).
+ * All other fields are zeroed.
+ */
+void fileattr_fill_flags(struct file_kattr *fa, u32 flags)
+{
+ memset(fa, 0, sizeof(*fa));
+ fa->flags_valid = true;
+ fa->flags = flags;
+ if (fa->flags & FS_SYNC_FL)
+ fa->fsx_xflags |= FS_XFLAG_SYNC;
+ if (fa->flags & FS_IMMUTABLE_FL)
+ fa->fsx_xflags |= FS_XFLAG_IMMUTABLE;
+ if (fa->flags & FS_APPEND_FL)
+ fa->fsx_xflags |= FS_XFLAG_APPEND;
+ if (fa->flags & FS_NODUMP_FL)
+ fa->fsx_xflags |= FS_XFLAG_NODUMP;
+ if (fa->flags & FS_NOATIME_FL)
+ fa->fsx_xflags |= FS_XFLAG_NOATIME;
+ if (fa->flags & FS_DAX_FL)
+ fa->fsx_xflags |= FS_XFLAG_DAX;
+ if (fa->flags & FS_PROJINHERIT_FL)
+ fa->fsx_xflags |= FS_XFLAG_PROJINHERIT;
+}
+EXPORT_SYMBOL(fileattr_fill_flags);
+
+/**
+ * vfs_fileattr_get - retrieve miscellaneous file attributes
+ * @dentry: the object to retrieve from
+ * @fa: fileattr pointer
+ *
+ * Call the i_op->fileattr_get() callback, if it exists.
+ *
+ * Return: 0 on success, or a negative error on failure.
+ */
+int vfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ int error;
+
+ if (!inode->i_op->fileattr_get)
+ return -EOPNOTSUPP;
+
+ error = security_inode_file_getattr(dentry, fa);
+ if (error)
+ return error;
+
+ return inode->i_op->fileattr_get(dentry, fa);
+}
+EXPORT_SYMBOL(vfs_fileattr_get);
+
+static void fileattr_to_file_attr(const struct file_kattr *fa,
+ struct file_attr *fattr)
+{
+ __u32 mask = FS_XFLAGS_MASK;
+
+ memset(fattr, 0, sizeof(struct file_attr));
+ fattr->fa_xflags = fa->fsx_xflags & mask;
+ fattr->fa_extsize = fa->fsx_extsize;
+ fattr->fa_nextents = fa->fsx_nextents;
+ fattr->fa_projid = fa->fsx_projid;
+ fattr->fa_cowextsize = fa->fsx_cowextsize;
+}
+
+/**
+ * copy_fsxattr_to_user - copy fsxattr to userspace.
+ * @fa: fileattr pointer
+ * @ufa: fsxattr user pointer
+ *
+ * Return: 0 on success, or -EFAULT on failure.
+ */
+int copy_fsxattr_to_user(const struct file_kattr *fa, struct fsxattr __user *ufa)
+{
+ struct fsxattr xfa;
+ __u32 mask = FS_XFLAGS_MASK;
+
+ memset(&xfa, 0, sizeof(xfa));
+ xfa.fsx_xflags = fa->fsx_xflags & mask;
+ xfa.fsx_extsize = fa->fsx_extsize;
+ xfa.fsx_nextents = fa->fsx_nextents;
+ xfa.fsx_projid = fa->fsx_projid;
+ xfa.fsx_cowextsize = fa->fsx_cowextsize;
+
+ if (copy_to_user(ufa, &xfa, sizeof(xfa)))
+ return -EFAULT;
+
+ return 0;
+}
+EXPORT_SYMBOL(copy_fsxattr_to_user);
+
+static int file_attr_to_fileattr(const struct file_attr *fattr,
+ struct file_kattr *fa)
+{
+ __u64 mask = FS_XFLAGS_MASK;
+
+ if (fattr->fa_xflags & ~mask)
+ return -EINVAL;
+
+ fileattr_fill_xflags(fa, fattr->fa_xflags);
+ fa->fsx_xflags &= ~FS_XFLAG_RDONLY_MASK;
+ fa->fsx_extsize = fattr->fa_extsize;
+ fa->fsx_projid = fattr->fa_projid;
+ fa->fsx_cowextsize = fattr->fa_cowextsize;
+
+ return 0;
+}
+
+static int copy_fsxattr_from_user(struct file_kattr *fa,
+ struct fsxattr __user *ufa)
+{
+ struct fsxattr xfa;
+ __u32 mask = FS_XFLAGS_MASK;
+
+ if (copy_from_user(&xfa, ufa, sizeof(xfa)))
+ return -EFAULT;
+
+ if (xfa.fsx_xflags & ~mask)
+ return -EOPNOTSUPP;
+
+ fileattr_fill_xflags(fa, xfa.fsx_xflags);
+ fa->fsx_xflags &= ~FS_XFLAG_RDONLY_MASK;
+ fa->fsx_extsize = xfa.fsx_extsize;
+ fa->fsx_nextents = xfa.fsx_nextents;
+ fa->fsx_projid = xfa.fsx_projid;
+ fa->fsx_cowextsize = xfa.fsx_cowextsize;
+
+ return 0;
+}
+
+/*
+ * Generic function to check FS_IOC_FSSETXATTR/FS_IOC_SETFLAGS values and reject
+ * any invalid configurations.
+ *
+ * Note: must be called with inode lock held.
+ */
+static int fileattr_set_prepare(struct inode *inode,
+ const struct file_kattr *old_ma,
+ struct file_kattr *fa)
+{
+ int err;
+
+ /*
+ * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+ * the relevant capability.
+ */
+ if ((fa->flags ^ old_ma->flags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) &&
+ !capable(CAP_LINUX_IMMUTABLE))
+ return -EPERM;
+
+ err = fscrypt_prepare_setflags(inode, old_ma->flags, fa->flags);
+ if (err)
+ return err;
+
+ /*
+ * Project Quota ID state is only allowed to change from within the init
+ * namespace. Enforce that restriction only if we are trying to change
+ * the quota ID state. Everything else is allowed in user namespaces.
+ */
+ if (current_user_ns() != &init_user_ns) {
+ if (old_ma->fsx_projid != fa->fsx_projid)
+ return -EINVAL;
+ if ((old_ma->fsx_xflags ^ fa->fsx_xflags) &
+ FS_XFLAG_PROJINHERIT)
+ return -EINVAL;
+ } else {
+ /*
+ * Caller is allowed to change the project ID. If it is being
+ * changed, make sure that the new value is valid.
+ */
+ if (old_ma->fsx_projid != fa->fsx_projid &&
+ !projid_valid(make_kprojid(&init_user_ns, fa->fsx_projid)))
+ return -EINVAL;
+ }
+
+ /* Check extent size hints. */
+ if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
+ !S_ISDIR(inode->i_mode))
+ return -EINVAL;
+
+ if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) &&
+ !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+ return -EINVAL;
+
+ /*
+ * It is only valid to set the DAX flag on regular files and
+ * directories on filesystems.
+ */
+ if ((fa->fsx_xflags & FS_XFLAG_DAX) &&
+ !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
+ return -EINVAL;
+
+ /* Extent size hints of zero turn off the flags. */
+ if (fa->fsx_extsize == 0)
+ fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
+ if (fa->fsx_cowextsize == 0)
+ fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
+
+ return 0;
+}
+
+/**
+ * vfs_fileattr_set - change miscellaneous file attributes
+ * @idmap: idmap of the mount
+ * @dentry: the object to change
+ * @fa: fileattr pointer
+ *
+ * After verifying permissions, call the i_op->fileattr_set() callback, if
+ * it exists.
+ *
+ * Verifying attributes involves retrieving current attributes with
+ * i_op->fileattr_get(); this also allows initializing attributes that have
+ * not been set by the caller to current values. The inode lock is held
+ * throughout to prevent racing with another instance.
+ *
+ * Return: 0 on success, or a negative error on failure.
+ */
+int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct file_kattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ struct file_kattr old_ma = {};
+ int err;
+
+ if (!inode->i_op->fileattr_set)
+ return -EOPNOTSUPP;
+
+ if (!inode_owner_or_capable(idmap, inode))
+ return -EPERM;
+
+ inode_lock(inode);
+ err = vfs_fileattr_get(dentry, &old_ma);
+ if (!err) {
+ /* initialize missing bits from old_ma */
+ if (fa->flags_valid) {
+ fa->fsx_xflags |= old_ma.fsx_xflags & ~FS_XFLAG_COMMON;
+ fa->fsx_extsize = old_ma.fsx_extsize;
+ fa->fsx_nextents = old_ma.fsx_nextents;
+ fa->fsx_projid = old_ma.fsx_projid;
+ fa->fsx_cowextsize = old_ma.fsx_cowextsize;
+ } else {
+ fa->flags |= old_ma.flags & ~FS_COMMON_FL;
+ }
+
+ err = fileattr_set_prepare(inode, &old_ma, fa);
+ if (err)
+ goto out;
+ err = security_inode_file_setattr(dentry, fa);
+ if (err)
+ goto out;
+ err = inode->i_op->fileattr_set(idmap, dentry, fa);
+ if (err)
+ goto out;
+ }
+
+out:
+ inode_unlock(inode);
+ return err;
+}
+EXPORT_SYMBOL(vfs_fileattr_set);
+
+int ioctl_getflags(struct file *file, unsigned int __user *argp)
+{
+ struct file_kattr fa = { .flags_valid = true }; /* hint only */
+ int err;
+
+ err = vfs_fileattr_get(file->f_path.dentry, &fa);
+ if (err == -EOPNOTSUPP)
+ err = -ENOIOCTLCMD;
+ if (!err)
+ err = put_user(fa.flags, argp);
+ return err;
+}
+EXPORT_SYMBOL(ioctl_getflags);
+
+int ioctl_setflags(struct file *file, unsigned int __user *argp)
+{
+ struct mnt_idmap *idmap = file_mnt_idmap(file);
+ struct dentry *dentry = file->f_path.dentry;
+ struct file_kattr fa;
+ unsigned int flags;
+ int err;
+
+ err = get_user(flags, argp);
+ if (!err) {
+ err = mnt_want_write_file(file);
+ if (!err) {
+ fileattr_fill_flags(&fa, flags);
+ err = vfs_fileattr_set(idmap, dentry, &fa);
+ mnt_drop_write_file(file);
+ if (err == -EOPNOTSUPP)
+ err = -ENOIOCTLCMD;
+ }
+ }
+ return err;
+}
+EXPORT_SYMBOL(ioctl_setflags);
+
+int ioctl_fsgetxattr(struct file *file, void __user *argp)
+{
+ struct file_kattr fa = { .fsx_valid = true }; /* hint only */
+ int err;
+
+ err = vfs_fileattr_get(file->f_path.dentry, &fa);
+ if (err == -EOPNOTSUPP)
+ err = -ENOIOCTLCMD;
+ if (!err)
+ err = copy_fsxattr_to_user(&fa, argp);
+
+ return err;
+}
+EXPORT_SYMBOL(ioctl_fsgetxattr);
+
+int ioctl_fssetxattr(struct file *file, void __user *argp)
+{
+ struct mnt_idmap *idmap = file_mnt_idmap(file);
+ struct dentry *dentry = file->f_path.dentry;
+ struct file_kattr fa;
+ int err;
+
+ err = copy_fsxattr_from_user(&fa, argp);
+ if (!err) {
+ err = mnt_want_write_file(file);
+ if (!err) {
+ err = vfs_fileattr_set(idmap, dentry, &fa);
+ mnt_drop_write_file(file);
+ if (err == -EOPNOTSUPP)
+ err = -ENOIOCTLCMD;
+ }
+ }
+ return err;
+}
+EXPORT_SYMBOL(ioctl_fssetxattr);
+
+SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename,
+ struct file_attr __user *, ufattr, size_t, usize,
+ unsigned int, at_flags)
+{
+ struct path filepath __free(path_put) = {};
+ struct filename *name __free(putname) = NULL;
+ unsigned int lookup_flags = 0;
+ struct file_attr fattr;
+ struct file_kattr fa;
+ int error;
+
+ BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0);
+ BUILD_BUG_ON(sizeof(struct file_attr) != FILE_ATTR_SIZE_LATEST);
+
+ if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+ return -EINVAL;
+
+ if (!(at_flags & AT_SYMLINK_NOFOLLOW))
+ lookup_flags |= LOOKUP_FOLLOW;
+
+ if (usize > PAGE_SIZE)
+ return -E2BIG;
+
+ if (usize < FILE_ATTR_SIZE_VER0)
+ return -EINVAL;
+
+ name = getname_maybe_null(filename, at_flags);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ if (!name && dfd >= 0) {
+ CLASS(fd, f)(dfd);
+ if (fd_empty(f))
+ return -EBADF;
+
+ filepath = fd_file(f)->f_path;
+ path_get(&filepath);
+ } else {
+ error = filename_lookup(dfd, name, lookup_flags, &filepath,
+ NULL);
+ if (error)
+ return error;
+ }
+
+ error = vfs_fileattr_get(filepath.dentry, &fa);
+ if (error)
+ return error;
+
+ fileattr_to_file_attr(&fa, &fattr);
+ error = copy_struct_to_user(ufattr, usize, &fattr,
+ sizeof(struct file_attr), NULL);
+
+ return error;
+}
+
+SYSCALL_DEFINE5(file_setattr, int, dfd, const char __user *, filename,
+ struct file_attr __user *, ufattr, size_t, usize,
+ unsigned int, at_flags)
+{
+ struct path filepath __free(path_put) = {};
+ struct filename *name __free(putname) = NULL;
+ unsigned int lookup_flags = 0;
+ struct file_attr fattr;
+ struct file_kattr fa;
+ int error;
+
+ BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0);
+ BUILD_BUG_ON(sizeof(struct file_attr) != FILE_ATTR_SIZE_LATEST);
+
+ if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+ return -EINVAL;
+
+ if (!(at_flags & AT_SYMLINK_NOFOLLOW))
+ lookup_flags |= LOOKUP_FOLLOW;
+
+ if (usize > PAGE_SIZE)
+ return -E2BIG;
+
+ if (usize < FILE_ATTR_SIZE_VER0)
+ return -EINVAL;
+
+ error = copy_struct_from_user(&fattr, sizeof(struct file_attr), ufattr,
+ usize);
+ if (error)
+ return error;
+
+ error = file_attr_to_fileattr(&fattr, &fa);
+ if (error)
+ return error;
+
+ name = getname_maybe_null(filename, at_flags);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ if (!name && dfd >= 0) {
+ CLASS(fd, f)(dfd);
+ if (fd_empty(f))
+ return -EBADF;
+
+ filepath = fd_file(f)->f_path;
+ path_get(&filepath);
+ } else {
+ error = filename_lookup(dfd, name, lookup_flags, &filepath,
+ NULL);
+ if (error)
+ return error;
+ }
+
+ error = mnt_want_write(filepath.mnt);
+ if (!error) {
+ error = vfs_fileattr_set(mnt_idmap(filepath.mnt),
+ filepath.dentry, &fa);
+ mnt_drop_write(filepath.mnt);
+ }
+
+ return error;
+}
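[Editorial note] For context, a hedged userspace sketch of calling the new file_getattr() syscall added by this file. The syscall number and the uapi struct layout are assumptions here, mirroring the five fields fileattr_to_file_attr() fills, and may differ once headers land:

/* Sketch only, not part of the patch. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/types.h>

struct file_attr {		/* assumed uapi layout */
	__u64 fa_xflags;
	__u32 fa_extsize;
	__u32 fa_nextents;
	__u32 fa_projid;
	__u32 fa_cowextsize;
};

int main(int argc, char **argv)
{
	struct file_attr fattr;

	memset(&fattr, 0, sizeof(fattr));
	/* __NR_file_getattr assumed to be provided by updated headers */
	if (syscall(__NR_file_getattr, AT_FDCWD, argv[1],
		    &fattr, sizeof(fattr), 0) != 0) {
		perror("file_getattr");
		return 1;
	}
	printf("xflags=%#llx projid=%u\n",
	       (unsigned long long)fattr.fa_xflags, fattr.fa_projid);
	return 0;
}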
diff --git a/fs/file_table.c b/fs/file_table.c
index 138114d64307..81c72576e548 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -52,17 +52,20 @@ struct backing_file {
};
};
-static inline struct backing_file *backing_file(struct file *f)
-{
- return container_of(f, struct backing_file, file);
-}
+#define backing_file(f) container_of(f, struct backing_file, file)
-struct path *backing_file_user_path(struct file *f)
+struct path *backing_file_user_path(const struct file *f)
{
return &backing_file(f)->user_path;
}
EXPORT_SYMBOL_GPL(backing_file_user_path);
+void backing_file_set_user_path(struct file *f, const struct path *path)
+{
+ backing_file(f)->user_path = *path;
+}
+EXPORT_SYMBOL_GPL(backing_file_set_user_path);
+
static inline void file_free(struct file *f)
{
security_file_free(f);
@@ -196,7 +199,7 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
file_ref_init(&f->f_ref, 1);
/*
* Disable permission and pre-content events for all files by default.
- * They may be enabled later by file_set_fsnotify_mode_from_watchers().
+ * They may be enabled later by fsnotify_open_perm_and_set_mode().
*/
file_set_fsnotify_mode(f, FMODE_NONOTIFY_PERM);
return 0;
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 64c2d0814ed6..28be762ac1c6 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -17,12 +17,10 @@ void set_fs_root(struct fs_struct *fs, const struct path *path)
struct path old_root;
path_get(path);
- spin_lock(&fs->lock);
- write_seqcount_begin(&fs->seq);
+ write_seqlock(&fs->seq);
old_root = fs->root;
fs->root = *path;
- write_seqcount_end(&fs->seq);
- spin_unlock(&fs->lock);
+ write_sequnlock(&fs->seq);
if (old_root.dentry)
path_put(&old_root);
}
@@ -36,12 +34,10 @@ void set_fs_pwd(struct fs_struct *fs, const struct path *path)
struct path old_pwd;
path_get(path);
- spin_lock(&fs->lock);
- write_seqcount_begin(&fs->seq);
+ write_seqlock(&fs->seq);
old_pwd = fs->pwd;
fs->pwd = *path;
- write_seqcount_end(&fs->seq);
- spin_unlock(&fs->lock);
+ write_sequnlock(&fs->seq);
if (old_pwd.dentry)
path_put(&old_pwd);
@@ -67,16 +63,14 @@ void chroot_fs_refs(const struct path *old_root, const struct path *new_root)
fs = p->fs;
if (fs) {
int hits = 0;
- spin_lock(&fs->lock);
- write_seqcount_begin(&fs->seq);
+ write_seqlock(&fs->seq);
hits += replace_path(&fs->root, old_root, new_root);
hits += replace_path(&fs->pwd, old_root, new_root);
- write_seqcount_end(&fs->seq);
while (hits--) {
count++;
path_get(new_root);
}
- spin_unlock(&fs->lock);
+ write_sequnlock(&fs->seq);
}
task_unlock(p);
}
@@ -99,10 +93,10 @@ void exit_fs(struct task_struct *tsk)
if (fs) {
int kill;
task_lock(tsk);
- spin_lock(&fs->lock);
+ read_seqlock_excl(&fs->seq);
tsk->fs = NULL;
kill = !--fs->users;
- spin_unlock(&fs->lock);
+ read_sequnlock_excl(&fs->seq);
task_unlock(tsk);
if (kill)
free_fs_struct(fs);
@@ -116,16 +110,15 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
if (fs) {
fs->users = 1;
fs->in_exec = 0;
- spin_lock_init(&fs->lock);
- seqcount_spinlock_init(&fs->seq, &fs->lock);
+ seqlock_init(&fs->seq);
fs->umask = old->umask;
- spin_lock(&old->lock);
+ read_seqlock_excl(&old->seq);
fs->root = old->root;
path_get(&fs->root);
fs->pwd = old->pwd;
path_get(&fs->pwd);
- spin_unlock(&old->lock);
+ read_sequnlock_excl(&old->seq);
}
return fs;
}
@@ -140,10 +133,10 @@ int unshare_fs_struct(void)
return -ENOMEM;
task_lock(current);
- spin_lock(&fs->lock);
+ read_seqlock_excl(&fs->seq);
kill = !--fs->users;
current->fs = new_fs;
- spin_unlock(&fs->lock);
+ read_sequnlock_excl(&fs->seq);
task_unlock(current);
if (kill)
@@ -162,7 +155,6 @@ EXPORT_SYMBOL(current_umask);
/* to be mentioned only in INIT_TASK */
struct fs_struct init_fs = {
.users = 1,
- .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock),
- .seq = SEQCNT_SPINLOCK_ZERO(init_fs.seq, &init_fs.lock),
+ .seq = __SEQLOCK_UNLOCKED(init_fs.seq),
.umask = 0022,
};
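[Editorial note] The fs_struct conversion folds the old spinlock plus seqcount pair into a single seqlock: writers use write_seqlock()/write_sequnlock(), exclusive non-retrying sections use read_seqlock_excl(), and lockless readers can retry on contention. A reader-side sketch, assuming (as in the kernel's path-walk code) that the snapshot is revalidated when a writer races:

/* Sketch only: retry the snapshot if a writer raced with us. A real
 * caller must also hold rcu_read_lock() or take a path reference
 * before using the result. */
static void snapshot_root(struct fs_struct *fs, struct path *root)
{
	unsigned int seq;

	do {
		seq = read_seqbegin(&fs->seq);
		*root = fs->root;
	} while (read_seqretry(&fs->seq, seq));
}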
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index ca215a3cba3e..a774166264de 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -2,6 +2,7 @@
config FUSE_FS
tristate "FUSE (Filesystem in Userspace) support"
select FS_POSIX_ACL
+ select FS_IOMAP
help
With FUSE it is possible to implement a fully functional filesystem
in a userspace program.
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 2a730d88cc3b..bb407705603c 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -11,6 +11,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs_context.h>
+#include <linux/namei.h>
#define FUSE_CTL_SUPER_MAGIC 0x65735543
@@ -212,7 +213,6 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
struct dentry *dentry;
struct inode *inode;
- BUG_ON(fc->ctl_ndents >= FUSE_CTL_NUM_DENTRIES);
dentry = d_alloc_name(parent, name);
if (!dentry)
return NULL;
@@ -236,8 +236,6 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
inode->i_private = fc;
d_add(dentry, inode);
- fc->ctl_dentry[fc->ctl_ndents++] = dentry;
-
return dentry;
}
@@ -280,27 +278,29 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
return -ENOMEM;
}
+static void remove_one(struct dentry *dentry)
+{
+ d_inode(dentry)->i_private = NULL;
+}
+
/*
* Remove a connection from the control filesystem (if it exists).
* Caller must hold fuse_mutex
*/
void fuse_ctl_remove_conn(struct fuse_conn *fc)
{
- int i;
+ struct dentry *dentry;
+ char name[32];
if (!fuse_control_sb || fc->no_control)
return;
- for (i = fc->ctl_ndents - 1; i >= 0; i--) {
- struct dentry *dentry = fc->ctl_dentry[i];
- d_inode(dentry)->i_private = NULL;
- if (!i) {
- /* Get rid of submounts: */
- d_invalidate(dentry);
- }
- dput(dentry);
+ sprintf(name, "%u", fc->dev);
+ dentry = lookup_noperm_positive_unlocked(&QSTR(name), fuse_control_sb->s_root);
+ if (!IS_ERR(dentry)) {
+ simple_recursive_removal(dentry, remove_one);
+ dput(dentry); // paired with lookup_noperm_positive_unlocked()
}
- drop_nlink(d_inode(fuse_control_sb->s_root));
}
static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fsc)
@@ -346,12 +346,8 @@ static int fuse_ctl_init_fs_context(struct fs_context *fsc)
static void fuse_ctl_kill_sb(struct super_block *sb)
{
- struct fuse_conn *fc;
-
mutex_lock(&fuse_mutex);
fuse_control_sb = NULL;
- list_for_each_entry(fc, &fuse_conn_list, entry)
- fc->ctl_ndents = 0;
mutex_unlock(&fuse_mutex);
kill_litter_super(sb);
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 0502bf3cdf6a..ac6d4c1064cc 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -10,7 +10,6 @@
#include <linux/dax.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
-#include <linux/pfn_t.h>
#include <linux/iomap.h>
#include <linux/interval_tree.h>
@@ -757,7 +756,7 @@ static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, unsigned int order,
vm_fault_t ret;
struct inode *inode = file_inode(vmf->vma->vm_file);
struct super_block *sb = inode->i_sb;
- pfn_t pfn;
+ unsigned long pfn;
int error = 0;
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_conn_dax *fcd = fc->dax;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6dcbaa218b7a..e80cd8f2c049 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -23,6 +23,7 @@
#include <linux/swap.h>
#include <linux/splice.h>
#include <linux/sched.h>
+#include <linux/seq_file.h>
#define CREATE_TRACE_POINTS
#include "fuse_trace.h"
@@ -45,7 +46,7 @@ bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list)
return time_is_before_jiffies(req->create_time + fc->timeout.req_timeout);
}
-bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing)
+static bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing)
{
int i;
@@ -816,7 +817,7 @@ static int unlock_request(struct fuse_req *req)
return err;
}
-void fuse_copy_init(struct fuse_copy_state *cs, int write,
+void fuse_copy_init(struct fuse_copy_state *cs, bool write,
struct iov_iter *iter)
{
memset(cs, 0, sizeof(*cs));
@@ -955,10 +956,10 @@ static int fuse_check_folio(struct folio *folio)
* folio that was originally in @pagep will lose a reference and the new
* folio returned in @pagep will carry a reference.
*/
-static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
+static int fuse_try_move_folio(struct fuse_copy_state *cs, struct folio **foliop)
{
int err;
- struct folio *oldfolio = page_folio(*pagep);
+ struct folio *oldfolio = *foliop;
struct folio *newfolio;
struct pipe_buffer *buf = cs->pipebufs;
@@ -979,7 +980,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
cs->pipebufs++;
cs->nr_segs--;
- if (cs->len != PAGE_SIZE)
+ if (cs->len != folio_size(oldfolio))
goto out_fallback;
if (!pipe_buf_try_steal(cs->pipe, buf))
@@ -1025,7 +1026,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
if (test_bit(FR_ABORTED, &cs->req->flags))
err = -ENOENT;
else
- *pagep = &newfolio->page;
+ *foliop = newfolio;
spin_unlock(&cs->req->waitq.lock);
if (err) {
@@ -1058,8 +1059,8 @@ out_fallback:
goto out_put_old;
}
-static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
- unsigned offset, unsigned count)
+static int fuse_ref_folio(struct fuse_copy_state *cs, struct folio *folio,
+ unsigned offset, unsigned count)
{
struct pipe_buffer *buf;
int err;
@@ -1067,17 +1068,17 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
if (cs->nr_segs >= cs->pipe->max_usage)
return -EIO;
- get_page(page);
+ folio_get(folio);
err = unlock_request(cs->req);
if (err) {
- put_page(page);
+ folio_put(folio);
return err;
}
fuse_copy_finish(cs);
buf = cs->pipebufs;
- buf->page = page;
+ buf->page = &folio->page;
buf->offset = offset;
buf->len = count;
@@ -1089,20 +1090,24 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
}
/*
- * Copy a page in the request to/from the userspace buffer. Must be
+ * Copy a folio in the request to/from the userspace buffer. Must be
* done atomically
*/
-static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
- unsigned offset, unsigned count, int zeroing)
+static int fuse_copy_folio(struct fuse_copy_state *cs, struct folio **foliop,
+ unsigned offset, unsigned count, int zeroing)
{
int err;
- struct page *page = *pagep;
+ struct folio *folio = *foliop;
+ size_t size;
- if (page && zeroing && count < PAGE_SIZE)
- clear_highpage(page);
+ if (folio) {
+ size = folio_size(folio);
+ if (zeroing && count < size)
+ folio_zero_range(folio, 0, size);
+ }
while (count) {
- if (cs->write && cs->pipebufs && page) {
+ if (cs->write && cs->pipebufs && folio) {
/*
* Can't control lifetime of pipe buffers, so always
* copy user pages.
@@ -1112,12 +1117,12 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
if (err)
return err;
} else {
- return fuse_ref_page(cs, page, offset, count);
+ return fuse_ref_folio(cs, folio, offset, count);
}
} else if (!cs->len) {
- if (cs->move_pages && page &&
- offset == 0 && count == PAGE_SIZE) {
- err = fuse_try_move_page(cs, pagep);
+ if (cs->move_folios && folio &&
+ offset == 0 && count == size) {
+ err = fuse_try_move_folio(cs, foliop);
if (err <= 0)
return err;
} else {
@@ -1126,22 +1131,30 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
return err;
}
}
- if (page) {
- void *mapaddr = kmap_local_page(page);
- void *buf = mapaddr + offset;
- offset += fuse_copy_do(cs, &buf, &count);
+ if (folio) {
+ void *mapaddr = kmap_local_folio(folio, offset);
+ void *buf = mapaddr;
+ unsigned int copy = count;
+ unsigned int bytes_copied;
+
+ if (folio_test_highmem(folio) && count > PAGE_SIZE - offset_in_page(offset))
+ copy = PAGE_SIZE - offset_in_page(offset);
+
+ bytes_copied = fuse_copy_do(cs, &buf, &copy);
kunmap_local(mapaddr);
+ offset += bytes_copied;
+ count -= bytes_copied;
} else
offset += fuse_copy_do(cs, NULL, &count);
}
- if (page && !cs->write)
- flush_dcache_page(page);
+ if (folio && !cs->write)
+ flush_dcache_folio(folio);
return 0;
}
-/* Copy pages in the request to/from userspace buffer */
-static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
- int zeroing)
+/* Copy folios in the request to/from userspace buffer */
+static int fuse_copy_folios(struct fuse_copy_state *cs, unsigned nbytes,
+ int zeroing)
{
unsigned i;
struct fuse_req *req = cs->req;
@@ -1151,23 +1164,12 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
int err;
unsigned int offset = ap->descs[i].offset;
unsigned int count = min(nbytes, ap->descs[i].length);
- struct page *orig, *pagep;
- orig = pagep = &ap->folios[i]->page;
-
- err = fuse_copy_page(cs, &pagep, offset, count, zeroing);
+ err = fuse_copy_folio(cs, &ap->folios[i], offset, count, zeroing);
if (err)
return err;
nbytes -= count;
-
- /*
- * fuse_copy_page may have moved a page from a pipe instead of
- * copying into our given page, so update the folios if it was
- * replaced.
- */
- if (pagep != orig)
- ap->folios[i] = page_folio(pagep);
}
return 0;
}
@@ -1197,7 +1199,7 @@ int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
for (i = 0; !err && i < numargs; i++) {
struct fuse_arg *arg = &args[i];
if (i == numargs - 1 && argpages)
- err = fuse_copy_pages(cs, arg->size, zeroing);
+ err = fuse_copy_folios(cs, arg->size, zeroing);
else
err = fuse_copy_one(cs, arg->value, arg->size);
}
@@ -1538,7 +1540,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to)
if (!user_backed_iter(to))
return -EINVAL;
- fuse_copy_init(&cs, 1, to);
+ fuse_copy_init(&cs, true, to);
return fuse_dev_do_read(fud, file, &cs, iov_iter_count(to));
}
@@ -1561,7 +1563,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
if (!bufs)
return -ENOMEM;
- fuse_copy_init(&cs, 1, NULL);
+ fuse_copy_init(&cs, true, NULL);
cs.pipebufs = bufs;
cs.pipe = pipe;
ret = fuse_dev_do_read(fud, in, &cs, len);
@@ -1786,20 +1788,23 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
num = outarg.size;
while (num) {
struct folio *folio;
- struct page *page;
- unsigned int this_num;
+ unsigned int folio_offset;
+ unsigned int nr_bytes;
+ unsigned int nr_pages;
folio = filemap_grab_folio(mapping, index);
err = PTR_ERR(folio);
if (IS_ERR(folio))
goto out_iput;
- page = &folio->page;
- this_num = min_t(unsigned, num, folio_size(folio) - offset);
- err = fuse_copy_page(cs, &page, offset, this_num, 0);
+ folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset;
+ nr_bytes = min_t(unsigned, num, folio_size(folio) - folio_offset);
+ nr_pages = (offset + nr_bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ err = fuse_copy_folio(cs, &folio, folio_offset, nr_bytes, 0);
if (!folio_test_uptodate(folio) && !err && offset == 0 &&
- (this_num == folio_size(folio) || file_size == end)) {
- folio_zero_segment(folio, this_num, folio_size(folio));
+ (nr_bytes == folio_size(folio) || file_size == end)) {
+ folio_zero_segment(folio, nr_bytes, folio_size(folio));
folio_mark_uptodate(folio);
}
folio_unlock(folio);
@@ -1808,9 +1813,9 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
if (err)
goto out_iput;
- num -= this_num;
+ num -= nr_bytes;
offset = 0;
- index++;
+ index += nr_pages;
}
err = 0;
@@ -1849,7 +1854,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
unsigned int num;
unsigned int offset;
size_t total_len = 0;
- unsigned int num_pages, cur_pages = 0;
+ unsigned int num_pages;
struct fuse_conn *fc = fm->fc;
struct fuse_retrieve_args *ra;
size_t args_size = sizeof(*ra);
@@ -1867,6 +1872,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
num_pages = min(num_pages, fc->max_pages);
+ num = min(num, num_pages << PAGE_SHIFT);
args_size += num_pages * (sizeof(ap->folios[0]) + sizeof(ap->descs[0]));
@@ -1887,25 +1893,29 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
index = outarg->offset >> PAGE_SHIFT;
- while (num && cur_pages < num_pages) {
+ while (num) {
struct folio *folio;
- unsigned int this_num;
+ unsigned int folio_offset;
+ unsigned int nr_bytes;
+ unsigned int nr_pages;
folio = filemap_get_folio(mapping, index);
if (IS_ERR(folio))
break;
- this_num = min_t(unsigned, num, PAGE_SIZE - offset);
+ folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset;
+ nr_bytes = min(folio_size(folio) - folio_offset, num);
+ nr_pages = (offset + nr_bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
ap->folios[ap->num_folios] = folio;
- ap->descs[ap->num_folios].offset = offset;
- ap->descs[ap->num_folios].length = this_num;
+ ap->descs[ap->num_folios].offset = folio_offset;
+ ap->descs[ap->num_folios].length = nr_bytes;
ap->num_folios++;
- cur_pages++;
offset = 0;
- num -= this_num;
- total_len += this_num;
- index++;
+ num -= nr_bytes;
+ total_len += nr_bytes;
+ index += nr_pages;
}
ra->inarg.offset = outarg->offset;
ra->inarg.size = total_len;
@@ -2021,11 +2031,24 @@ static int fuse_notify_resend(struct fuse_conn *fc)
return 0;
}
+/*
+ * Increments the fuse connection epoch. This will result in dentries from
+ * previous epochs being invalidated.
+ *
+ * XXX optimization: add call to shrink_dcache_sb()?
+ */
+static int fuse_notify_inc_epoch(struct fuse_conn *fc)
+{
+ atomic_inc(&fc->epoch);
+
+ return 0;
+}
+
static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
unsigned int size, struct fuse_copy_state *cs)
{
- /* Don't try to move pages (yet) */
- cs->move_pages = 0;
+ /* Don't try to move folios (yet) */
+ cs->move_folios = false;
switch (code) {
case FUSE_NOTIFY_POLL:
@@ -2049,6 +2072,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
case FUSE_NOTIFY_RESEND:
return fuse_notify_resend(fc);
+ case FUSE_NOTIFY_INC_EPOCH:
+ return fuse_notify_inc_epoch(fc);
+
default:
fuse_copy_finish(cs);
return -EINVAL;
@@ -2173,7 +2199,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
spin_unlock(&fpq->lock);
cs->req = req;
if (!req->args->page_replace)
- cs->move_pages = 0;
+ cs->move_folios = false;
if (oh.error)
err = nbytes != sizeof(oh) ? -EINVAL : 0;
@@ -2211,7 +2237,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
if (!user_backed_iter(from))
return -EINVAL;
- fuse_copy_init(&cs, 0, from);
+ fuse_copy_init(&cs, false, from);
return fuse_dev_do_write(fud, &cs, iov_iter_count(from));
}
@@ -2285,13 +2311,13 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
}
pipe_unlock(pipe);
- fuse_copy_init(&cs, 0, NULL);
+ fuse_copy_init(&cs, false, NULL);
cs.pipebufs = bufs;
cs.nr_segs = nbuf;
cs.pipe = pipe;
if (flags & SPLICE_F_MOVE)
- cs.move_pages = 1;
+ cs.move_folios = true;
ret = fuse_dev_do_write(fud, &cs, len);
@@ -2602,6 +2628,17 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
}
}
+#ifdef CONFIG_PROC_FS
+static void fuse_dev_show_fdinfo(struct seq_file *seq, struct file *file)
+{
+ struct fuse_dev *fud = fuse_get_dev(file);
+ if (!fud)
+ return;
+
+ seq_printf(seq, "fuse_connection:\t%u\n", fud->fc->dev);
+}
+#endif
+
const struct file_operations fuse_dev_operations = {
.owner = THIS_MODULE,
.open = fuse_dev_open,
@@ -2617,6 +2654,9 @@ const struct file_operations fuse_dev_operations = {
#ifdef CONFIG_FUSE_IO_URING
.uring_cmd = fuse_uring_cmd,
#endif
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = fuse_dev_show_fdinfo,
+#endif
};
EXPORT_SYMBOL_GPL(fuse_dev_operations);
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index accdce2977c5..249b210becb1 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -140,6 +140,21 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring)
}
}
+static bool ent_list_request_expired(struct fuse_conn *fc, struct list_head *list)
+{
+ struct fuse_ring_ent *ent;
+ struct fuse_req *req;
+
+ ent = list_first_entry_or_null(list, struct fuse_ring_ent, list);
+ if (!ent)
+ return false;
+
+ req = ent->fuse_req;
+
+ return time_is_before_jiffies(req->create_time +
+ fc->timeout.req_timeout);
+}
+
bool fuse_uring_request_expired(struct fuse_conn *fc)
{
struct fuse_ring *ring = fc->ring;
@@ -157,7 +172,8 @@ bool fuse_uring_request_expired(struct fuse_conn *fc)
spin_lock(&queue->lock);
if (fuse_request_expired(fc, &queue->fuse_req_queue) ||
fuse_request_expired(fc, &queue->fuse_req_bg_queue) ||
- fuse_fpq_processing_expired(fc, queue->fpq.processing)) {
+ ent_list_request_expired(fc, &queue->ent_w_req_queue) ||
+ ent_list_request_expired(fc, &queue->ent_in_userspace)) {
spin_unlock(&queue->lock);
return true;
}
@@ -494,7 +510,7 @@ static void fuse_uring_cancel(struct io_uring_cmd *cmd,
spin_lock(&queue->lock);
if (ent->state == FRRS_AVAILABLE) {
ent->state = FRRS_USERSPACE;
- list_move(&ent->list, &queue->ent_in_userspace);
+ list_move_tail(&ent->list, &queue->ent_in_userspace);
need_cmd_done = true;
ent->cmd = NULL;
}
@@ -577,8 +593,8 @@ static int fuse_uring_copy_from_ring(struct fuse_ring *ring,
if (err)
return err;
- fuse_copy_init(&cs, 0, &iter);
- cs.is_uring = 1;
+ fuse_copy_init(&cs, false, &iter);
+ cs.is_uring = true;
cs.req = req;
return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz);
@@ -607,8 +623,8 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req,
return err;
}
- fuse_copy_init(&cs, 1, &iter);
- cs.is_uring = 1;
+ fuse_copy_init(&cs, true, &iter);
+ cs.is_uring = true;
cs.req = req;
if (num_args > 0) {
@@ -714,7 +730,7 @@ static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent,
cmd = ent->cmd;
ent->cmd = NULL;
ent->state = FRRS_USERSPACE;
- list_move(&ent->list, &queue->ent_in_userspace);
+ list_move_tail(&ent->list, &queue->ent_in_userspace);
spin_unlock(&queue->lock);
io_uring_cmd_done(cmd, 0, 0, issue_flags);
@@ -764,7 +780,7 @@ static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent,
clear_bit(FR_PENDING, &req->flags);
ent->fuse_req = req;
ent->state = FRRS_FUSE_REQ;
- list_move(&ent->list, &queue->ent_w_req_queue);
+ list_move_tail(&ent->list, &queue->ent_w_req_queue);
fuse_uring_add_to_pq(ent, req);
}
@@ -1180,7 +1196,7 @@ static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd,
spin_lock(&queue->lock);
ent->state = FRRS_USERSPACE;
- list_move(&ent->list, &queue->ent_in_userspace);
+ list_move_tail(&ent->list, &queue->ent_in_userspace);
ent->cmd = NULL;
spin_unlock(&queue->lock);
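[Editorial note] The list_move() to list_move_tail() changes above are what make the new expiry check cheap: ent_w_req_queue and ent_in_userspace stay ordered oldest-first, so ent_list_request_expired() only has to test the head entry rather than walk the list. Condensed:

/* Sketch of the invariant: entries are appended at the tail, so the
 * head is always the oldest; if it has not expired, nothing behind
 * it has either. */
ent = list_first_entry_or_null(list, struct fuse_ring_ent, list);
if (!ent)
	return false;
return time_is_before_jiffies(ent->fuse_req->create_time +
			       fc->timeout.req_timeout);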
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 7d7ed45cb3e9..2d817d7cab26 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -200,9 +200,14 @@ static int fuse_dentry_revalidate(struct inode *dir, const struct qstr *name,
{
struct inode *inode;
struct fuse_mount *fm;
+ struct fuse_conn *fc;
struct fuse_inode *fi;
int ret;
+ fc = get_fuse_conn_super(dir->i_sb);
+ if (entry->d_time < atomic_read(&fc->epoch))
+ goto invalid;
+
inode = d_inode_rcu(entry);
if (inode && fuse_is_bad(inode))
goto invalid;
@@ -333,13 +338,6 @@ const struct dentry_operations fuse_dentry_operations = {
.d_automount = fuse_dentry_automount,
};
-const struct dentry_operations fuse_root_dentry_operations = {
-#if BITS_PER_LONG < 64
- .d_init = fuse_dentry_init,
- .d_release = fuse_dentry_release,
-#endif
-};
-
int fuse_valid_type(int m)
{
return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) ||
@@ -412,16 +410,20 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
unsigned int flags)
{
- int err;
struct fuse_entry_out outarg;
+ struct fuse_conn *fc;
struct inode *inode;
struct dentry *newent;
+ int err, epoch;
bool outarg_valid = true;
bool locked;
if (fuse_is_bad(dir))
return ERR_PTR(-EIO);
+ fc = get_fuse_conn_super(dir->i_sb);
+ epoch = atomic_read(&fc->epoch);
+
locked = fuse_lock_inode(dir);
err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name,
&outarg, &inode);
@@ -443,6 +445,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
goto out_err;
entry = newent ? newent : entry;
+ entry->d_time = epoch;
if (outarg_valid)
fuse_change_entry_timeout(entry, &outarg);
else
@@ -616,7 +619,6 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *entry, struct file *file,
unsigned int flags, umode_t mode, u32 opcode)
{
- int err;
struct inode *inode;
struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
@@ -626,11 +628,13 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir,
struct fuse_entry_out outentry;
struct fuse_inode *fi;
struct fuse_file *ff;
+ int epoch, err;
bool trunc = flags & O_TRUNC;
/* Userspace expects S_IFREG in create mode */
BUG_ON((mode & S_IFMT) != S_IFREG);
+ epoch = atomic_read(&fm->fc->epoch);
forget = fuse_alloc_forget();
err = -ENOMEM;
if (!forget)
@@ -699,6 +703,7 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir,
}
kfree(forget);
d_instantiate(entry, inode);
+ entry->d_time = epoch;
fuse_change_entry_timeout(entry, &outentry);
fuse_dir_changed(dir);
err = generic_file_open(inode, file);
@@ -785,12 +790,14 @@ static struct dentry *create_new_entry(struct mnt_idmap *idmap, struct fuse_moun
struct fuse_entry_out outarg;
struct inode *inode;
struct dentry *d;
- int err;
struct fuse_forget_link *forget;
+ int epoch, err;
if (fuse_is_bad(dir))
return ERR_PTR(-EIO);
+ epoch = atomic_read(&fm->fc->epoch);
+
forget = fuse_alloc_forget();
if (!forget)
return ERR_PTR(-ENOMEM);
@@ -832,10 +839,13 @@ static struct dentry *create_new_entry(struct mnt_idmap *idmap, struct fuse_moun
if (IS_ERR(d))
return d;
- if (d)
+ if (d) {
+ d->d_time = epoch;
fuse_change_entry_timeout(d, &outarg);
- else
+ } else {
+ entry->d_time = epoch;
fuse_change_entry_timeout(entry, &outarg);
+ }
fuse_dir_changed(dir);
return d;
@@ -1609,10 +1619,10 @@ static int fuse_permission(struct mnt_idmap *idmap,
return err;
}
-static int fuse_readlink_page(struct inode *inode, struct folio *folio)
+static int fuse_readlink_folio(struct inode *inode, struct folio *folio)
{
struct fuse_mount *fm = get_fuse_mount(inode);
- struct fuse_folio_desc desc = { .length = PAGE_SIZE - 1 };
+ struct fuse_folio_desc desc = { .length = folio_size(folio) - 1 };
struct fuse_args_pages ap = {
.num_folios = 1,
.folios = &folio,
@@ -1667,7 +1677,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode,
if (!folio)
goto out_err;
- err = fuse_readlink_page(inode, folio);
+ err = fuse_readlink_folio(inode, folio);
if (err) {
folio_put(folio);
goto out_err;
@@ -1943,6 +1953,7 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
int err;
bool trust_local_cmtime = is_wb;
bool fault_blocked = false;
+ u64 attr_version;
if (!fc->default_permissions)
attr->ia_valid |= ATTR_FORCE;
@@ -2027,6 +2038,8 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (fc->handle_killpriv_v2 && !capable(CAP_FSETID))
inarg.valid |= FATTR_KILL_SUIDGID;
}
+
+ attr_version = fuse_get_attr_version(fm->fc);
fuse_setattr_fill(fc, &args, inode, &inarg, &outarg);
err = fuse_simple_request(fm, &args);
if (err) {
@@ -2052,6 +2065,14 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
/* FIXME: clear I_DIRTY_SYNC? */
}
+ if (fi->attr_version > attr_version) {
+ /*
+ * Apply attributes, for example for fsnotify_change(), but set
+ * attribute timeout to zero.
+ */
+ outarg.attr_valid = outarg.attr_valid_nsec = 0;
+ }
+
fuse_change_attributes_common(inode, &outarg.attr, NULL,
ATTR_TIMEOUT(&outarg),
fuse_get_cache_mask(inode), 0);
@@ -2257,7 +2278,7 @@ void fuse_init_dir(struct inode *inode)
static int fuse_symlink_read_folio(struct file *null, struct folio *folio)
{
- int err = fuse_readlink_page(folio->mapping->host, folio);
+ int err = fuse_readlink_folio(folio->mapping->host, folio);
if (!err)
folio_mark_uptodate(folio);
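[Editorial note] The epoch plumbing in this file pairs with FUSE_NOTIFY_INC_EPOCH in dev.c: lookup and create paths stamp the dentry's d_time with the connection epoch, and revalidation rejects any dentry stamped before the current epoch, forcing a fresh lookup. Condensed from fuse_dentry_revalidate() above:

/* Sketch of the staleness test: */
if (entry->d_time < atomic_read(&fc->epoch))
	goto invalid;	/* server bumped the epoch; re-lookup */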
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 754378dd9f71..5525a4520b0f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -21,6 +21,7 @@
#include <linux/filelock.h>
#include <linux/splice.h>
#include <linux/task_io_accounting_ops.h>
+#include <linux/iomap.h>
static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
unsigned int open_flags, int opcode,
@@ -415,89 +416,11 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
struct fuse_writepage_args {
struct fuse_io_args ia;
- struct rb_node writepages_entry;
struct list_head queue_entry;
- struct fuse_writepage_args *next;
struct inode *inode;
struct fuse_sync_bucket *bucket;
};
-static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
- pgoff_t idx_from, pgoff_t idx_to)
-{
- struct rb_node *n;
-
- n = fi->writepages.rb_node;
-
- while (n) {
- struct fuse_writepage_args *wpa;
- pgoff_t curr_index;
-
- wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry);
- WARN_ON(get_fuse_inode(wpa->inode) != fi);
- curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT;
- if (idx_from >= curr_index + wpa->ia.ap.num_folios)
- n = n->rb_right;
- else if (idx_to < curr_index)
- n = n->rb_left;
- else
- return wpa;
- }
- return NULL;
-}
-
-/*
- * Check if any page in a range is under writeback
- */
-static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
- pgoff_t idx_to)
-{
- struct fuse_inode *fi = get_fuse_inode(inode);
- bool found;
-
- if (RB_EMPTY_ROOT(&fi->writepages))
- return false;
-
- spin_lock(&fi->lock);
- found = fuse_find_writeback(fi, idx_from, idx_to);
- spin_unlock(&fi->lock);
-
- return found;
-}
-
-static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
-{
- return fuse_range_is_writeback(inode, index, index);
-}
-
-/*
- * Wait for page writeback to be completed.
- *
- * Since fuse doesn't rely on the VM writeback tracking, this has to
- * use some other means.
- */
-static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
-{
- struct fuse_inode *fi = get_fuse_inode(inode);
-
- wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
-}
-
-static inline bool fuse_folio_is_writeback(struct inode *inode,
- struct folio *folio)
-{
- pgoff_t last = folio_next_index(folio) - 1;
- return fuse_range_is_writeback(inode, folio_index(folio), last);
-}
-
-static void fuse_wait_on_folio_writeback(struct inode *inode,
- struct folio *folio)
-{
- struct fuse_inode *fi = get_fuse_inode(inode);
-
- wait_event(fi->page_waitq, !fuse_folio_is_writeback(inode, folio));
-}
-
/*
* Wait for all pending writepages on the inode to finish.
*
@@ -532,10 +455,6 @@ static int fuse_flush(struct file *file, fl_owner_t id)
if (err)
return err;
- inode_lock(inode);
- fuse_sync_writes(inode);
- inode_unlock(inode);
-
err = filemap_check_errors(file->f_mapping);
if (err)
return err;
@@ -870,12 +789,16 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
}
}
-static int fuse_do_readfolio(struct file *file, struct folio *folio)
+static int fuse_do_readfolio(struct file *file, struct folio *folio,
+ size_t off, size_t len)
{
struct inode *inode = folio->mapping->host;
struct fuse_mount *fm = get_fuse_mount(inode);
- loff_t pos = folio_pos(folio);
- struct fuse_folio_desc desc = { .length = PAGE_SIZE };
+ loff_t pos = folio_pos(folio) + off;
+ struct fuse_folio_desc desc = {
+ .offset = off,
+ .length = len,
+ };
struct fuse_io_args ia = {
.ap.args.page_zeroing = true,
.ap.args.out_pages = true,
@@ -886,13 +809,6 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio)
ssize_t res;
u64 attr_ver;
- /*
- * With the temporary pages that are used to complete writeback, we can
- * have writeback that extends beyond the lifetime of the folio. So
- * make sure we read a properly synced folio.
- */
- fuse_wait_on_folio_writeback(inode, folio);
-
attr_ver = fuse_get_attr_version(fm->fc);
/* Don't overflow end offset */
@@ -909,8 +825,6 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio)
if (res < desc.length)
fuse_short_read(inode, attr_ver, res, &ia.ap);
- folio_mark_uptodate(folio);
-
return 0;
}
@@ -923,13 +837,26 @@ static int fuse_read_folio(struct file *file, struct folio *folio)
if (fuse_is_bad(inode))
goto out;
- err = fuse_do_readfolio(file, folio);
+ err = fuse_do_readfolio(file, folio, 0, folio_size(folio));
+ if (!err)
+ folio_mark_uptodate(folio);
+
fuse_invalidate_atime(inode);
out:
folio_unlock(folio);
return err;
}
+static int fuse_iomap_read_folio_range(const struct iomap_iter *iter,
+ struct folio *folio, loff_t pos,
+ size_t len)
+{
+ struct file *file = iter->private;
+ size_t off = offset_in_folio(folio, pos);
+
+ return fuse_do_readfolio(file, folio, off, len);
+}
+
static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
int err)
{
@@ -965,14 +892,13 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
fuse_io_free(ia);
}
-static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
+static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file,
+ unsigned int count)
{
struct fuse_file *ff = file->private_data;
struct fuse_mount *fm = ff->fm;
struct fuse_args_pages *ap = &ia->ap;
loff_t pos = folio_pos(ap->folios[0]);
- /* Currently, all folios in FUSE are one page */
- size_t count = ap->num_folios << PAGE_SHIFT;
ssize_t res;
int err;
@@ -1005,17 +931,13 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
static void fuse_readahead(struct readahead_control *rac)
{
struct inode *inode = rac->mapping->host;
- struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = get_fuse_conn(inode);
unsigned int max_pages, nr_pages;
- pgoff_t first = readahead_index(rac);
- pgoff_t last = first + readahead_count(rac) - 1;
+ struct folio *folio = NULL;
if (fuse_is_bad(inode))
return;
- wait_event(fi->page_waitq, !fuse_range_is_writeback(inode, first, last));
-
max_pages = min_t(unsigned int, fc->max_pages,
fc->max_read / PAGE_SIZE);
@@ -1033,8 +955,8 @@ static void fuse_readahead(struct readahead_control *rac)
while (nr_pages) {
struct fuse_io_args *ia;
struct fuse_args_pages *ap;
- struct folio *folio;
unsigned cur_pages = min(max_pages, nr_pages);
+ unsigned int pages = 0;
if (fc->num_background >= fc->congestion_threshold &&
rac->ra->async_size >= readahead_count(rac))
@@ -1046,10 +968,12 @@ static void fuse_readahead(struct readahead_control *rac)
ia = fuse_io_alloc(NULL, cur_pages);
if (!ia)
- return;
+ break;
ap = &ia->ap;
- while (ap->num_folios < cur_pages) {
+ while (pages < cur_pages) {
+ unsigned int folio_pages;
+
/*
* This returns a folio with a ref held on it.
* The ref needs to be held until the request is
@@ -1057,13 +981,31 @@ static void fuse_readahead(struct readahead_control *rac)
* fuse_try_move_page()) drops the ref after it's
* replaced in the page cache.
*/
- folio = __readahead_folio(rac);
+ if (!folio)
+ folio = __readahead_folio(rac);
+
+ folio_pages = folio_nr_pages(folio);
+ if (folio_pages > cur_pages - pages) {
+ /*
+ * Large folios belonging to fuse will never
+ * have more pages than max_pages.
+ */
+ WARN_ON(!pages);
+ break;
+ }
+
ap->folios[ap->num_folios] = folio;
ap->descs[ap->num_folios].length = folio_size(folio);
ap->num_folios++;
+ pages += folio_pages;
+ folio = NULL;
}
- fuse_send_readpages(ia, rac->file);
- nr_pages -= cur_pages;
+ fuse_send_readpages(ia, rac->file, pages << PAGE_SHIFT);
+ nr_pages -= pages;
+ }
+ if (folio) {
+ folio_end_read(folio, false);
+ folio_put(folio);
}
}
@@ -1181,7 +1123,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
int err;
for (i = 0; i < ap->num_folios; i++)
- fuse_wait_on_folio_writeback(inode, ap->folios[i]);
+ folio_wait_writeback(ap->folios[i]);
fuse_write_args_fill(ia, ff, pos, count);
ia->write.in.flags = fuse_write_flags(iocb);
@@ -1221,32 +1163,28 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
struct address_space *mapping,
struct iov_iter *ii, loff_t pos,
- unsigned int max_pages)
+ unsigned int max_folios)
{
struct fuse_args_pages *ap = &ia->ap;
struct fuse_conn *fc = get_fuse_conn(mapping->host);
unsigned offset = pos & (PAGE_SIZE - 1);
- unsigned int nr_pages = 0;
size_t count = 0;
- int err;
+ unsigned int num;
+ int err = 0;
+
+ num = min(iov_iter_count(ii), fc->max_write);
ap->args.in_pages = true;
ap->descs[0].offset = offset;
- do {
+ while (num && ap->num_folios < max_folios) {
size_t tmp;
struct folio *folio;
pgoff_t index = pos >> PAGE_SHIFT;
- size_t bytes = min_t(size_t, PAGE_SIZE - offset,
- iov_iter_count(ii));
-
- bytes = min_t(size_t, bytes, fc->max_write - count);
+ unsigned int bytes;
+ unsigned int folio_offset;
again:
- err = -EFAULT;
- if (fault_in_iov_iter_readable(ii, bytes))
- break;
-
folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
mapping_gfp_mask(mapping));
if (IS_ERR(folio)) {
@@ -1257,29 +1195,42 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
if (mapping_writably_mapped(mapping))
flush_dcache_folio(folio);
- tmp = copy_folio_from_iter_atomic(folio, offset, bytes, ii);
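+		/*
+		 * Offset within the (possibly large) folio: whole pages
+		 * between the target index and the folio start, plus the
+		 * in-page offset.
+		 */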
+ folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset;
+ bytes = min(folio_size(folio) - folio_offset, num);
+
+ tmp = copy_folio_from_iter_atomic(folio, folio_offset, bytes, ii);
flush_dcache_folio(folio);
if (!tmp) {
folio_unlock(folio);
folio_put(folio);
+
+ /*
+			 * Ensure forward progress by faulting in the
+			 * user buffer while not holding the folio lock:
+ */
+ if (fault_in_iov_iter_readable(ii, bytes)) {
+ err = -EFAULT;
+ break;
+ }
+
goto again;
}
- err = 0;
ap->folios[ap->num_folios] = folio;
+ ap->descs[ap->num_folios].offset = folio_offset;
ap->descs[ap->num_folios].length = tmp;
ap->num_folios++;
- nr_pages++;
count += tmp;
pos += tmp;
+ num -= tmp;
offset += tmp;
- if (offset == PAGE_SIZE)
+ if (offset == folio_size(folio))
offset = 0;
- /* If we copied full page, mark it uptodate */
- if (tmp == PAGE_SIZE)
+		/* If we copied the full folio, mark it uptodate */
+ if (tmp == folio_size(folio))
folio_mark_uptodate(folio);
if (folio_test_uptodate(folio)) {
@@ -1288,10 +1239,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
ia->write.folio_locked = true;
break;
}
- if (!fc->big_writes)
+ if (!fc->big_writes || offset != 0)
break;
- } while (iov_iter_count(ii) && count < fc->max_write &&
- nr_pages < max_pages && offset == 0);
+ }
return count > 0 ? count : err;
}
@@ -1440,6 +1390,24 @@ static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive)
}
}
+static const struct iomap_write_ops fuse_iomap_write_ops = {
+ .read_folio_range = fuse_iomap_read_folio_range,
+};
+
+static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
+{
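+	/*
+	 * FUSE has no on-disk block mapping to consult; report the whole
+	 * requested range as a single mapped extent and let the server
+	 * handle the actual I/O.
+	 */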
+ iomap->type = IOMAP_MAPPED;
+ iomap->length = length;
+ iomap->offset = offset;
+ return 0;
+}
+
+static const struct iomap_ops fuse_iomap_ops = {
+ .iomap_begin = fuse_iomap_begin,
+};
+
static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
@@ -1449,6 +1417,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = mapping->host;
ssize_t err, count;
struct fuse_conn *fc = get_fuse_conn(inode);
+ bool writeback = false;
if (fc->writeback_cache) {
/* Update size (EOF optimization) and mode (SUID clearing) */
@@ -1457,16 +1426,11 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (err)
return err;
- if (fc->handle_killpriv_v2 &&
- setattr_should_drop_suidgid(idmap,
- file_inode(file))) {
- goto writethrough;
- }
-
- return generic_file_write_iter(iocb, from);
+ if (!fc->handle_killpriv_v2 ||
+ !setattr_should_drop_suidgid(idmap, file_inode(file)))
+ writeback = true;
}
-writethrough:
inode_lock(inode);
err = count = generic_write_checks(iocb, from);
@@ -1485,6 +1449,15 @@ writethrough:
goto out;
written = direct_write_fallback(iocb, from, written,
fuse_perform_write(iocb, from));
+ } else if (writeback) {
+ /*
+ * Use iomap so that we can do granular uptodate reads
+ * and granular dirty tracking for large folios.
+ */
+ written = iomap_file_buffered_write(iocb, from,
+ &fuse_iomap_ops,
+ &fuse_iomap_write_ops,
+ file);
} else {
written = fuse_perform_write(iocb, from);
}
@@ -1638,7 +1611,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
return res;
}
}
- if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
+ if (!cuse && filemap_range_has_writeback(mapping, pos, (pos + count - 1))) {
if (!write)
inode_lock(inode);
fuse_sync_writes(inode);
@@ -1835,38 +1808,34 @@ static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out,
static void fuse_writepage_free(struct fuse_writepage_args *wpa)
{
struct fuse_args_pages *ap = &wpa->ia.ap;
- int i;
if (wpa->bucket)
fuse_sync_bucket_dec(wpa->bucket);
- for (i = 0; i < ap->num_folios; i++)
- folio_put(ap->folios[i]);
-
fuse_file_put(wpa->ia.ff, false);
kfree(ap->folios);
kfree(wpa);
}
-static void fuse_writepage_finish_stat(struct inode *inode, struct folio *folio)
-{
- struct backing_dev_info *bdi = inode_to_bdi(inode);
-
- dec_wb_stat(&bdi->wb, WB_WRITEBACK);
- node_stat_sub_folio(folio, NR_WRITEBACK_TEMP);
- wb_writeout_inc(&bdi->wb);
-}
-
static void fuse_writepage_finish(struct fuse_writepage_args *wpa)
{
struct fuse_args_pages *ap = &wpa->ia.ap;
struct inode *inode = wpa->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
int i;
- for (i = 0; i < ap->num_folios; i++)
- fuse_writepage_finish_stat(inode, ap->folios[i]);
+ for (i = 0; i < ap->num_folios; i++) {
+ /*
+		 * Benchmarks showed that ending writeback while holding
+		 * fi->lock alleviates xarray lock contention and
+		 * noticeably improves performance.
+ */
+ iomap_finish_folio_write(inode, ap->folios[i], 1);
+ dec_wb_stat(&bdi->wb, WB_WRITEBACK);
+ wb_writeout_inc(&bdi->wb);
+ }
wake_up(&fi->page_waitq);
}
@@ -1877,13 +1846,15 @@ static void fuse_send_writepage(struct fuse_mount *fm,
__releases(fi->lock)
__acquires(fi->lock)
{
- struct fuse_writepage_args *aux, *next;
struct fuse_inode *fi = get_fuse_inode(wpa->inode);
+ struct fuse_args_pages *ap = &wpa->ia.ap;
struct fuse_write_in *inarg = &wpa->ia.write.in;
- struct fuse_args *args = &wpa->ia.ap.args;
- /* Currently, all folios in FUSE are one page */
- __u64 data_size = wpa->ia.ap.num_folios * PAGE_SIZE;
- int err;
+ struct fuse_args *args = &ap->args;
+ __u64 data_size = 0;
+ int err, i;
+
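+	/* The payload size is the sum of the per-folio segment lengths. */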
+ for (i = 0; i < ap->num_folios; i++)
+ data_size += ap->descs[i].length;
fi->writectr++;
if (inarg->offset + data_size <= size) {
@@ -1914,19 +1885,8 @@ __acquires(fi->lock)
out_free:
fi->writectr--;
- rb_erase(&wpa->writepages_entry, &fi->writepages);
fuse_writepage_finish(wpa);
spin_unlock(&fi->lock);
-
- /* After rb_erase() aux request list is private */
- for (aux = wpa->next; aux; aux = next) {
- next = aux->next;
- aux->next = NULL;
- fuse_writepage_finish_stat(aux->inode,
- aux->ia.ap.folios[0]);
- fuse_writepage_free(aux);
- }
-
fuse_writepage_free(wpa);
spin_lock(&fi->lock);
}
@@ -1954,43 +1914,6 @@ __acquires(fi->lock)
}
}
-static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root,
- struct fuse_writepage_args *wpa)
-{
- pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT;
- pgoff_t idx_to = idx_from + wpa->ia.ap.num_folios - 1;
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
-
- WARN_ON(!wpa->ia.ap.num_folios);
- while (*p) {
- struct fuse_writepage_args *curr;
- pgoff_t curr_index;
-
- parent = *p;
- curr = rb_entry(parent, struct fuse_writepage_args,
- writepages_entry);
- WARN_ON(curr->inode != wpa->inode);
- curr_index = curr->ia.write.in.offset >> PAGE_SHIFT;
-
- if (idx_from >= curr_index + curr->ia.ap.num_folios)
- p = &(*p)->rb_right;
- else if (idx_to < curr_index)
- p = &(*p)->rb_left;
- else
- return curr;
- }
-
- rb_link_node(&wpa->writepages_entry, parent, p);
- rb_insert_color(&wpa->writepages_entry, root);
- return NULL;
-}
-
-static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
-{
- WARN_ON(fuse_insert_writeback(root, wpa));
-}
-
static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
int error)
{
@@ -2010,41 +1933,6 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
if (!fc->writeback_cache)
fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY);
spin_lock(&fi->lock);
- rb_erase(&wpa->writepages_entry, &fi->writepages);
- while (wpa->next) {
- struct fuse_mount *fm = get_fuse_mount(inode);
- struct fuse_write_in *inarg = &wpa->ia.write.in;
- struct fuse_writepage_args *next = wpa->next;
-
- wpa->next = next->next;
- next->next = NULL;
- tree_insert(&fi->writepages, next);
-
- /*
- * Skip fuse_flush_writepages() to make it easy to crop requests
- * based on primary request size.
- *
- * 1st case (trivial): there are no concurrent activities using
- * fuse_set/release_nowrite. Then we're on safe side because
- * fuse_flush_writepages() would call fuse_send_writepage()
- * anyway.
- *
- * 2nd case: someone called fuse_set_nowrite and it is waiting
- * now for completion of all in-flight requests. This happens
- * rarely and no more than once per page, so this should be
- * okay.
- *
- * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle
- * of fuse_set_nowrite..fuse_release_nowrite section. The fact
- * that fuse_set_nowrite returned implies that all in-flight
- * requests were completed along with all of their secondary
- * requests. Further primary requests are blocked by negative
- * writectr. Hence there cannot be any in-flight requests and
- * no invocations of fuse_writepage_end() while we're in
- * fuse_set_nowrite..fuse_release_nowrite section.
- */
- fuse_send_writepage(fm, next, inarg->offset + inarg->size);
- }
fi->writectr--;
fuse_writepage_finish(wpa);
spin_unlock(&fi->lock);
@@ -2078,17 +1966,6 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
struct fuse_file *ff;
int err;
- /*
- * Inode is always written before the last reference is dropped and
- * hence this should not be reached from reclaim.
- *
- * Writing back the inode from reclaim can deadlock if the request
- * processing itself needs an allocation. Allocations triggering
- * reclaim while serving a request can't be prevented, because it can
- * involve any number of unrelated userspace processes.
- */
- WARN_ON(wbc->for_reclaim);
-
ff = __fuse_write_file_get(fi);
err = fuse_flush_times(inode, ff);
if (ff)
@@ -2131,22 +2008,20 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
}
static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio,
- struct folio *tmp_folio, uint32_t folio_index)
+ uint32_t folio_index, loff_t offset, unsigned len)
{
struct inode *inode = folio->mapping->host;
struct fuse_args_pages *ap = &wpa->ia.ap;
- folio_copy(tmp_folio, folio);
-
- ap->folios[folio_index] = tmp_folio;
- ap->descs[folio_index].offset = 0;
- ap->descs[folio_index].length = PAGE_SIZE;
+ ap->folios[folio_index] = folio;
+ ap->descs[folio_index].offset = offset;
+ ap->descs[folio_index].length = len;
inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
- node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP);
}
static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio,
+ size_t offset,
struct fuse_file *ff)
{
struct inode *inode = folio->mapping->host;
@@ -2159,7 +2034,7 @@ static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio
return NULL;
fuse_writepage_add_to_bucket(fc, wpa);
- fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio), 0);
+ fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio) + offset, 0);
wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
wpa->inode = inode;
wpa->ia.ff = ff;
@@ -2171,74 +2046,28 @@ static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio
return wpa;
}
-static int fuse_writepage_locked(struct folio *folio)
-{
- struct address_space *mapping = folio->mapping;
- struct inode *inode = mapping->host;
- struct fuse_inode *fi = get_fuse_inode(inode);
- struct fuse_writepage_args *wpa;
- struct fuse_args_pages *ap;
- struct folio *tmp_folio;
- struct fuse_file *ff;
- int error = -ENOMEM;
-
- tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0);
- if (!tmp_folio)
- goto err;
-
- error = -EIO;
- ff = fuse_write_file_get(fi);
- if (!ff)
- goto err_nofile;
-
- wpa = fuse_writepage_args_setup(folio, ff);
- error = -ENOMEM;
- if (!wpa)
- goto err_writepage_args;
-
- ap = &wpa->ia.ap;
- ap->num_folios = 1;
-
- folio_start_writeback(folio);
- fuse_writepage_args_page_fill(wpa, folio, tmp_folio, 0);
-
- spin_lock(&fi->lock);
- tree_insert(&fi->writepages, wpa);
- list_add_tail(&wpa->queue_entry, &fi->queued_writes);
- fuse_flush_writepages(inode);
- spin_unlock(&fi->lock);
-
- folio_end_writeback(folio);
-
- return 0;
-
-err_writepage_args:
- fuse_file_put(ff, false);
-err_nofile:
- folio_put(tmp_folio);
-err:
- mapping_set_error(folio->mapping, error);
- return error;
-}
-
struct fuse_fill_wb_data {
struct fuse_writepage_args *wpa;
struct fuse_file *ff;
- struct inode *inode;
- struct folio **orig_folios;
unsigned int max_folios;
+ /*
+	 * nr_bytes won't overflow, since fuse_writepage_need_send() caps
+	 * writeback requests at fc->max_pages pages, and fc->max_pages has
+	 * an upper bound of U16_MAX.
+ */
+ unsigned int nr_bytes;
};
-static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
+static bool fuse_pages_realloc(struct fuse_fill_wb_data *data,
+ unsigned int max_pages)
{
struct fuse_args_pages *ap = &data->wpa->ia.ap;
- struct fuse_conn *fc = get_fuse_conn(data->inode);
struct folio **folios;
struct fuse_folio_desc *descs;
unsigned int nfolios = min_t(unsigned int,
max_t(unsigned int, data->max_folios * 2,
FUSE_DEFAULT_MAX_PAGES_PER_REQ),
- fc->max_pages);
+ max_pages);
WARN_ON(nfolios <= data->max_folios);
folios = fuse_folios_alloc(nfolios, GFP_NOFS, &descs);
@@ -2255,319 +2084,162 @@ static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
return true;
}
-static void fuse_writepages_send(struct fuse_fill_wb_data *data)
+static void fuse_writepages_send(struct inode *inode,
+ struct fuse_fill_wb_data *data)
{
struct fuse_writepage_args *wpa = data->wpa;
- struct inode *inode = data->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
- int num_folios = wpa->ia.ap.num_folios;
- int i;
spin_lock(&fi->lock);
list_add_tail(&wpa->queue_entry, &fi->queued_writes);
fuse_flush_writepages(inode);
spin_unlock(&fi->lock);
-
- for (i = 0; i < num_folios; i++)
- folio_end_writeback(data->orig_folios[i]);
}
-/*
- * Check under fi->lock if the page is under writeback, and insert it onto the
- * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's
- * one already added for a page at this offset. If there's none, then insert
- * this new request onto the auxiliary list, otherwise reuse the existing one by
- * swapping the new temp page with the old one.
- */
-static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa,
- struct folio *folio)
-{
- struct fuse_inode *fi = get_fuse_inode(new_wpa->inode);
- struct fuse_writepage_args *tmp;
- struct fuse_writepage_args *old_wpa;
- struct fuse_args_pages *new_ap = &new_wpa->ia.ap;
-
- WARN_ON(new_ap->num_folios != 0);
- new_ap->num_folios = 1;
-
- spin_lock(&fi->lock);
- old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa);
- if (!old_wpa) {
- spin_unlock(&fi->lock);
- return true;
- }
-
- for (tmp = old_wpa->next; tmp; tmp = tmp->next) {
- pgoff_t curr_index;
-
- WARN_ON(tmp->inode != new_wpa->inode);
- curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT;
- if (curr_index == folio->index) {
- WARN_ON(tmp->ia.ap.num_folios != 1);
- swap(tmp->ia.ap.folios[0], new_ap->folios[0]);
- break;
- }
- }
-
- if (!tmp) {
- new_wpa->next = old_wpa->next;
- old_wpa->next = new_wpa;
- }
-
- spin_unlock(&fi->lock);
-
- if (tmp) {
- fuse_writepage_finish_stat(new_wpa->inode,
- folio);
- fuse_writepage_free(new_wpa);
- }
-
- return false;
-}
-
-static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio,
- struct fuse_args_pages *ap,
+static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos,
+ unsigned len, struct fuse_args_pages *ap,
struct fuse_fill_wb_data *data)
{
- WARN_ON(!ap->num_folios);
+ struct folio *prev_folio;
+ struct fuse_folio_desc prev_desc;
+ unsigned bytes = data->nr_bytes + len;
+ loff_t prev_pos;
- /*
- * Being under writeback is unlikely but possible. For example direct
- * read to an mmaped fuse file will set the page dirty twice; once when
- * the pages are faulted with get_user_pages(), and then after the read
- * completed.
- */
- if (fuse_folio_is_writeback(data->inode, folio))
- return true;
+ WARN_ON(!ap->num_folios);
/* Reached max pages */
- if (ap->num_folios == fc->max_pages)
+ if ((bytes + PAGE_SIZE - 1) >> PAGE_SHIFT > fc->max_pages)
return true;
/* Reached max write bytes */
- if ((ap->num_folios + 1) * PAGE_SIZE > fc->max_write)
+ if (bytes > fc->max_write)
return true;
/* Discontinuity */
- if (data->orig_folios[ap->num_folios - 1]->index + 1 != folio_index(folio))
+ prev_folio = ap->folios[ap->num_folios - 1];
+ prev_desc = ap->descs[ap->num_folios - 1];
+ prev_pos = folio_pos(prev_folio) + prev_desc.offset + prev_desc.length;
+ if (prev_pos != pos)
return true;
/* Need to grow the pages array? If so, did the expansion fail? */
- if (ap->num_folios == data->max_folios && !fuse_pages_realloc(data))
+ if (ap->num_folios == data->max_folios &&
+ !fuse_pages_realloc(data, fc->max_pages))
return true;
return false;
}
-static int fuse_writepages_fill(struct folio *folio,
- struct writeback_control *wbc, void *_data)
+static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc,
+ struct folio *folio, u64 pos,
+ unsigned len, u64 end_pos)
{
- struct fuse_fill_wb_data *data = _data;
+ struct fuse_fill_wb_data *data = wpc->wb_ctx;
struct fuse_writepage_args *wpa = data->wpa;
struct fuse_args_pages *ap = &wpa->ia.ap;
- struct inode *inode = data->inode;
+ struct inode *inode = wpc->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = get_fuse_conn(inode);
- struct folio *tmp_folio;
- int err;
+ loff_t offset = offset_in_folio(folio, pos);
+
+ WARN_ON_ONCE(!data);
if (!data->ff) {
- err = -EIO;
data->ff = fuse_write_file_get(fi);
if (!data->ff)
- goto out_unlock;
+ return -EIO;
}
- if (wpa && fuse_writepage_need_send(fc, folio, ap, data)) {
- fuse_writepages_send(data);
+ if (wpa && fuse_writepage_need_send(fc, pos, len, ap, data)) {
+ fuse_writepages_send(inode, data);
data->wpa = NULL;
+ data->nr_bytes = 0;
}
- err = -ENOMEM;
- tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0);
- if (!tmp_folio)
- goto out_unlock;
-
- /*
- * The page must not be redirtied until the writeout is completed
- * (i.e. userspace has sent a reply to the write request). Otherwise
- * there could be more than one temporary page instance for each real
- * page.
- *
- * This is ensured by holding the page lock in page_mkwrite() while
- * checking fuse_page_is_writeback(). We already hold the page lock
- * since clear_page_dirty_for_io() and keep it held until we add the
- * request to the fi->writepages list and increment ap->num_folios.
- * After this fuse_page_is_writeback() will indicate that the page is
- * under writeback, so we can release the page lock.
- */
if (data->wpa == NULL) {
- err = -ENOMEM;
- wpa = fuse_writepage_args_setup(folio, data->ff);
- if (!wpa) {
- folio_put(tmp_folio);
- goto out_unlock;
- }
+ wpa = fuse_writepage_args_setup(folio, offset, data->ff);
+ if (!wpa)
+ return -ENOMEM;
fuse_file_get(wpa->ia.ff);
data->max_folios = 1;
ap = &wpa->ia.ap;
}
- folio_start_writeback(folio);
- fuse_writepage_args_page_fill(wpa, folio, tmp_folio, ap->num_folios);
- data->orig_folios[ap->num_folios] = folio;
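+	/*
+	 * Pin the folio for writeback of this range; the matching
+	 * iomap_finish_folio_write() is done in fuse_writepage_finish().
+	 */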
+ iomap_start_folio_write(inode, folio, 1);
+ fuse_writepage_args_page_fill(wpa, folio, ap->num_folios,
+ offset, len);
+ data->nr_bytes += len;
- err = 0;
- if (data->wpa) {
- /*
- * Protected by fi->lock against concurrent access by
- * fuse_page_is_writeback().
- */
- spin_lock(&fi->lock);
- ap->num_folios++;
- spin_unlock(&fi->lock);
- } else if (fuse_writepage_add(wpa, folio)) {
+ ap->num_folios++;
+ if (!data->wpa)
data->wpa = wpa;
- } else {
- folio_end_writeback(folio);
+
+ return len;
+}
+
+static int fuse_iomap_writeback_submit(struct iomap_writepage_ctx *wpc,
+ int error)
+{
+ struct fuse_fill_wb_data *data = wpc->wb_ctx;
+
+ WARN_ON_ONCE(!data);
+
+ if (data->wpa) {
+ WARN_ON(!data->wpa->ia.ap.num_folios);
+ fuse_writepages_send(wpc->inode, data);
}
-out_unlock:
- folio_unlock(folio);
- return err;
+ if (data->ff)
+ fuse_file_put(data->ff, false);
+
+ return error;
}
+static const struct iomap_writeback_ops fuse_writeback_ops = {
+ .writeback_range = fuse_iomap_writeback_range,
+ .writeback_submit = fuse_iomap_writeback_submit,
+};
+
static int fuse_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_fill_wb_data data;
- int err;
+ struct fuse_fill_wb_data data = {};
+ struct iomap_writepage_ctx wpc = {
+ .inode = inode,
+ .iomap.type = IOMAP_MAPPED,
+ .wbc = wbc,
+ .ops = &fuse_writeback_ops,
+ .wb_ctx = &data,
+ };
- err = -EIO;
if (fuse_is_bad(inode))
- goto out;
+ return -EIO;
if (wbc->sync_mode == WB_SYNC_NONE &&
fc->num_background >= fc->congestion_threshold)
return 0;
- data.inode = inode;
- data.wpa = NULL;
- data.ff = NULL;
-
- err = -ENOMEM;
- data.orig_folios = kcalloc(fc->max_pages,
- sizeof(struct folio *),
- GFP_NOFS);
- if (!data.orig_folios)
- goto out;
-
- err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
- if (data.wpa) {
- WARN_ON(!data.wpa->ia.ap.num_folios);
- fuse_writepages_send(&data);
- }
- if (data.ff)
- fuse_file_put(data.ff, false);
-
- kfree(data.orig_folios);
-out:
- return err;
-}
-
-/*
- * It's worthy to make sure that space is reserved on disk for the write,
- * but how to implement it without killing performance need more thinking.
- */
-static int fuse_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
-{
- pgoff_t index = pos >> PAGE_SHIFT;
- struct fuse_conn *fc = get_fuse_conn(file_inode(file));
- struct folio *folio;
- loff_t fsize;
- int err = -ENOMEM;
-
- WARN_ON(!fc->writeback_cache);
-
- folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
- mapping_gfp_mask(mapping));
- if (IS_ERR(folio))
- goto error;
-
- fuse_wait_on_page_writeback(mapping->host, folio->index);
-
- if (folio_test_uptodate(folio) || len >= folio_size(folio))
- goto success;
- /*
- * Check if the start of this folio comes after the end of file,
- * in which case the readpage can be optimized away.
- */
- fsize = i_size_read(mapping->host);
- if (fsize <= folio_pos(folio)) {
- size_t off = offset_in_folio(folio, pos);
- if (off)
- folio_zero_segment(folio, 0, off);
- goto success;
- }
- err = fuse_do_readfolio(file, folio);
- if (err)
- goto cleanup;
-success:
- *foliop = folio;
- return 0;
-
-cleanup:
- folio_unlock(folio);
- folio_put(folio);
-error:
- return err;
-}
-
-static int fuse_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
-{
- struct inode *inode = folio->mapping->host;
-
- /* Haven't copied anything? Skip zeroing, size extending, dirtying. */
- if (!copied)
- goto unlock;
-
- pos += copied;
- if (!folio_test_uptodate(folio)) {
- /* Zero any unwritten bytes at the end of the page */
- size_t endoff = pos & ~PAGE_MASK;
- if (endoff)
- folio_zero_segment(folio, endoff, PAGE_SIZE);
- folio_mark_uptodate(folio);
- }
-
- if (pos > inode->i_size)
- i_size_write(inode, pos);
-
- folio_mark_dirty(folio);
-
-unlock:
- folio_unlock(folio);
- folio_put(folio);
-
- return copied;
+ return iomap_writepages(&wpc);
}
static int fuse_launder_folio(struct folio *folio)
{
int err = 0;
- if (folio_clear_dirty_for_io(folio)) {
- struct inode *inode = folio->mapping->host;
+ struct fuse_fill_wb_data data = {};
+ struct iomap_writepage_ctx wpc = {
+ .inode = folio->mapping->host,
+ .iomap.type = IOMAP_MAPPED,
+ .ops = &fuse_writeback_ops,
+ .wb_ctx = &data,
+ };
- /* Serialize with pending writeback for the same page */
- fuse_wait_on_page_writeback(inode, folio->index);
- err = fuse_writepage_locked(folio);
+ if (folio_clear_dirty_for_io(folio)) {
+ err = iomap_writeback_folio(&wpc, folio);
+ err = fuse_iomap_writeback_submit(&wpc, err);
if (!err)
- fuse_wait_on_page_writeback(inode, folio->index);
+ folio_wait_writeback(folio);
}
return err;
}
@@ -2611,7 +2283,7 @@ static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf)
return VM_FAULT_NOPAGE;
}
- fuse_wait_on_folio_writeback(inode, folio);
+ folio_wait_writeback(folio);
return VM_FAULT_LOCKED;
}
@@ -3418,20 +3090,24 @@ static const struct address_space_operations fuse_file_aops = {
.readahead = fuse_readahead,
.writepages = fuse_writepages,
.launder_folio = fuse_launder_folio,
- .dirty_folio = filemap_dirty_folio,
+ .dirty_folio = iomap_dirty_folio,
+ .release_folio = iomap_release_folio,
+ .invalidate_folio = iomap_invalidate_folio,
+ .is_partially_uptodate = iomap_is_partially_uptodate,
.migrate_folio = filemap_migrate_folio,
.bmap = fuse_bmap,
.direct_IO = fuse_direct_IO,
- .write_begin = fuse_write_begin,
- .write_end = fuse_write_end,
};
void fuse_init_file_inode(struct inode *inode, unsigned int flags)
{
struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_conn *fc = get_fuse_conn(inode);
inode->i_fop = &fuse_file_operations;
inode->i_data.a_ops = &fuse_file_aops;
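+	/*
+	 * Writeback is completed by the userspace server, which may
+	 * itself need to allocate memory to service the request, so
+	 * writeback triggered from reclaim could deadlock.
+	 */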
+ if (fc->writeback_cache)
+ mapping_set_writeback_may_deadlock_on_reclaim(&inode->i_data);
INIT_LIST_HEAD(&fi->write_files);
INIT_LIST_HEAD(&fi->queued_writes);
@@ -3439,7 +3115,6 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags)
fi->iocachectr = 0;
init_waitqueue_head(&fi->page_waitq);
init_waitqueue_head(&fi->direct_io_waitq);
- fi->writepages = RB_ROOT;
if (IS_ENABLED(CONFIG_FUSE_DAX))
fuse_dax_inode_init(inode, flags);
diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h
index b3c2e32254ba..5a9bd771a319 100644
--- a/fs/fuse/fuse_dev_i.h
+++ b/fs/fuse/fuse_dev_i.h
@@ -20,7 +20,6 @@ struct fuse_iqueue;
struct fuse_forget_link;
struct fuse_copy_state {
- int write;
struct fuse_req *req;
struct iov_iter *iter;
struct pipe_buffer *pipebufs;
@@ -30,8 +29,9 @@ struct fuse_copy_state {
struct page *pg;
unsigned int len;
unsigned int offset;
- unsigned int move_pages:1;
- unsigned int is_uring:1;
+ bool write:1;
+ bool move_folios:1;
+ bool is_uring:1;
struct {
unsigned int copied_sz; /* copied size into the user buffer */
} ring;
@@ -51,7 +51,7 @@ struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique);
void fuse_dev_end_requests(struct list_head *head);
-void fuse_copy_init(struct fuse_copy_state *cs, int write,
+void fuse_copy_init(struct fuse_copy_state *cs, bool write,
struct iov_iter *iter);
int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs,
unsigned int argpages, struct fuse_arg *args,
@@ -64,7 +64,6 @@ void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req);
bool fuse_remove_pending_req(struct fuse_req *req, spinlock_t *lock);
bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list);
-bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing);
#endif
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index d56d4fd956db..ec248d13c8bf 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -74,8 +74,8 @@ extern struct list_head fuse_conn_list;
extern struct mutex fuse_mutex;
/** Module parameters */
-extern unsigned max_user_bgreq;
-extern unsigned max_user_congthresh;
+extern unsigned int max_user_bgreq;
+extern unsigned int max_user_congthresh;
/* One forget request */
struct fuse_forget_link {
@@ -161,9 +161,6 @@ struct fuse_inode {
/* waitq for direct-io completion */
wait_queue_head_t direct_io_waitq;
-
- /* List of writepage requestst (pending or sent) */
- struct rb_root writepages;
};
/* readdir cache (directory only) */
@@ -636,6 +633,9 @@ struct fuse_conn {
/** Number of fuse_dev's */
atomic_t dev_count;
+ /** Current epoch for up-to-date dentries */
+ atomic_t epoch;
+
struct rcu_head rcu;
/** The user id for this mount */
@@ -913,12 +913,6 @@ struct fuse_conn {
/** Device ID from the root super block */
dev_t dev;
- /** Dentries in the control filesystem */
- struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES];
-
- /** number of dentries used in the above array */
- int ctl_ndents;
-
/** Key for lock owner ID scrambling */
u32 scramble_key[4];
@@ -1109,7 +1103,6 @@ static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket)
extern const struct file_operations fuse_dev_operations;
extern const struct dentry_operations fuse_dentry_operations;
-extern const struct dentry_operations fuse_root_dentry_operations;
/**
* Get a filled in inode
@@ -1486,9 +1479,9 @@ void fuse_dax_cancel_work(struct fuse_conn *fc);
long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
unsigned long arg);
-int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int fuse_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
int fuse_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
/* iomode.c */
int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index fd48e8d37f2e..ecb869e895ab 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -9,6 +9,7 @@
#include "fuse_i.h"
#include "dev_uring_i.h"
+#include <linux/dax.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/file.h>
@@ -41,7 +42,7 @@ unsigned int fuse_max_pages_limit = 256;
unsigned int fuse_default_req_timeout;
unsigned int fuse_max_req_timeout;
-unsigned max_user_bgreq;
+unsigned int max_user_bgreq;
module_param_call(max_user_bgreq, set_global_limit, param_get_uint,
&max_user_bgreq, 0644);
__MODULE_PARM_TYPE(max_user_bgreq, "uint");
@@ -49,7 +50,7 @@ MODULE_PARM_DESC(max_user_bgreq,
"Global limit for the maximum number of backgrounded requests an "
"unprivileged user can set");
-unsigned max_user_congthresh;
+unsigned int max_user_congthresh;
module_param_call(max_user_congthresh, set_global_limit, param_get_uint,
&max_user_congthresh, 0644);
__MODULE_PARM_TYPE(max_user_congthresh, "uint");
@@ -162,6 +163,9 @@ static void fuse_evict_inode(struct inode *inode)
/* Will write inode on close/munmap and in all other dirtiers */
WARN_ON(inode->i_state & I_DIRTY_INODE);
+ if (FUSE_IS_DAX(inode))
+ dax_break_layout_final(inode);
+
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
if (inode->i_sb->s_flags & SB_ACTIVE) {
@@ -962,6 +966,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
init_rwsem(&fc->killsb);
refcount_set(&fc->count, 1);
atomic_set(&fc->dev_count, 1);
+ atomic_set(&fc->epoch, 1);
init_waitqueue_head(&fc->blocked_waitq);
fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv);
INIT_LIST_HEAD(&fc->bg_queue);
@@ -1036,7 +1041,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
}
EXPORT_SYMBOL_GPL(fuse_conn_get);
-static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
+static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned int mode)
{
struct fuse_attr attr;
memset(&attr, 0, sizeof(attr));
@@ -1211,7 +1216,7 @@ static const struct super_operations fuse_super_operations = {
.show_options = fuse_show_options,
};
-static void sanitize_global_limit(unsigned *limit)
+static void sanitize_global_limit(unsigned int *limit)
{
/*
* The default maximum number of async requests is calculated to consume
@@ -1232,7 +1237,7 @@ static int set_global_limit(const char *val, const struct kernel_param *kp)
if (rv)
return rv;
- sanitize_global_limit((unsigned *)kp->arg);
+ sanitize_global_limit((unsigned int *)kp->arg);
return 0;
}
@@ -1714,7 +1719,7 @@ static int fuse_fill_super_submount(struct super_block *sb,
fi = get_fuse_inode(root);
fi->nlookup--;
- sb->s_d_op = &fuse_dentry_operations;
+ set_default_d_op(sb, &fuse_dentry_operations);
sb->s_root = d_make_root(root);
if (!sb->s_root)
return -ENOMEM;
@@ -1849,12 +1854,10 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
err = -ENOMEM;
root = fuse_get_root_inode(sb, ctx->rootmode);
- sb->s_d_op = &fuse_root_dentry_operations;
+ set_default_d_op(sb, &fuse_dentry_operations);
root_dentry = d_make_root(root);
if (!root_dentry)
goto err_dev_free;
- /* Root dentry doesn't have .d_revalidate */
- sb->s_d_op = &fuse_dentry_operations;
mutex_lock(&fuse_mutex);
err = -EINVAL;
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index 2d9abf48828f..57032eadca6c 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -502,7 +502,7 @@ static void fuse_priv_ioctl_cleanup(struct inode *inode, struct fuse_file *ff)
fuse_file_release(inode, ff, O_RDONLY, NULL, S_ISDIR(inode->i_mode));
}
-int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int fuse_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct fuse_file *ff;
@@ -536,11 +536,13 @@ int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa)
cleanup:
fuse_priv_ioctl_cleanup(inode, ff);
+ if (err == -ENOTTY)
+ err = -EOPNOTSUPP;
return err;
}
int fuse_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct fuse_file *ff;
@@ -572,5 +574,7 @@ int fuse_fileattr_set(struct mnt_idmap *idmap,
cleanup:
fuse_priv_ioctl_cleanup(inode, ff);
+ if (err == -ENOTTY)
+ err = -EOPNOTSUPP;
return err;
}
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index edcd6f18a8a8..c2aae2eef086 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -161,6 +161,7 @@ static int fuse_direntplus_link(struct file *file,
struct fuse_conn *fc;
struct inode *inode;
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ int epoch;
if (!o->nodeid) {
/*
@@ -190,6 +191,7 @@ static int fuse_direntplus_link(struct file *file,
return -EIO;
fc = get_fuse_conn(dir);
+ epoch = atomic_read(&fc->epoch);
name.hash = full_name_hash(parent, name.name, name.len);
dentry = d_lookup(parent, &name);
@@ -256,6 +258,7 @@ retry:
}
if (fc->readdirplus_auto)
set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state);
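+	/* Stamp the dentry with the epoch sampled before the lookup. */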
+ dentry->d_time = epoch;
fuse_change_entry_timeout(dentry, o);
dput(dentry);
@@ -332,35 +335,32 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
{
int plus;
ssize_t res;
- struct folio *folio;
struct inode *inode = file_inode(file);
struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_conn *fc = fm->fc;
struct fuse_io_args ia = {};
- struct fuse_args_pages *ap = &ia.ap;
- struct fuse_folio_desc desc = { .length = PAGE_SIZE };
+ struct fuse_args *args = &ia.ap.args;
+ void *buf;
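+	/*
+	 * Honour the caller's buffer size, bounded to at least one page
+	 * and at most a full max_pages request.
+	 */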
+ size_t bufsize = clamp((unsigned int) ctx->count, PAGE_SIZE, fc->max_pages << PAGE_SHIFT);
u64 attr_version = 0, evict_ctr = 0;
bool locked;
- folio = folio_alloc(GFP_KERNEL, 0);
- if (!folio)
+ buf = kvmalloc(bufsize, GFP_KERNEL);
+ if (!buf)
return -ENOMEM;
+ args->out_args[0].value = buf;
+
plus = fuse_use_readdirplus(inode, ctx);
- ap->args.out_pages = true;
- ap->num_folios = 1;
- ap->folios = &folio;
- ap->descs = &desc;
if (plus) {
attr_version = fuse_get_attr_version(fm->fc);
evict_ctr = fuse_get_evict_ctr(fm->fc);
- fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
- FUSE_READDIRPLUS);
+ fuse_read_args_fill(&ia, file, ctx->pos, bufsize, FUSE_READDIRPLUS);
} else {
- fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
- FUSE_READDIR);
+ fuse_read_args_fill(&ia, file, ctx->pos, bufsize, FUSE_READDIR);
}
locked = fuse_lock_inode(inode);
- res = fuse_simple_request(fm, &ap->args);
+ res = fuse_simple_request(fm, args);
fuse_unlock_inode(inode, locked);
if (res >= 0) {
if (!res) {
@@ -369,16 +369,14 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
if (ff->open_flags & FOPEN_CACHE_DIR)
fuse_readdir_cache_end(file, ctx->pos);
} else if (plus) {
- res = parse_dirplusfile(folio_address(folio), res,
- file, ctx, attr_version,
+ res = parse_dirplusfile(buf, res, file, ctx, attr_version,
evict_ctr);
} else {
- res = parse_dirfile(folio_address(folio), res, file,
- ctx);
+ res = parse_dirfile(buf, res, file, ctx);
}
}
- folio_put(folio);
+ kvfree(buf);
fuse_invalidate_atime(inode);
return res;
}
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 53c2626e90e7..c826e7ca49f5 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -9,7 +9,6 @@
#include <linux/pci.h>
#include <linux/interrupt.h>
#include <linux/group_cpus.h>
-#include <linux/pfn_t.h>
#include <linux/memremap.h>
#include <linux/module.h>
#include <linux/virtio.h>
@@ -862,7 +861,7 @@ static void virtio_fs_requests_done_work(struct work_struct *work)
static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *fs)
{
const struct cpumask *mask, *masks;
- unsigned int q, cpu;
+ unsigned int q, cpu, nr_masks;
/* First attempt to map using existing transport layer affinities
* e.g. PCIe MSI-X
@@ -882,7 +881,7 @@ static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *f
return;
fallback:
/* Attempt to map evenly in groups over the CPUs */
- masks = group_cpus_evenly(fs->num_request_queues);
+ masks = group_cpus_evenly(fs->num_request_queues, &nr_masks);
/* If even this fails we default to all CPUs use first request queue */
if (!masks) {
for_each_possible_cpu(cpu)
@@ -891,7 +890,7 @@ fallback:
}
for (q = 0; q < fs->num_request_queues; q++) {
- for_each_cpu(cpu, &masks[q])
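+		/*
+		 * group_cpus_evenly() may return fewer masks than there
+		 * are request queues; wrap around so every queue is
+		 * still mapped to some CPUs.
+		 */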
+ for_each_cpu(cpu, &masks[q % nr_masks])
fs->mq_map[cpu] = q + VQ_REQUEST;
}
kfree(masks);
@@ -1008,7 +1007,7 @@ static void virtio_fs_cleanup_vqs(struct virtio_device *vdev)
*/
static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
long nr_pages, enum dax_access_mode mode,
- void **kaddr, pfn_t *pfn)
+ void **kaddr, unsigned long *pfn)
{
struct virtio_fs *fs = dax_get_private(dax_dev);
phys_addr_t offset = PFN_PHYS(pgoff);
@@ -1017,7 +1016,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
if (kaddr)
*kaddr = fs->window_kaddr + offset;
if (pfn)
- *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, 0);
+ *pfn = fs->window_phys_addr + offset;
return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
}
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 14f204cd5a82..47d74afd63ac 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -159,7 +159,11 @@ static int gfs2_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
- struct iomap_writepage_ctx wpc = { };
+ struct iomap_writepage_ctx wpc = {
+ .inode = mapping->host,
+ .wbc = wbc,
+ .ops = &gfs2_writeback_ops,
+ };
int ret;
/*
@@ -168,7 +172,7 @@ static int gfs2_writepages(struct address_space *mapping,
* want balance_dirty_pages() to loop indefinitely trying to write out
* pages held in the ail that it can't find.
*/
- ret = iomap_writepages(mapping, wbc, &wpc, &gfs2_writeback_ops);
+ ret = iomap_writepages(&wpc);
if (ret == 0 && wbc->nr_to_write > 0)
set_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags);
return ret;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 7703d0471139..131091520de6 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -963,12 +963,16 @@ static struct folio *
gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len)
{
struct inode *inode = iter->inode;
+ struct gfs2_inode *ip = GFS2_I(inode);
unsigned int blockmask = i_blocksize(inode) - 1;
struct gfs2_sbd *sdp = GFS2_SB(inode);
unsigned int blocks;
struct folio *folio;
int status;
+ if (!gfs2_is_jdata(ip) && !gfs2_is_stuffed(ip))
+ return iomap_get_folio(iter, pos, len);
+
blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits;
status = gfs2_trans_begin(sdp, RES_DINODE + blocks, 0);
if (status)
@@ -987,7 +991,7 @@ static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos,
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- if (!gfs2_is_stuffed(ip))
+ if (gfs2_is_jdata(ip) && !gfs2_is_stuffed(ip))
gfs2_trans_add_databufs(ip->i_gl, folio,
offset_in_folio(folio, pos),
copied);
@@ -995,13 +999,14 @@ static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos,
folio_unlock(folio);
folio_put(folio);
- if (tr->tr_num_buf_new)
- __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-
- gfs2_trans_end(sdp);
+ if (gfs2_is_jdata(ip) || gfs2_is_stuffed(ip)) {
+ if (tr->tr_num_buf_new)
+ __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+ gfs2_trans_end(sdp);
+ }
}
-static const struct iomap_folio_ops gfs2_iomap_folio_ops = {
+const struct iomap_write_ops gfs2_iomap_write_ops = {
.get_folio = gfs2_iomap_get_folio,
.put_folio = gfs2_iomap_put_folio,
};
@@ -1078,8 +1083,6 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
gfs2_trans_end(sdp);
}
- if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip))
- iomap->folio_ops = &gfs2_iomap_folio_ops;
return 0;
out_trans_end:
@@ -1304,7 +1307,7 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from, loff_t length
return 0;
length = min(length, inode->i_size - from);
return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops,
- NULL);
+ &gfs2_iomap_write_ops, NULL);
}
#define GFS2_JTRUNC_REVOKES 8192
@@ -2469,23 +2472,26 @@ out:
return error;
}
-static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode,
- loff_t offset, unsigned int len)
+static ssize_t gfs2_writeback_range(struct iomap_writepage_ctx *wpc,
+ struct folio *folio, u64 offset, unsigned int len, u64 end_pos)
{
- int ret;
-
- if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(inode))))
+ if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(wpc->inode))))
return -EIO;
- if (offset >= wpc->iomap.offset &&
- offset < wpc->iomap.offset + wpc->iomap.length)
- return 0;
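+	/* Only look up a new mapping when the cached one doesn't cover this offset. */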
+ if (offset < wpc->iomap.offset ||
+ offset >= wpc->iomap.offset + wpc->iomap.length) {
+ int ret;
- memset(&wpc->iomap, 0, sizeof(wpc->iomap));
- ret = gfs2_iomap_get(inode, offset, INT_MAX, &wpc->iomap);
- return ret;
+ memset(&wpc->iomap, 0, sizeof(wpc->iomap));
+ ret = gfs2_iomap_get(wpc->inode, offset, INT_MAX, &wpc->iomap);
+ if (ret)
+ return ret;
+ }
+
+ return iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
}
const struct iomap_writeback_ops gfs2_writeback_ops = {
- .map_blocks = gfs2_map_blocks,
+ .writeback_range = gfs2_writeback_range,
+ .writeback_submit = iomap_ioend_writeback_submit,
};
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index 4e8b1e8ebdf3..6cdc72dd55a3 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -44,6 +44,7 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
}
extern const struct iomap_ops gfs2_iomap_ops;
+extern const struct iomap_write_ops gfs2_iomap_write_ops;
extern const struct iomap_writeback_ops gfs2_writeback_ops;
int gfs2_unstuff_dinode(struct gfs2_inode *ip);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index dbf1aede744c..509e2f0d97e7 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -60,6 +60,7 @@
#include <linux/crc32.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
+#include <linux/log2.h>
#include "gfs2.h"
#include "incore.h"
@@ -912,7 +913,6 @@ static int dir_make_exhash(struct inode *inode)
struct qstr args;
struct buffer_head *bh, *dibh;
struct gfs2_leaf *leaf;
- int y;
u32 x;
__be64 *lp;
u64 bn;
@@ -979,9 +979,7 @@ static int dir_make_exhash(struct inode *inode)
i_size_write(inode, sdp->sd_sb.sb_bsize / 2);
gfs2_add_inode_blocks(&dip->i_inode, 1);
dip->i_diskflags |= GFS2_DIF_EXHASH;
-
- for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
- dip->i_depth = y;
+ dip->i_depth = ilog2(sdp->sd_hash_ptrs);
gfs2_dinode_out(dip, dibh->b_data);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index fd1147aa3891..72d95185a39f 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -155,7 +155,7 @@ static inline u32 gfs2_gfsflags_to_fsflags(struct inode *inode, u32 gfsflags)
return fsflags;
}
-int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int gfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct gfs2_inode *ip = GFS2_I(inode);
@@ -276,7 +276,7 @@ out:
}
int gfs2_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
u32 fsflags = fa->flags, gfsflags = 0;
@@ -1058,7 +1058,8 @@ retry:
}
pagefault_disable();
- ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops, NULL);
+ ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops,
+ &gfs2_iomap_write_ops, NULL);
pagefault_enable();
if (ret > 0)
written += ret;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ba25b884169e..b6fd1cb17de7 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -590,35 +590,31 @@ static void gfs2_demote_wake(struct gfs2_glock *gl)
static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
{
const struct gfs2_glock_operations *glops = gl->gl_ops;
- struct gfs2_holder *gh;
- unsigned state = ret & LM_OUT_ST_MASK;
- trace_gfs2_glock_state_change(gl, state);
- state_change(gl, state);
- gh = find_first_waiter(gl);
+ if (!(ret & ~LM_OUT_ST_MASK)) {
+ unsigned state = ret & LM_OUT_ST_MASK;
+
+ trace_gfs2_glock_state_change(gl, state);
+ state_change(gl, state);
+ }
+
/* Demote to UN request arrived during demote to SH or DF */
if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
- state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED)
+ gl->gl_state != LM_ST_UNLOCKED &&
+ gl->gl_demote_state == LM_ST_UNLOCKED)
gl->gl_target = LM_ST_UNLOCKED;
/* Check for state != intended state */
- if (unlikely(state != gl->gl_target)) {
- if (gh && (ret & LM_OUT_CANCELED))
- gfs2_holder_wake(gh);
+ if (unlikely(gl->gl_state != gl->gl_target)) {
+ struct gfs2_holder *gh = find_first_waiter(gl);
+
if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) {
if (ret & LM_OUT_CANCELED) {
list_del_init(&gh->gh_list);
trace_gfs2_glock_queue(gh, 0);
+ gfs2_holder_wake(gh);
gl->gl_target = gl->gl_state;
- gh = find_first_waiter(gl);
- if (gh) {
- gl->gl_target = gh->gh_state;
- if (do_promote(gl))
- goto out;
- do_xmote(gl, gh, gl->gl_target);
- return;
- }
goto out;
}
/* Some error or failed "try lock" - report it */
@@ -629,7 +625,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
goto out;
}
}
- switch(state) {
+ switch(gl->gl_state) {
/* Unlocked due to conversion deadlock, try again */
case LM_ST_UNLOCKED:
do_xmote(gl, gh, gl->gl_target);
@@ -640,8 +636,10 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
do_xmote(gl, gh, LM_ST_UNLOCKED);
break;
default: /* Everything else */
- fs_err(gl->gl_name.ln_sbd, "wanted %u got %u\n",
- gl->gl_target, state);
+ fs_err(gl->gl_name.ln_sbd,
+ "glock %u:%llu requested=%u ret=%u\n",
+ gl->gl_name.ln_type, gl->gl_name.ln_number,
+ gl->gl_req, ret);
GLOCK_BUG_ON(gl, 1);
}
return;
@@ -650,7 +648,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
/* Fast path - we got what we asked for */
if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
gfs2_demote_wake(gl);
- if (state != LM_ST_UNLOCKED) {
+ if (gl->gl_state != LM_ST_UNLOCKED) {
if (glops->go_xmote_bh) {
int rv;
@@ -802,7 +800,8 @@ skip_inval:
* We skip telling dlm to do the locking, so we won't get a
* reply that would otherwise clear GLF_LOCK. So we clear it here.
*/
- clear_bit(GLF_LOCK, &gl->gl_flags);
+ if (!test_bit(GLF_CANCELING, &gl->gl_flags))
+ clear_bit(GLF_LOCK, &gl->gl_flags);
clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD);
return;
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index c171f745650f..9339a3bff6ee 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -92,12 +92,22 @@ enum {
* LM_OUT_ST_MASK
* Masks the lower two bits of lock state in the returned value.
*
+ * LM_OUT_TRY_AGAIN
+ * The trylock request failed.
+ *
+ * LM_OUT_DEADLOCK
+ * The lock request failed because it would deadlock.
+ *
* LM_OUT_CANCELED
* The lock request was canceled.
*
+ * LM_OUT_ERROR
+ * The lock request timed out or failed.
*/
#define LM_OUT_ST_MASK 0x00000003
+#define LM_OUT_TRY_AGAIN 0x00000020
+#define LM_OUT_DEADLOCK 0x00000010
#define LM_OUT_CANCELED 0x00000008
#define LM_OUT_ERROR 0x00000004
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index cebd66b22694..fe0faad4892f 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -11,6 +11,7 @@
#include <linux/bio.h>
#include <linux/posix_acl.h>
#include <linux/security.h>
+#include <linux/log2.h>
#include "gfs2.h"
#include "incore.h"
@@ -450,6 +451,11 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
gfs2_consist_inode(ip);
return -EIO;
}
+ if ((ip->i_diskflags & GFS2_DIF_EXHASH) &&
+ depth < ilog2(sdp->sd_hash_ptrs)) {
+ gfs2_consist_inode(ip);
+ return -EIO;
+ }
ip->i_depth = (u8)depth;
ip->i_entries = be32_to_cpu(str->di_entries);
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 0a41c4e76b32..d4ad82f47eee 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -375,7 +375,6 @@ struct gfs2_glock {
enum {
GIF_QD_LOCKED = 1,
- GIF_ALLOC_FAILED = 2,
GIF_SW_PAGED = 3,
GIF_FREE_VFS_INODE = 5,
GIF_GLOP_PENDING = 6,
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 187d789a8f1e..8760e7e20c9d 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -444,11 +444,9 @@ static void gfs2_final_release_pages(struct gfs2_inode *ip)
struct inode *inode = &ip->i_inode;
struct gfs2_glock *gl = ip->i_gl;
- if (unlikely(!gl)) {
- /* This can only happen during incomplete inode creation. */
- BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags));
+ /* This can only happen during incomplete inode creation. */
+ if (unlikely(!gl))
return;
- }
truncate_inode_pages(gfs2_glock2aspace(gl), 0);
truncate_inode_pages(&inode->i_data, 0);
@@ -902,7 +900,6 @@ fail_gunlock3:
fail_gunlock2:
gfs2_glock_put(io_gl);
fail_dealloc_inode:
- set_bit(GIF_ALLOC_FAILED, &ip->i_flags);
dealloc_error = 0;
if (ip->i_eattr)
dealloc_error = gfs2_ea_dealloc(ip, xattr_initialized);
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index eafe123617e6..e43f08eb26e7 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -44,17 +44,17 @@ static inline int gfs2_is_dir(const struct gfs2_inode *ip)
static inline void gfs2_set_inode_blocks(struct inode *inode, u64 blocks)
{
- inode->i_blocks = blocks << (inode->i_blkbits - 9);
+ inode->i_blocks = blocks << (inode->i_blkbits - SECTOR_SHIFT);
}
static inline u64 gfs2_get_inode_blocks(const struct inode *inode)
{
- return inode->i_blocks >> (inode->i_blkbits - 9);
+ return inode->i_blocks >> (inode->i_blkbits - SECTOR_SHIFT);
}
static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change)
{
- change <<= inode->i_blkbits - 9;
+ change <<= inode->i_blkbits - SECTOR_SHIFT;
gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks >= -change));
inode->i_blocks += change;
}
@@ -107,9 +107,9 @@ loff_t gfs2_seek_hole(struct file *file, loff_t offset);
extern const struct file_operations gfs2_file_fops_nolock;
extern const struct file_operations gfs2_dir_fops_nolock;
-int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int gfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
int gfs2_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
void gfs2_set_inode_flags(struct inode *inode);
#ifdef CONFIG_GFS2_FS_LOCKING_DLM
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 7cb9d216d8bb..cee5d199d2d8 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -119,7 +119,7 @@ static inline void gfs2_update_request_times(struct gfs2_glock *gl)
static void gdlm_ast(void *arg)
{
struct gfs2_glock *gl = arg;
- unsigned ret = gl->gl_state;
+ unsigned ret;
/* If the glock is dead, we only react to a dlm_unlock() reply. */
if (__lockref_is_dead(&gl->gl_lockref) &&
@@ -139,13 +139,16 @@ static void gdlm_ast(void *arg)
gfs2_glock_free(gl);
return;
case -DLM_ECANCEL: /* Cancel while getting lock */
- ret |= LM_OUT_CANCELED;
+ ret = LM_OUT_CANCELED;
goto out;
case -EAGAIN: /* Try lock fails */
+ ret = LM_OUT_TRY_AGAIN;
+ goto out;
case -EDEADLK: /* Deadlock detected */
+ ret = LM_OUT_DEADLOCK;
goto out;
case -ETIMEDOUT: /* Canceled due to timeout */
- ret |= LM_OUT_ERROR;
+ ret = LM_OUT_ERROR;
goto out;
case 0: /* Success */
break;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 9dc8885c95d0..7fb11ff71b5a 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -103,6 +103,7 @@ const struct address_space_operations gfs2_meta_aops = {
.invalidate_folio = block_invalidate_folio,
.writepages = gfs2_aspace_writepages,
.release_folio = gfs2_release_folio,
+ .migrate_folio = buffer_migrate_folio_norefs,
};
const struct address_space_operations gfs2_rgrp_aops = {
@@ -110,6 +111,7 @@ const struct address_space_operations gfs2_rgrp_aops = {
.invalidate_folio = block_invalidate_folio,
.writepages = gfs2_aspace_writepages,
.release_folio = gfs2_release_folio,
+ .migrate_folio = buffer_migrate_folio_norefs,
};
/**
@@ -228,7 +230,7 @@ static void gfs2_submit_bhs(blk_opf_t opf, struct buffer_head *bhs[], int num)
struct bio *bio;
bio = bio_alloc(bh->b_bdev, num, opf, GFP_NOIO);
- bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> SECTOR_SHIFT);
while (num > 0) {
bh = *bhs;
if (!bio_add_folio(bio, bh->b_folio, bh->b_size, bh_offset(bh))) {
@@ -443,11 +445,9 @@ void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
struct buffer_head *bh;
int ty;
- if (!ip->i_gl) {
- /* This can only happen during incomplete inode creation. */
- BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags));
+ /* This can only happen during incomplete inode creation. */
+ if (!ip->i_gl)
return;
- }
gfs2_ail1_wipe(sdp, bstart, blen);
while (blen) {
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 653f0ff4b057..efe99b732551 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -64,7 +64,10 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
void free_sbd(struct gfs2_sbd *sdp)
{
+ struct super_block *sb = sdp->sd_vfs;
+
free_percpu(sdp->sd_lkstats);
+ sb->s_fs_info = NULL;
kfree(sdp);
}
@@ -160,7 +163,7 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent)
return -EINVAL;
}
- if (sb->sb_bsize < 512 || sb->sb_bsize > PAGE_SIZE ||
+ if (sb->sb_bsize < SECTOR_SIZE || sb->sb_bsize > PAGE_SIZE ||
(sb->sb_bsize & (sb->sb_bsize - 1))) {
pr_warn("Invalid block size\n");
return -EINVAL;
@@ -221,8 +224,8 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
if (unlikely(!sb))
return -ENOMEM;
err = bdev_rw_virt(sdp->sd_vfs->s_bdev,
- sector * (sdp->sd_vfs->s_blocksize >> 9), sb, PAGE_SIZE,
- REQ_OP_READ | REQ_META);
+ sector << (sdp->sd_vfs->s_blocksize_bits - SECTOR_SHIFT),
+ sb, PAGE_SIZE, REQ_OP_READ | REQ_META);
if (err) {
pr_warn("error %d reading superblock\n", err);
kfree(sb);
@@ -254,7 +257,7 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
return error;
}
- sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - 9;
+ sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - SECTOR_SHIFT;
sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift);
sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
sizeof(struct gfs2_dinode)) / sizeof(u64);
@@ -1142,7 +1145,7 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_magic = GFS2_MAGIC;
sb->s_op = &gfs2_super_ops;
- sb->s_d_op = &gfs2_dops;
+ set_default_d_op(sb, &gfs2_dops);
sb->s_export_op = &gfs2_export_ops;
sb->s_qcop = &gfs2_quotactl_ops;
sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
@@ -1152,12 +1155,12 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
/* Set up the buffer cache and fill in some fake block size values
to allow us to read-in the on-disk superblock. */
- sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, 512);
+ sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, SECTOR_SIZE);
error = -EINVAL;
if (!sdp->sd_sb.sb_bsize)
goto fail_free;
sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits;
- sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - 9;
+ sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - SECTOR_SHIFT;
sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift);
sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit;
@@ -1314,7 +1317,6 @@ fail_iput:
iput(sdp->sd_inode);
fail_free:
free_sbd(sdp);
- sb->s_fs_info = NULL;
return error;
}
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 7c518c4ff638..b42e2110084b 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -487,11 +487,9 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)
int need_endtrans = 0;
int ret;
- if (unlikely(!ip->i_gl)) {
- /* This can only happen during incomplete inode creation. */
- BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags));
+ /* This can only happen during incomplete inode creation. */
+ if (unlikely(!ip->i_gl))
return;
- }
if (gfs2_withdrawing_or_withdrawn(sdp))
return;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 748125653d6c..c3c8842920d2 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -764,7 +764,6 @@ fail_reg:
fs_err(sdp, "error %d adding sysfs files\n", error);
kobject_put(&sdp->sd_kobj);
wait_for_completion(&sdp->sd_kobj_unregister);
- sb->s_fs_info = NULL;
return error;
}
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index d5a1e63fa257..24864a66074b 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -232,32 +232,23 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp)
*/
ret = gfs2_glock_nq(&sdp->sd_live_gh);
+ gfs2_glock_put(live_gl); /* drop extra reference we acquired */
+ clear_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags);
+
/*
* If we actually got the "live" lock in EX mode, there are no other
- * nodes available to replay our journal. So we try to replay it
- * ourselves. We hold the "live" glock to prevent other mounters
- * during recovery, then just dequeue it and reacquire it in our
- * normal SH mode. Just in case the problem that caused us to
- * withdraw prevents us from recovering our journal (e.g. io errors
- * and such) we still check if the journal is clean before proceeding
- * but we may wait forever until another mounter does the recovery.
+ * nodes available to replay our journal.
*/
if (ret == 0) {
- fs_warn(sdp, "No other mounters found. Trying to recover our "
- "own journal jid %d.\n", sdp->sd_lockstruct.ls_jid);
- if (gfs2_recover_journal(sdp->sd_jdesc, 1))
- fs_warn(sdp, "Unable to recover our journal jid %d.\n",
- sdp->sd_lockstruct.ls_jid);
- gfs2_glock_dq_wait(&sdp->sd_live_gh);
- gfs2_holder_reinit(LM_ST_SHARED,
- LM_FLAG_NOEXP | GL_EXACT | GL_NOPID,
- &sdp->sd_live_gh);
- gfs2_glock_nq(&sdp->sd_live_gh);
+ fs_warn(sdp, "No other mounters found.\n");
+ /*
+ * We are about to release the lockspace. By keeping live_gl
+ * locked here, we ensure that the next mounter coming along
+ * will be a "first" mounter which will perform recovery.
+ */
+ goto skip_recovery;
}
- gfs2_glock_put(live_gl); /* drop extra reference we acquired */
- clear_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags);
-
/*
* At this point our journal is evicted, so we need to get a new inode
* for it. Once done, we need to call gfs2_find_jhead which
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
index ef9498a6e88a..34e9804e0f36 100644
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -16,6 +16,9 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
{
void *ptr;
+ if (!tree || !fd)
+ return -EINVAL;
+
fd->tree = tree;
fd->bnode = NULL;
ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL);
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index cb823a8a6ba9..e8cd1a31f247 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -15,6 +15,48 @@
#include "btree.h"
+static inline
+bool is_bnode_offset_valid(struct hfs_bnode *node, int off)
+{
+ bool is_valid = off < node->tree->node_size;
+
+ if (!is_valid) {
+ pr_err("requested invalid offset: "
+ "NODE: id %u, type %#x, height %u, "
+ "node_size %u, offset %d\n",
+ node->this, node->type, node->height,
+ node->tree->node_size, off);
+ }
+
+ return is_valid;
+}
+
+static inline
+int check_and_correct_requested_length(struct hfs_bnode *node, int off, int len)
+{
+ unsigned int node_size;
+
+ if (!is_bnode_offset_valid(node, off))
+ return 0;
+
+ node_size = node->tree->node_size;
+
+ if ((off + len) > node_size) {
+ int new_len = (int)node_size - off;
+
+ pr_err("requested length has been corrected: "
+ "NODE: id %u, type %#x, height %u, "
+ "node_size %u, offset %d, "
+ "requested_len %d, corrected_len %d\n",
+ node->this, node->type, node->height,
+ node->tree->node_size, off, len, new_len);
+
+ return new_len;
+ }
+
+ return len;
+}
+
void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
{
struct page *page;
@@ -22,6 +64,20 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
int bytes_read;
int bytes_to_read;
+ if (!is_bnode_offset_valid(node, off))
+ return;
+
+ if (len == 0) {
+ pr_err("requested zero length: "
+ "NODE: id %u, type %#x, height %u, "
+ "node_size %u, offset %d, len %d\n",
+ node->this, node->type, node->height,
+ node->tree->node_size, off, len);
+ return;
+ }
+
+ len = check_and_correct_requested_length(node, off, len);
+
off += node->page_offset;
pagenum = off >> PAGE_SHIFT;
off &= ~PAGE_MASK; /* compute page offset for the first page */
@@ -80,6 +136,20 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len)
{
struct page *page;
+ if (!is_bnode_offset_valid(node, off))
+ return;
+
+ if (len == 0) {
+ pr_err("requested zero length: "
+ "NODE: id %u, type %#x, height %u, "
+ "node_size %u, offset %d, len %d\n",
+ node->this, node->type, node->height,
+ node->tree->node_size, off, len);
+ return;
+ }
+
+ len = check_and_correct_requested_length(node, off, len);
+
off += node->page_offset;
page = node->page[0];
@@ -104,6 +174,20 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len)
{
struct page *page;
+ if (!is_bnode_offset_valid(node, off))
+ return;
+
+ if (len == 0) {
+ pr_err("requested zero length: "
+ "NODE: id %u, type %#x, height %u, "
+ "node_size %u, offset %d, len %d\n",
+ node->this, node->type, node->height,
+ node->tree->node_size, off, len);
+ return;
+ }
+
+ len = check_and_correct_requested_length(node, off, len);
+
off += node->page_offset;
page = node->page[0];
@@ -119,6 +203,10 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len);
if (!len)
return;
+
+ len = check_and_correct_requested_length(src_node, src, len);
+ len = check_and_correct_requested_length(dst_node, dst, len);
+
src += src_node->page_offset;
dst += dst_node->page_offset;
src_page = src_node->page[0];
@@ -136,6 +224,10 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len);
if (!len)
return;
+
+ len = check_and_correct_requested_length(node, src, len);
+ len = check_and_correct_requested_length(node, dst, len);
+
src += node->page_offset;
dst += node->page_offset;
page = node->page[0];
@@ -482,6 +574,7 @@ void hfs_bnode_put(struct hfs_bnode *node)
if (test_bit(HFS_BNODE_DELETED, &node->flags)) {
hfs_bnode_unhash(node);
spin_unlock(&tree->hash_lock);
+ hfs_bnode_clear(node, 0, tree->node_size);
hfs_bmap_free(node);
hfs_bnode_free(node);
return;
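
The two helpers introduced above reject requests that start beyond the bnode and shorten requests that run past its end. A standalone userspace model of the clamping with made-up sizes (the kernel helpers additionally log via pr_err()):

#include <stdio.h>

/* Illustrative model of check_and_correct_requested_length(); not
 * kernel code. node_size plays the role of hfs_btree.node_size. */
static int clamp_request(unsigned int node_size, int off, int len)
{
	if (off < 0 || (unsigned int)off >= node_size)
		return 0;			/* invalid offset: reject */
	if ((unsigned int)(off + len) > node_size)
		return (int)node_size - off;	/* shorten to the boundary */
	return len;
}

int main(void)
{
	printf("%d\n", clamp_request(512, 480, 100));	/* prints 32 */
	printf("%d\n", clamp_request(512, 600, 100));	/* prints 0 */
	return 0;
}
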
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 2fa4b1f8cc7f..e86e1e235658 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -21,8 +21,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
struct hfs_btree *tree;
struct hfs_btree_header_rec *head;
struct address_space *mapping;
- struct page *page;
+ struct folio *folio;
+ struct buffer_head *bh;
unsigned int size;
+ u16 dblock;
+ sector_t start_block;
+ loff_t offset;
tree = kzalloc(sizeof(*tree), GFP_KERNEL);
if (!tree)
@@ -75,12 +79,40 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
unlock_new_inode(tree->inode);
mapping = tree->inode->i_mapping;
- page = read_mapping_page(mapping, 0, NULL);
- if (IS_ERR(page))
+ folio = filemap_grab_folio(mapping, 0);
+ if (IS_ERR(folio))
goto free_inode;
+ folio_zero_range(folio, 0, folio_size(folio));
+
+ dblock = hfs_ext_find_block(HFS_I(tree->inode)->first_extents, 0);
+ start_block = HFS_SB(sb)->fs_start + (dblock * HFS_SB(sb)->fs_div);
+
+ size = folio_size(folio);
+ offset = 0;
+ while (size > 0) {
+ size_t len;
+
+ bh = sb_bread(sb, start_block);
+ if (!bh) {
+ pr_err("unable to read tree header\n");
+ goto put_folio;
+ }
+
+ len = min_t(size_t, folio_size(folio), sb->s_blocksize);
+ memcpy_to_folio(folio, offset, bh->b_data, sb->s_blocksize);
+
+ brelse(bh);
+
+ start_block++;
+ offset += len;
+ size -= len;
+ }
+
+ folio_mark_uptodate(folio);
+
/* Load the header */
- head = (struct hfs_btree_header_rec *)(kmap_local_page(page) +
+ head = (struct hfs_btree_header_rec *)(kmap_local_folio(folio, 0) +
sizeof(struct hfs_bnode_desc));
tree->root = be32_to_cpu(head->root);
tree->leaf_count = be32_to_cpu(head->leaf_count);
@@ -95,22 +127,22 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
size = tree->node_size;
if (!is_power_of_2(size))
- goto fail_page;
+ goto fail_folio;
if (!tree->node_count)
- goto fail_page;
+ goto fail_folio;
switch (id) {
case HFS_EXT_CNID:
if (tree->max_key_len != HFS_MAX_EXT_KEYLEN) {
pr_err("invalid extent max_key_len %d\n",
tree->max_key_len);
- goto fail_page;
+ goto fail_folio;
}
break;
case HFS_CAT_CNID:
if (tree->max_key_len != HFS_MAX_CAT_KEYLEN) {
pr_err("invalid catalog max_key_len %d\n",
tree->max_key_len);
- goto fail_page;
+ goto fail_folio;
}
break;
default:
@@ -121,12 +153,15 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
tree->pages_per_bnode = (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
kunmap_local(head);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return tree;
-fail_page:
+fail_folio:
kunmap_local(head);
- put_page(page);
+put_folio:
+ folio_unlock(folio);
+ folio_put(folio);
free_inode:
tree->inode->i_mapping->a_ops = &hfs_aops;
iput(tree->inode);
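
Condensed, the new hfs_btree_open() header path grabs a folio directly and fills it one disk block at a time instead of relying on read_mapping_page(). A sketch with error handling elided (all identifiers come from the patch; the loop bound assumes folio_size() is a multiple of the block size):

	folio = filemap_grab_folio(mapping, 0);
	folio_zero_range(folio, 0, folio_size(folio));

	dblock = hfs_ext_find_block(HFS_I(tree->inode)->first_extents, 0);
	start_block = HFS_SB(sb)->fs_start + (dblock * HFS_SB(sb)->fs_div);

	for (offset = 0; offset < folio_size(folio); offset += sb->s_blocksize) {
		bh = sb_bread(sb, start_block++);	/* one block per pass */
		memcpy_to_folio(folio, offset, bh->b_data, sb->s_blocksize);
		brelse(bh);
	}
	folio_mark_uptodate(folio);	/* now safe to kmap the header record */
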
diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c
index 4a0ce131e233..580c62981dbd 100644
--- a/fs/hfs/extent.c
+++ b/fs/hfs/extent.c
@@ -71,7 +71,7 @@ int hfs_ext_keycmp(const btree_key *key1, const btree_key *key2)
*
* Find a block within an extent record
*/
-static u16 hfs_ext_find_block(struct hfs_extent *ext, u16 off)
+u16 hfs_ext_find_block(struct hfs_extent *ext, u16 off)
{
int i;
u16 count;
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index a0c7cb0f79fc..7c5a7ecfa246 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -190,6 +190,7 @@ extern const struct inode_operations hfs_dir_inode_operations;
/* extent.c */
extern int hfs_ext_keycmp(const btree_key *, const btree_key *);
+extern u16 hfs_ext_find_block(struct hfs_extent *ext, u16 off);
extern int hfs_free_fork(struct super_block *, struct hfs_cat_file *, int);
extern int hfs_ext_write_extent(struct inode *);
extern int hfs_extend_file(struct inode *);
@@ -201,7 +202,7 @@ extern int hfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
extern const struct address_space_operations hfs_aops;
extern const struct address_space_operations hfs_btree_aops;
-int hfs_write_begin(struct file *file, struct address_space *mapping,
+int hfs_write_begin(const struct kiocb *iocb, struct address_space *mapping,
loff_t pos, unsigned len, struct folio **foliop, void **fsdata);
extern struct inode *hfs_new_inode(struct inode *, const struct qstr *, umode_t);
extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index a81ce7a740b9..bf4cb7e78396 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -44,12 +44,12 @@ static void hfs_write_failed(struct address_space *mapping, loff_t to)
}
}
-int hfs_write_begin(struct file *file, struct address_space *mapping,
+int hfs_write_begin(const struct kiocb *iocb, struct address_space *mapping,
loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
{
int ret;
- ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata,
+ ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata,
hfs_get_block,
&HFS_I(mapping->host)->phys_size);
if (unlikely(ret))
@@ -690,8 +690,9 @@ static const struct file_operations hfs_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.splice_read = filemap_splice_read,
+ .splice_write = iter_file_splice_write,
.fsync = hfs_file_fsync,
.open = hfs_file_open,
.release = hfs_file_release,
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index fe09c2093a93..388a318297ec 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -365,7 +365,7 @@ static int hfs_fill_super(struct super_block *sb, struct fs_context *fc)
if (!root_inode)
goto bail_no_root;
- sb->s_d_op = &hfs_dentry_operations;
+ set_default_d_op(sb, &hfs_dentry_operations);
res = -ENOMEM;
sb->s_root = d_make_root(root_inode);
if (!sb->s_root)
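
As in gfs2 and hpfs elsewhere in this series, the fill_super path stops assigning sb->s_d_op directly and goes through set_default_d_op(). A hedged sketch of the pattern (the foofs_* names are invented for illustration):

static int foofs_fill_super(struct super_block *sb, struct fs_context *fc)
{
	sb->s_op = &foofs_super_ops;
	set_default_d_op(sb, &foofs_dentry_operations);	/* was: sb->s_d_op = ... */

	/* ... remaining setup, root dentry creation, etc. ... */
	return 0;
}

Filesystems that only need dentry flags rather than operations set sb->s_d_flags instead, as the hostfs and hugetlbfs hunks below do with DCACHE_DONTCACHE.
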
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 079ea80534f7..14f4995588ff 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -18,12 +18,68 @@
#include "hfsplus_fs.h"
#include "hfsplus_raw.h"
+static inline
+bool is_bnode_offset_valid(struct hfs_bnode *node, int off)
+{
+ bool is_valid = off < node->tree->node_size;
+
+ if (!is_valid) {
+ pr_err("requested invalid offset: "
+ "NODE: id %u, type %#x, height %u, "
+ "node_size %u, offset %d\n",
+ node->this, node->type, node->height,
+ node->tree->node_size, off);
+ }
+
+ return is_valid;
+}
+
+static inline
+int check_and_correct_requested_length(struct hfs_bnode *node, int off, int len)
+{
+ unsigned int node_size;
+
+ if (!is_bnode_offset_valid(node, off))
+ return 0;
+
+ node_size = node->tree->node_size;
+
+ if ((off + len) > node_size) {
+ int new_len = (int)node_size - off;
+
+ pr_err("requested length has been corrected: "
+ "NODE: id %u, type %#x, height %u, "
+ "node_size %u, offset %d, "
+ "requested_len %d, corrected_len %d\n",
+ node->this, node->type, node->height,
+ node->tree->node_size, off, len, new_len);
+
+ return new_len;
+ }
+
+ return len;
+}
+
/* Copy a specified range of bytes from the raw data of a node */
void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
{
struct page **pagep;
int l;
+ if (!is_bnode_offset_valid(node, off))
+ return;
+
+ if (len == 0) {
+ pr_err("requested zero length: "
+ "NODE: id %u, type %#x, height %u, "
+ "node_size %u, offset %d, len %d\n",
+ node->this, node->type, node->height,
+ node->tree->node_size, off, len);
+ return;
+ }
+
+ len = check_and_correct_requested_length(node, off, len);
+
off += node->page_offset;
pagep = node->page + (off >> PAGE_SHIFT);
off &= ~PAGE_MASK;
@@ -81,6 +137,20 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len)
struct page **pagep;
int l;
+ if (!is_bnode_offset_valid(node, off))
+ return;
+
+ if (len == 0) {
+ pr_err("requested zero length: "
+ "NODE: id %u, type %#x, height %u, "
+ "node_size %u, offset %d, len %d\n",
+ node->this, node->type, node->height,
+ node->tree->node_size, off, len);
+ return;
+ }
+
+ len = check_and_correct_requested_length(node, off, len);
+
off += node->page_offset;
pagep = node->page + (off >> PAGE_SHIFT);
off &= ~PAGE_MASK;
@@ -109,6 +179,20 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len)
struct page **pagep;
int l;
+ if (!is_bnode_offset_valid(node, off))
+ return;
+
+ if (len == 0) {
+ pr_err("requested zero length: "
+ "NODE: id %u, type %#x, height %u, "
+ "node_size %u, offset %d, len %d\n",
+ node->this, node->type, node->height,
+ node->tree->node_size, off, len);
+ return;
+ }
+
+ len = check_and_correct_requested_length(node, off, len);
+
off += node->page_offset;
pagep = node->page + (off >> PAGE_SHIFT);
off &= ~PAGE_MASK;
@@ -133,6 +217,10 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len);
if (!len)
return;
+
+ len = check_and_correct_requested_length(src_node, src, len);
+ len = check_and_correct_requested_length(dst_node, dst, len);
+
src += src_node->page_offset;
dst += dst_node->page_offset;
src_page = src_node->page + (src >> PAGE_SHIFT);
@@ -187,6 +275,10 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len);
if (!len)
return;
+
+ len = check_and_correct_requested_length(node, src, len);
+ len = check_and_correct_requested_length(node, dst, len);
+
src += node->page_offset;
dst += node->page_offset;
if (dst > src) {
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index a6d61685ae79..b1699b3c246a 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -342,9 +342,6 @@ static int hfsplus_free_extents(struct super_block *sb,
int i;
int err = 0;
- /* Mapping the allocation file may lock the extent tree */
- WARN_ON(mutex_is_locked(&HFSPLUS_SB(sb)->ext_tree->tree_lock));
-
hfsplus_dump_extent(extent);
for (i = 0; i < 8; extent++, i++) {
count = be32_to_cpu(extent->block_count);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 2f089bff0095..96a5c24813dd 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -473,8 +473,10 @@ extern const struct address_space_operations hfsplus_aops;
extern const struct address_space_operations hfsplus_btree_aops;
extern const struct dentry_operations hfsplus_dentry_operations;
-int hfsplus_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct folio **foliop, void **fsdata);
+int hfsplus_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, struct folio **foliop,
+ void **fsdata);
struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir,
umode_t mode);
void hfsplus_delete_inode(struct inode *inode);
@@ -489,9 +491,9 @@ int hfsplus_getattr(struct mnt_idmap *idmap, const struct path *path,
unsigned int query_flags);
int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
int datasync);
-int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int hfsplus_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
int hfsplus_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
/* ioctl.c */
long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index f331e9574217..b51a411ecd23 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -38,12 +38,14 @@ static void hfsplus_write_failed(struct address_space *mapping, loff_t to)
}
}
-int hfsplus_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
+int hfsplus_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos,
+ unsigned len, struct folio **foliop,
+ void **fsdata)
{
int ret;
- ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata,
+ ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata,
hfsplus_get_block,
&HFSPLUS_I(mapping->host)->phys_size);
if (unlikely(ret))
@@ -366,8 +368,9 @@ static const struct file_operations hfsplus_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.splice_read = filemap_splice_read,
+ .splice_write = iter_file_splice_write,
.fsync = hfsplus_file_fsync,
.open = hfsplus_file_open,
.release = hfsplus_file_release,
@@ -654,7 +657,7 @@ out:
return res;
}
-int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int hfsplus_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
@@ -673,7 +676,7 @@ int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
int hfsplus_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 948b8aaee33e..86351bdc8985 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -222,8 +222,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
error2 = hfsplus_submit_bio(sb,
sbi->part_start + HFSPLUS_VOLHEAD_SECTOR,
- sbi->s_vhdr_buf, NULL, REQ_OP_WRITE |
- REQ_SYNC);
+ sbi->s_vhdr_buf, NULL, REQ_OP_WRITE);
if (!error)
error = error2;
if (!write_backup)
@@ -231,8 +230,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
error2 = hfsplus_submit_bio(sb,
sbi->part_start + sbi->sect_count - 2,
- sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE |
- REQ_SYNC);
+ sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE);
if (!error)
error2 = error;
out:
@@ -508,7 +506,7 @@ static int hfsplus_fill_super(struct super_block *sb, struct fs_context *fc)
goto out_put_alloc_file;
}
- sb->s_d_op = &hfsplus_dentry_operations;
+ set_default_d_op(sb, &hfsplus_dentry_operations);
sb->s_root = d_make_root(root);
if (!sb->s_root) {
err = -ENOMEM;
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 73342c925a4b..36b6cf2a3abb 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -132,7 +132,14 @@ int hfsplus_uni2asc(struct super_block *sb,
op = astr;
ip = ustr->unicode;
+
ustrlen = be16_to_cpu(ustr->length);
+ if (ustrlen > HFSPLUS_MAX_STRLEN) {
+ ustrlen = HFSPLUS_MAX_STRLEN;
+ pr_err("invalid length %u has been corrected to %d\n",
+ be16_to_cpu(ustr->length), ustrlen);
+ }
+
len = *len_p;
ce1 = NULL;
compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
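
The clamp above bounds a corrupted on-disk length before the decode loop consumes it. Worked through with illustrative numbers (assuming HFSPLUS_MAX_STRLEN is 255, the format's maximum):

	/* Illustrative values only. */
	ustrlen = be16_to_cpu(ustr->length);	/* corrupted: 4000 */
	if (ustrlen > HFSPLUS_MAX_STRLEN)
		ustrlen = HFSPLUS_MAX_STRLEN;	/* clamped to 255 */

so hfsplus_uni2asc() can no longer walk past the unicode units that actually fit in a record.
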
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 9a1a93e3888b..18dc3d254d21 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -172,7 +172,11 @@ check_attr_tree_state_again:
return PTR_ERR(attr_file);
}
- BUG_ON(i_size_read(attr_file) != 0);
+ if (i_size_read(attr_file) != 0) {
+ err = -EIO;
+ pr_err("detected inconsistent attributes file, running fsck.hfsplus is recommended.\n");
+ goto end_attr_file_creation;
+ }
hip = HFSPLUS_I(attr_file);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 702c41317589..01e516175bcd 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -382,7 +382,7 @@ static const struct file_operations hostfs_file_fops = {
.splice_write = iter_file_splice_write,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.open = hostfs_open,
.release = hostfs_file_release,
.fsync = hostfs_fsync,
@@ -445,7 +445,8 @@ static int hostfs_read_folio(struct file *file, struct folio *folio)
return ret;
}
-static int hostfs_write_begin(struct file *file, struct address_space *mapping,
+static int hostfs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
@@ -458,7 +459,8 @@ static int hostfs_write_begin(struct file *file, struct address_space *mapping,
return 0;
}
-static int hostfs_write_end(struct file *file, struct address_space *mapping,
+static int hostfs_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
{
@@ -468,7 +470,7 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping,
int err;
buffer = kmap_local_folio(folio, from);
- err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer, copied);
+ err = write_file(FILE_HOSTFS_I(iocb->ki_filp)->fd, &pos, buffer, copied);
kunmap_local(buffer);
if (!folio_test_uptodate(folio) && err == folio_size(folio))
@@ -933,7 +935,7 @@ static int hostfs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_blocksize_bits = 10;
sb->s_magic = HOSTFS_SUPER_MAGIC;
sb->s_op = &hostfs_sbops;
- sb->s_d_op = &simple_dentry_operations;
+ sb->s_d_flags = DCACHE_DONTCACHE;
sb->s_maxbytes = MAX_LFS_FILESIZE;
err = super_setup_bdi(sb);
if (err)
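
hostfs tracks the VFS-wide signature change visible throughout this diff: ->write_begin and ->write_end now take a const struct kiocb * instead of a struct file *, and implementations that still need the file recover it from iocb->ki_filp. A minimal sketch under that assumption (foofs_* names are hypothetical):

static int foofs_write_begin(const struct kiocb *iocb,
			     struct address_space *mapping,
			     loff_t pos, unsigned len,
			     struct folio **foliop, void **fsdata)
{
	*foliop = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
				      FGP_WRITEBEGIN, mapping_gfp_mask(mapping));
	return IS_ERR(*foliop) ? PTR_ERR(*foliop) : 0;
}

static int foofs_write_end(const struct kiocb *iocb,
			   struct address_space *mapping,
			   loff_t pos, unsigned len, unsigned copied,
			   struct folio *folio, void *fsdata)
{
	struct file *file = iocb->ki_filp;	/* was a direct parameter */

	/* ... per-filesystem completion using file ... */
	folio_unlock(folio);
	folio_put(folio);
	return copied;
}
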
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 449a3fc1b8d9..263b5bbe1849 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -188,13 +188,14 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to)
hpfs_unlock(inode->i_sb);
}
-static int hpfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int hpfs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
int ret;
- ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata,
+ ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata,
hpfs_get_block,
&hpfs_i(mapping->host)->mmu_private);
if (unlikely(ret))
@@ -203,13 +204,14 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping,
return ret;
}
-static int hpfs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+static int hpfs_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
int err;
- err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ err = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
if (err < len)
hpfs_write_failed(mapping, pos + len);
if (!(err < 0)) {
@@ -255,7 +257,7 @@ const struct file_operations hpfs_file_ops =
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.release = hpfs_file_release,
.fsync = hpfs_file_fsync,
.splice_read = filemap_splice_read,
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 27567920abe4..42b779b4d87f 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -554,7 +554,7 @@ static int hpfs_fill_super(struct super_block *s, struct fs_context *fc)
/* Fill superblock stuff */
s->s_magic = HPFS_SUPER_MAGIC;
s->s_op = &hpfs_sops;
- s->s_d_op = &hpfs_dentry_operations;
+ set_default_d_op(s, &hpfs_dentry_operations);
s->s_time_min = local_to_gmt(s, 0);
s->s_time_max = local_to_gmt(s, U32_MAX);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e4de5425838d..09d4baef29cf 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -150,10 +150,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
if (inode->i_flags & S_PRIVATE)
vm_flags |= VM_NORESERVE;
- if (!hugetlb_reserve_pages(inode,
+ if (hugetlb_reserve_pages(inode,
vma->vm_pgoff >> huge_page_order(h),
len >> huge_page_shift(h), vma,
- vm_flags))
+ vm_flags) < 0)
goto out;
ret = 0;
@@ -179,12 +179,8 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
if (len & ~huge_page_mask(h))
return -EINVAL;
- if (flags & MAP_FIXED) {
- if (addr & ~huge_page_mask(h))
- return -EINVAL;
- if (prepare_hugepage_range(file, addr, len))
- return -EINVAL;
- }
+ if ((flags & MAP_FIXED) && (addr & ~huge_page_mask(h)))
+ return -EINVAL;
if (addr)
addr0 = ALIGN(addr, huge_page_size(h));
@@ -311,7 +307,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
return retval;
}
-static int hugetlbfs_write_begin(struct file *file,
+static int hugetlbfs_write_begin(const struct kiocb *iocb,
struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
@@ -319,9 +315,10 @@ static int hugetlbfs_write_begin(struct file *file,
return -EINVAL;
}
-static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+static int hugetlbfs_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
BUG();
return -EINVAL;
@@ -1433,6 +1430,7 @@ hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_blocksize_bits = huge_page_shift(ctx->hstate);
sb->s_magic = HUGETLBFS_MAGIC;
sb->s_op = &hugetlbfs_ops;
+ sb->s_d_flags = DCACHE_DONTCACHE;
sb->s_time_gran = 1;
/*
@@ -1561,9 +1559,9 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
inode->i_size = size;
clear_nlink(inode);
- if (!hugetlb_reserve_pages(inode, 0,
+ if (hugetlb_reserve_pages(inode, 0,
size >> huge_page_shift(hstate_inode(inode)), NULL,
- acctflag))
+ acctflag) < 0)
file = ERR_PTR(-ENOMEM);
else
file = alloc_file_pseudo(inode, mnt, name, O_RDWR,
@@ -1587,7 +1585,7 @@ static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h)
} else {
struct hugetlbfs_fs_context *ctx = fc->fs_private;
ctx->hstate = h;
- mnt = fc_mount(fc);
+ mnt = fc_mount_longterm(fc);
put_fs_context(fc);
}
if (IS_ERR(mnt))
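
Both hugetlbfs call sites flip from treating hugetlb_reserve_pages() as a boolean (where !ret meant failure) to an errno-style convention in which a negative return signals failure, presumably so a non-negative return can carry the number of reserved entries. Caller-side, the change looks like this sketch:

	/* Before (boolean-style, sketch):
	 *	if (!hugetlb_reserve_pages(inode, from, to, vma, vm_flags))
	 *		goto out;
	 * After (errno-style, as in the hunks above): */
	if (hugetlb_reserve_pages(inode, from, to, vma, vm_flags) < 0)
		goto out;
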
diff --git a/fs/inode.c b/fs/inode.c
index 99318b157a9a..01ebdc40021e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -865,12 +865,12 @@ static void dispose_list(struct list_head *head)
*/
void evict_inodes(struct super_block *sb)
{
- struct inode *inode, *next;
+ struct inode *inode;
LIST_HEAD(dispose);
again:
spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
+ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
if (atomic_read(&inode->i_count))
continue;
@@ -1158,9 +1158,8 @@ void lockdep_annotate_inode_mutex_key(struct inode *inode)
/* Set new key only if filesystem hasn't already changed it */
if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) {
/*
- * ensure nobody is actually holding i_mutex
+ * ensure nobody is actually holding i_rwsem
*/
- // mutex_destroy(&inode->i_mutex);
init_rwsem(&inode->i_rwsem);
lockdep_set_class(&inode->i_rwsem,
&type->i_mutex_dir_key);
@@ -2615,7 +2614,7 @@ EXPORT_SYMBOL(inode_dio_finished);
* proceed with a truncate or equivalent operation.
*
* Must be called under a lock that serializes taking new references
- * to i_dio_count, usually by inode->i_mutex.
+ * to i_dio_count, usually by inode->i_rwsem.
*/
void inode_dio_wait(struct inode *inode)
{
@@ -2633,7 +2632,7 @@ EXPORT_SYMBOL(inode_dio_wait_interruptible);
/*
* inode_set_flags - atomically set some inode flags
*
- * Note: the caller should be holding i_mutex, or else be sure that
+ * Note: the caller should be holding i_rwsem exclusively, or else be sure that
* they have exclusive access to the inode structure (i.e., while the
* inode is being instantiated). The reason for the cmpxchg() loop
* --- which wouldn't be necessary if all code paths which modify
@@ -2641,7 +2640,7 @@ EXPORT_SYMBOL(inode_dio_wait_interruptible);
* code path which doesn't today so we use cmpxchg() out of an abundance
* of caution.
*
- * In the long run, i_mutex is overkill, and we should probably look
+ * In the long run, i_rwsem is overkill, and we should probably look
* at using the i_lock spinlock to protect i_flags, and then make sure
* it is so documented in include/linux/fs.h and that all code follows
* the locking convention!!
diff --git a/fs/internal.h b/fs/internal.h
index 393f6c5c24f6..38e8aab27bbd 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -101,6 +101,7 @@ extern void chroot_fs_refs(const struct path *, const struct path *);
struct file *alloc_empty_file(int flags, const struct cred *cred);
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred);
struct file *alloc_empty_backing_file(int flags, const struct cred *cred);
+void backing_file_set_user_path(struct file *f, const struct path *path);
static inline void file_put_write_access(struct file *file)
{
@@ -322,12 +323,15 @@ struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns);
struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap);
void mnt_idmap_put(struct mnt_idmap *idmap);
struct stashed_operations {
+ struct dentry *(*stash_dentry)(struct dentry **stashed,
+ struct dentry *dentry);
void (*put_data)(void *data);
int (*init_inode)(struct inode *inode, void *data);
};
int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
struct path *path);
void stashed_dentry_prune(struct dentry *dentry);
+struct dentry *stash_dentry(struct dentry **stashed, struct dentry *dentry);
struct dentry *stashed_dentry_get(struct dentry **stashed);
/**
* path_mounted - check whether path is mounted
@@ -350,3 +354,4 @@ int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
unsigned int query_flags);
int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr);
+void pidfs_get_root(struct path *path);
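
struct stashed_operations gains a stash_dentry hook next to the newly exported default helper. A hedged sketch of how a pseudo-filesystem might fill it in (the foo_* callbacks are hypothetical; a filesystem wanting the old behaviour can presumably point the hook at the exported default):

static const struct stashed_operations foofs_stashed_ops = {
	.stash_dentry	= stash_dentry,		/* exported default */
	.put_data	= foo_put_data,
	.init_inode	= foo_init_inode,
};
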
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 69107a245b4c..0248cb8db2d3 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -453,315 +453,6 @@ out:
return ret;
}
-/**
- * fileattr_fill_xflags - initialize fileattr with xflags
- * @fa: fileattr pointer
- * @xflags: FS_XFLAG_* flags
- *
- * Set ->fsx_xflags, ->fsx_valid and ->flags (translated xflags). All
- * other fields are zeroed.
- */
-void fileattr_fill_xflags(struct fileattr *fa, u32 xflags)
-{
- memset(fa, 0, sizeof(*fa));
- fa->fsx_valid = true;
- fa->fsx_xflags = xflags;
- if (fa->fsx_xflags & FS_XFLAG_IMMUTABLE)
- fa->flags |= FS_IMMUTABLE_FL;
- if (fa->fsx_xflags & FS_XFLAG_APPEND)
- fa->flags |= FS_APPEND_FL;
- if (fa->fsx_xflags & FS_XFLAG_SYNC)
- fa->flags |= FS_SYNC_FL;
- if (fa->fsx_xflags & FS_XFLAG_NOATIME)
- fa->flags |= FS_NOATIME_FL;
- if (fa->fsx_xflags & FS_XFLAG_NODUMP)
- fa->flags |= FS_NODUMP_FL;
- if (fa->fsx_xflags & FS_XFLAG_DAX)
- fa->flags |= FS_DAX_FL;
- if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
- fa->flags |= FS_PROJINHERIT_FL;
-}
-EXPORT_SYMBOL(fileattr_fill_xflags);
-
-/**
- * fileattr_fill_flags - initialize fileattr with flags
- * @fa: fileattr pointer
- * @flags: FS_*_FL flags
- *
- * Set ->flags, ->flags_valid and ->fsx_xflags (translated flags).
- * All other fields are zeroed.
- */
-void fileattr_fill_flags(struct fileattr *fa, u32 flags)
-{
- memset(fa, 0, sizeof(*fa));
- fa->flags_valid = true;
- fa->flags = flags;
- if (fa->flags & FS_SYNC_FL)
- fa->fsx_xflags |= FS_XFLAG_SYNC;
- if (fa->flags & FS_IMMUTABLE_FL)
- fa->fsx_xflags |= FS_XFLAG_IMMUTABLE;
- if (fa->flags & FS_APPEND_FL)
- fa->fsx_xflags |= FS_XFLAG_APPEND;
- if (fa->flags & FS_NODUMP_FL)
- fa->fsx_xflags |= FS_XFLAG_NODUMP;
- if (fa->flags & FS_NOATIME_FL)
- fa->fsx_xflags |= FS_XFLAG_NOATIME;
- if (fa->flags & FS_DAX_FL)
- fa->fsx_xflags |= FS_XFLAG_DAX;
- if (fa->flags & FS_PROJINHERIT_FL)
- fa->fsx_xflags |= FS_XFLAG_PROJINHERIT;
-}
-EXPORT_SYMBOL(fileattr_fill_flags);
-
-/**
- * vfs_fileattr_get - retrieve miscellaneous file attributes
- * @dentry: the object to retrieve from
- * @fa: fileattr pointer
- *
- * Call i_op->fileattr_get() callback, if exists.
- *
- * Return: 0 on success, or a negative error on failure.
- */
-int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
-{
- struct inode *inode = d_inode(dentry);
-
- if (!inode->i_op->fileattr_get)
- return -ENOIOCTLCMD;
-
- return inode->i_op->fileattr_get(dentry, fa);
-}
-EXPORT_SYMBOL(vfs_fileattr_get);
-
-/**
- * copy_fsxattr_to_user - copy fsxattr to userspace.
- * @fa: fileattr pointer
- * @ufa: fsxattr user pointer
- *
- * Return: 0 on success, or -EFAULT on failure.
- */
-int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa)
-{
- struct fsxattr xfa;
-
- memset(&xfa, 0, sizeof(xfa));
- xfa.fsx_xflags = fa->fsx_xflags;
- xfa.fsx_extsize = fa->fsx_extsize;
- xfa.fsx_nextents = fa->fsx_nextents;
- xfa.fsx_projid = fa->fsx_projid;
- xfa.fsx_cowextsize = fa->fsx_cowextsize;
-
- if (copy_to_user(ufa, &xfa, sizeof(xfa)))
- return -EFAULT;
-
- return 0;
-}
-EXPORT_SYMBOL(copy_fsxattr_to_user);
-
-static int copy_fsxattr_from_user(struct fileattr *fa,
- struct fsxattr __user *ufa)
-{
- struct fsxattr xfa;
-
- if (copy_from_user(&xfa, ufa, sizeof(xfa)))
- return -EFAULT;
-
- fileattr_fill_xflags(fa, xfa.fsx_xflags);
- fa->fsx_extsize = xfa.fsx_extsize;
- fa->fsx_nextents = xfa.fsx_nextents;
- fa->fsx_projid = xfa.fsx_projid;
- fa->fsx_cowextsize = xfa.fsx_cowextsize;
-
- return 0;
-}
-
-/*
- * Generic function to check FS_IOC_FSSETXATTR/FS_IOC_SETFLAGS values and reject
- * any invalid configurations.
- *
- * Note: must be called with inode lock held.
- */
-static int fileattr_set_prepare(struct inode *inode,
- const struct fileattr *old_ma,
- struct fileattr *fa)
-{
- int err;
-
- /*
- * The IMMUTABLE and APPEND_ONLY flags can only be changed by
- * the relevant capability.
- */
- if ((fa->flags ^ old_ma->flags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) &&
- !capable(CAP_LINUX_IMMUTABLE))
- return -EPERM;
-
- err = fscrypt_prepare_setflags(inode, old_ma->flags, fa->flags);
- if (err)
- return err;
-
- /*
- * Project Quota ID state is only allowed to change from within the init
- * namespace. Enforce that restriction only if we are trying to change
- * the quota ID state. Everything else is allowed in user namespaces.
- */
- if (current_user_ns() != &init_user_ns) {
- if (old_ma->fsx_projid != fa->fsx_projid)
- return -EINVAL;
- if ((old_ma->fsx_xflags ^ fa->fsx_xflags) &
- FS_XFLAG_PROJINHERIT)
- return -EINVAL;
- } else {
- /*
- * Caller is allowed to change the project ID. If it is being
- * changed, make sure that the new value is valid.
- */
- if (old_ma->fsx_projid != fa->fsx_projid &&
- !projid_valid(make_kprojid(&init_user_ns, fa->fsx_projid)))
- return -EINVAL;
- }
-
- /* Check extent size hints. */
- if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode))
- return -EINVAL;
-
- if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
- !S_ISDIR(inode->i_mode))
- return -EINVAL;
-
- if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) &&
- !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
- return -EINVAL;
-
- /*
- * It is only valid to set the DAX flag on regular files and
- * directories on filesystems.
- */
- if ((fa->fsx_xflags & FS_XFLAG_DAX) &&
- !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
- return -EINVAL;
-
- /* Extent size hints of zero turn off the flags. */
- if (fa->fsx_extsize == 0)
- fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
- if (fa->fsx_cowextsize == 0)
- fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
-
- return 0;
-}
-
-/**
- * vfs_fileattr_set - change miscellaneous file attributes
- * @idmap: idmap of the mount
- * @dentry: the object to change
- * @fa: fileattr pointer
- *
- * After verifying permissions, call i_op->fileattr_set() callback, if
- * exists.
- *
- * Verifying attributes involves retrieving current attributes with
- * i_op->fileattr_get(), this also allows initializing attributes that have
- * not been set by the caller to current values. Inode lock is held
- * throughout to prevent racing with another instance.
- *
- * Return: 0 on success, or a negative error on failure.
- */
-int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry,
- struct fileattr *fa)
-{
- struct inode *inode = d_inode(dentry);
- struct fileattr old_ma = {};
- int err;
-
- if (!inode->i_op->fileattr_set)
- return -ENOIOCTLCMD;
-
- if (!inode_owner_or_capable(idmap, inode))
- return -EPERM;
-
- inode_lock(inode);
- err = vfs_fileattr_get(dentry, &old_ma);
- if (!err) {
- /* initialize missing bits from old_ma */
- if (fa->flags_valid) {
- fa->fsx_xflags |= old_ma.fsx_xflags & ~FS_XFLAG_COMMON;
- fa->fsx_extsize = old_ma.fsx_extsize;
- fa->fsx_nextents = old_ma.fsx_nextents;
- fa->fsx_projid = old_ma.fsx_projid;
- fa->fsx_cowextsize = old_ma.fsx_cowextsize;
- } else {
- fa->flags |= old_ma.flags & ~FS_COMMON_FL;
- }
- err = fileattr_set_prepare(inode, &old_ma, fa);
- if (!err)
- err = inode->i_op->fileattr_set(idmap, dentry, fa);
- }
- inode_unlock(inode);
-
- return err;
-}
-EXPORT_SYMBOL(vfs_fileattr_set);
-
-static int ioctl_getflags(struct file *file, unsigned int __user *argp)
-{
- struct fileattr fa = { .flags_valid = true }; /* hint only */
- int err;
-
- err = vfs_fileattr_get(file->f_path.dentry, &fa);
- if (!err)
- err = put_user(fa.flags, argp);
- return err;
-}
-
-static int ioctl_setflags(struct file *file, unsigned int __user *argp)
-{
- struct mnt_idmap *idmap = file_mnt_idmap(file);
- struct dentry *dentry = file->f_path.dentry;
- struct fileattr fa;
- unsigned int flags;
- int err;
-
- err = get_user(flags, argp);
- if (!err) {
- err = mnt_want_write_file(file);
- if (!err) {
- fileattr_fill_flags(&fa, flags);
- err = vfs_fileattr_set(idmap, dentry, &fa);
- mnt_drop_write_file(file);
- }
- }
- return err;
-}
-
-static int ioctl_fsgetxattr(struct file *file, void __user *argp)
-{
- struct fileattr fa = { .fsx_valid = true }; /* hint only */
- int err;
-
- err = vfs_fileattr_get(file->f_path.dentry, &fa);
- if (!err)
- err = copy_fsxattr_to_user(&fa, argp);
-
- return err;
-}
-
-static int ioctl_fssetxattr(struct file *file, void __user *argp)
-{
- struct mnt_idmap *idmap = file_mnt_idmap(file);
- struct dentry *dentry = file->f_path.dentry;
- struct fileattr fa;
- int err;
-
- err = copy_fsxattr_from_user(&fa, argp);
- if (!err) {
- err = mnt_want_write_file(file);
- if (!err) {
- err = vfs_fileattr_set(idmap, dentry, &fa);
- mnt_drop_write_file(file);
- }
- }
- return err;
-}
-
static int ioctl_getfsuuid(struct file *file, void __user *argp)
{
struct super_block *sb = file_inode(file)->i_sb;
diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
index 69e8ebb41302..f7e1c8534c46 100644
--- a/fs/iomap/Makefile
+++ b/fs/iomap/Makefile
@@ -9,9 +9,9 @@ ccflags-y += -I $(src) # needed for trace events
obj-$(CONFIG_FS_IOMAP) += iomap.o
iomap-y += trace.o \
- iter.o
-iomap-$(CONFIG_BLOCK) += buffered-io.o \
- direct-io.o \
+ iter.o \
+ buffered-io.o
+iomap-$(CONFIG_BLOCK) += direct-io.o \
ioend.o \
fiemap.o \
seek.o
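
The Makefile move makes buffered-io.o unconditional; the buffered-io.c diff below then guards the bio-based read helper with CONFIG_BLOCK and threads an explicit iomap_write_ops argument through the buffered write paths in place of iter->iomap.folio_ops. A hedged caller-side sketch (foofs_* names are hypothetical; every hook shown is optional per the new dispatch code):

static const struct iomap_write_ops foofs_write_ops = {
	.get_folio	  = foofs_get_folio,
	.put_folio	  = foofs_put_folio,
	.iomap_valid	  = foofs_iomap_valid,
	.read_folio_range = foofs_read_folio_range,	/* needed if !CONFIG_BLOCK */
};

static ssize_t foofs_buffered_write(struct kiocb *iocb, struct iov_iter *from)
{
	return iomap_file_buffered_write(iocb, from, &foofs_iomap_ops,
					 &foofs_write_ops, NULL);
}
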
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 233abf598f65..fd827398afd2 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -3,20 +3,11 @@
* Copyright (C) 2010 Red Hat, Inc.
* Copyright (C) 2016-2023 Christoph Hellwig.
*/
-#include <linux/module.h>
-#include <linux/compiler.h>
-#include <linux/fs.h>
#include <linux/iomap.h>
-#include <linux/pagemap.h>
-#include <linux/uio.h>
#include <linux/buffer_head.h>
-#include <linux/dax.h>
#include <linux/writeback.h>
#include <linux/swap.h>
-#include <linux/bio.h>
-#include <linux/sched/signal.h>
#include <linux/migrate.h>
-#include "internal.h"
#include "trace.h"
#include "../internal.h"
@@ -71,6 +62,9 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off,
unsigned long flags;
bool uptodate = true;
+ if (folio_test_uptodate(folio))
+ return;
+
if (ifs) {
spin_lock_irqsave(&ifs->state_lock, flags);
uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
@@ -284,6 +278,46 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
*lenp = plen;
}
+static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
+ loff_t pos)
+{
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
+
+ return srcmap->type != IOMAP_MAPPED ||
+ (srcmap->flags & IOMAP_F_NEW) ||
+ pos >= i_size_read(iter->inode);
+}
+
+/**
+ * iomap_read_inline_data - copy inline data into the page cache
+ * @iter: iteration structure
+ * @folio: folio to copy to
+ *
+ * Copy the inline data in @iter into @folio and zero out the rest of the folio.
+ * Only a single IOMAP_INLINE extent is allowed at the end of each file.
+ * Returns zero for success to complete the read, or the usual negative errno.
+ */
+static int iomap_read_inline_data(const struct iomap_iter *iter,
+ struct folio *folio)
+{
+ const struct iomap *iomap = iomap_iter_srcmap(iter);
+ size_t size = i_size_read(iter->inode) - iomap->offset;
+ size_t offset = offset_in_folio(folio, iomap->offset);
+
+ if (folio_test_uptodate(folio))
+ return 0;
+
+ if (WARN_ON_ONCE(size > iomap->length))
+ return -EIO;
+ if (offset > 0)
+ ifs_alloc(iter->inode, folio, iter->flags);
+
+ folio_fill_tail(folio, offset, iomap->inline_data, size);
+ iomap_set_range_uptodate(folio, offset, folio_size(folio) - offset);
+ return 0;
+}
+
+#ifdef CONFIG_BLOCK
static void iomap_finish_folio_read(struct folio *folio, size_t off,
size_t len, int error)
{
@@ -323,45 +357,6 @@ struct iomap_readpage_ctx {
struct readahead_control *rac;
};
-/**
- * iomap_read_inline_data - copy inline data into the page cache
- * @iter: iteration structure
- * @folio: folio to copy to
- *
- * Copy the inline data in @iter into @folio and zero out the rest of the folio.
- * Only a single IOMAP_INLINE extent is allowed at the end of each file.
- * Returns zero for success to complete the read, or the usual negative errno.
- */
-static int iomap_read_inline_data(const struct iomap_iter *iter,
- struct folio *folio)
-{
- const struct iomap *iomap = iomap_iter_srcmap(iter);
- size_t size = i_size_read(iter->inode) - iomap->offset;
- size_t offset = offset_in_folio(folio, iomap->offset);
-
- if (folio_test_uptodate(folio))
- return 0;
-
- if (WARN_ON_ONCE(size > iomap->length))
- return -EIO;
- if (offset > 0)
- ifs_alloc(iter->inode, folio, iter->flags);
-
- folio_fill_tail(folio, offset, iomap->inline_data, size);
- iomap_set_range_uptodate(folio, offset, folio_size(folio) - offset);
- return 0;
-}
-
-static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
- loff_t pos)
-{
- const struct iomap *srcmap = iomap_iter_srcmap(iter);
-
- return srcmap->type != IOMAP_MAPPED ||
- (srcmap->flags & IOMAP_F_NEW) ||
- pos >= i_size_read(iter->inode);
-}
-
static int iomap_readpage_iter(struct iomap_iter *iter,
struct iomap_readpage_ctx *ctx)
{
@@ -554,6 +549,27 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
}
EXPORT_SYMBOL_GPL(iomap_readahead);
+static int iomap_read_folio_range(const struct iomap_iter *iter,
+ struct folio *folio, loff_t pos, size_t len)
+{
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ struct bio_vec bvec;
+ struct bio bio;
+
+ bio_init(&bio, srcmap->bdev, &bvec, 1, REQ_OP_READ);
+ bio.bi_iter.bi_sector = iomap_sector(srcmap, pos);
+ bio_add_folio_nofail(&bio, folio, len, offset_in_folio(folio, pos));
+ return submit_bio_wait(&bio);
+}
+#else
+static int iomap_read_folio_range(const struct iomap_iter *iter,
+ struct folio *folio, loff_t pos, size_t len)
+{
+ WARN_ON_ONCE(1);
+ return -EIO;
+}
+#endif /* CONFIG_BLOCK */
+
/*
* iomap_is_partially_uptodate checks whether blocks within a folio are
* uptodate or not.
@@ -667,22 +683,10 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
pos + len - 1);
}
-static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
- size_t poff, size_t plen, const struct iomap *iomap)
-{
- struct bio_vec bvec;
- struct bio bio;
-
- bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ);
- bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
- bio_add_folio_nofail(&bio, folio, plen, poff);
- return submit_bio_wait(&bio);
-}
-
-static int __iomap_write_begin(const struct iomap_iter *iter, size_t len,
+static int __iomap_write_begin(const struct iomap_iter *iter,
+ const struct iomap_write_ops *write_ops, size_t len,
struct folio *folio)
{
- const struct iomap *srcmap = iomap_iter_srcmap(iter);
struct iomap_folio_state *ifs;
loff_t pos = iter->pos;
loff_t block_size = i_blocksize(iter->inode);
@@ -731,8 +735,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter, size_t len,
if (iter->flags & IOMAP_NOWAIT)
return -EAGAIN;
- status = iomap_read_folio_sync(block_start, folio,
- poff, plen, srcmap);
+ if (write_ops && write_ops->read_folio_range)
+ status = write_ops->read_folio_range(iter,
+ folio, block_start, plen);
+ else
+ status = iomap_read_folio_range(iter,
+ folio, block_start, plen);
if (status)
return status;
}
@@ -742,28 +750,27 @@ static int __iomap_write_begin(const struct iomap_iter *iter, size_t len,
return 0;
}
-static struct folio *__iomap_get_folio(struct iomap_iter *iter, size_t len)
+static struct folio *__iomap_get_folio(struct iomap_iter *iter,
+ const struct iomap_write_ops *write_ops, size_t len)
{
- const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
loff_t pos = iter->pos;
if (!mapping_large_folio_support(iter->inode->i_mapping))
len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));
- if (folio_ops && folio_ops->get_folio)
- return folio_ops->get_folio(iter, pos, len);
- else
- return iomap_get_folio(iter, pos, len);
+ if (write_ops && write_ops->get_folio)
+ return write_ops->get_folio(iter, pos, len);
+ return iomap_get_folio(iter, pos, len);
}
-static void __iomap_put_folio(struct iomap_iter *iter, size_t ret,
+static void __iomap_put_folio(struct iomap_iter *iter,
+ const struct iomap_write_ops *write_ops, size_t ret,
struct folio *folio)
{
- const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
loff_t pos = iter->pos;
- if (folio_ops && folio_ops->put_folio) {
- folio_ops->put_folio(iter->inode, pos, ret, folio);
+ if (write_ops && write_ops->put_folio) {
+ write_ops->put_folio(iter->inode, pos, ret, folio);
} else {
folio_unlock(folio);
folio_put(folio);
@@ -800,10 +807,10 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter,
* offset, and length. Callers can optionally pass a max length *plen,
* otherwise init to zero.
*/
-static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop,
+static int iomap_write_begin(struct iomap_iter *iter,
+ const struct iomap_write_ops *write_ops, struct folio **foliop,
size_t *poffset, u64 *plen)
{
- const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
loff_t pos = iter->pos;
u64 len = min_t(u64, SIZE_MAX, iomap_length(iter));
@@ -818,7 +825,7 @@ static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop,
if (fatal_signal_pending(current))
return -EINTR;
- folio = __iomap_get_folio(iter, len);
+ folio = __iomap_get_folio(iter, write_ops, len);
if (IS_ERR(folio))
return PTR_ERR(folio);
@@ -832,8 +839,8 @@ static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop,
* could do the wrong thing here (zero a page range incorrectly or fail
* to zero) and corrupt data.
*/
- if (folio_ops && folio_ops->iomap_valid) {
- bool iomap_valid = folio_ops->iomap_valid(iter->inode,
+ if (write_ops && write_ops->iomap_valid) {
+ bool iomap_valid = write_ops->iomap_valid(iter->inode,
&iter->iomap);
if (!iomap_valid) {
iter->iomap.flags |= IOMAP_F_STALE;
@@ -849,7 +856,7 @@ static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop,
else if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
status = __block_write_begin_int(folio, pos, len, NULL, srcmap);
else
- status = __iomap_write_begin(iter, len, folio);
+ status = __iomap_write_begin(iter, write_ops, len, folio);
if (unlikely(status))
goto out_unlock;
@@ -859,8 +866,7 @@ static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop,
return 0;
out_unlock:
- __iomap_put_folio(iter, 0, folio);
-
+ __iomap_put_folio(iter, write_ops, 0, folio);
return status;
}
@@ -923,8 +929,7 @@ static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied,
if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
size_t bh_written;
- bh_written = block_write_end(NULL, iter->inode->i_mapping, pos,
- len, copied, folio, NULL);
+ bh_written = block_write_end(pos, len, copied, folio);
WARN_ON_ONCE(bh_written != copied && bh_written != 0);
return bh_written == copied;
}
@@ -932,7 +937,8 @@ static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied,
return __iomap_write_end(iter->inode, pos, len, copied, folio);
}
-static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
+static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i,
+ const struct iomap_write_ops *write_ops)
{
ssize_t total_written = 0;
int status = 0;
@@ -976,7 +982,8 @@ retry:
break;
}
- status = iomap_write_begin(iter, &folio, &offset, &bytes);
+ status = iomap_write_begin(iter, write_ops, &folio, &offset,
+ &bytes);
if (unlikely(status)) {
iomap_write_failed(iter->inode, iter->pos, bytes);
break;
@@ -1005,7 +1012,7 @@ retry:
i_size_write(iter->inode, pos + written);
iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
}
- __iomap_put_folio(iter, written, folio);
+ __iomap_put_folio(iter, write_ops, written, folio);
if (old_size < pos)
pagecache_isize_extended(iter->inode, old_size, pos);
@@ -1038,7 +1045,8 @@ retry:
ssize_t
iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
- const struct iomap_ops *ops, void *private)
+ const struct iomap_ops *ops,
+ const struct iomap_write_ops *write_ops, void *private)
{
struct iomap_iter iter = {
.inode = iocb->ki_filp->f_mapping->host,
@@ -1055,7 +1063,7 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
iter.flags |= IOMAP_DONTCACHE;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.status = iomap_write_iter(&iter, i);
+ iter.status = iomap_write_iter(&iter, i, write_ops);
if (unlikely(iter.pos == iocb->ki_pos))
return ret;
@@ -1289,7 +1297,8 @@ void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
}
EXPORT_SYMBOL_GPL(iomap_write_delalloc_release);
-static int iomap_unshare_iter(struct iomap_iter *iter)
+static int iomap_unshare_iter(struct iomap_iter *iter,
+ const struct iomap_write_ops *write_ops)
{
struct iomap *iomap = &iter->iomap;
u64 bytes = iomap_length(iter);
@@ -1304,14 +1313,15 @@ static int iomap_unshare_iter(struct iomap_iter *iter)
bool ret;
bytes = min_t(u64, SIZE_MAX, bytes);
- status = iomap_write_begin(iter, &folio, &offset, &bytes);
+ status = iomap_write_begin(iter, write_ops, &folio, &offset,
+ &bytes);
if (unlikely(status))
return status;
if (iomap->flags & IOMAP_F_STALE)
break;
ret = iomap_write_end(iter, bytes, bytes, folio);
- __iomap_put_folio(iter, bytes, folio);
+ __iomap_put_folio(iter, write_ops, bytes, folio);
if (WARN_ON_ONCE(!ret))
return -EIO;
@@ -1329,7 +1339,8 @@ static int iomap_unshare_iter(struct iomap_iter *iter)
int
iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
- const struct iomap_ops *ops)
+ const struct iomap_ops *ops,
+ const struct iomap_write_ops *write_ops)
{
struct iomap_iter iter = {
.inode = inode,
@@ -1344,7 +1355,7 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
iter.len = min(len, size - pos);
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.status = iomap_unshare_iter(&iter);
+ iter.status = iomap_unshare_iter(&iter, write_ops);
return ret;
}
EXPORT_SYMBOL_GPL(iomap_file_unshare);
@@ -1363,7 +1374,8 @@ static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i)
return filemap_write_and_wait_range(mapping, i->pos, end);
}
-static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
+static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
+ const struct iomap_write_ops *write_ops)
{
u64 bytes = iomap_length(iter);
int status;
@@ -1374,7 +1386,8 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
bool ret;
bytes = min_t(u64, SIZE_MAX, bytes);
- status = iomap_write_begin(iter, &folio, &offset, &bytes);
+ status = iomap_write_begin(iter, write_ops, &folio, &offset,
+ &bytes);
if (status)
return status;
if (iter->iomap.flags & IOMAP_F_STALE)
@@ -1387,7 +1400,7 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
folio_mark_accessed(folio);
ret = iomap_write_end(iter, bytes, bytes, folio);
- __iomap_put_folio(iter, bytes, folio);
+ __iomap_put_folio(iter, write_ops, bytes, folio);
if (WARN_ON_ONCE(!ret))
return -EIO;
@@ -1403,7 +1416,8 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
int
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
- const struct iomap_ops *ops, void *private)
+ const struct iomap_ops *ops,
+ const struct iomap_write_ops *write_ops, void *private)
{
struct iomap_iter iter = {
.inode = inode,
@@ -1433,7 +1447,8 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) {
iter.len = plen;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.status = iomap_zero_iter(&iter, did_zero);
+ iter.status = iomap_zero_iter(&iter, did_zero,
+ write_ops);
iter.len = len - (iter.pos - pos);
if (ret || !iter.len)
@@ -1464,7 +1479,7 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
continue;
}
- iter.status = iomap_zero_iter(&iter, did_zero);
+ iter.status = iomap_zero_iter(&iter, did_zero, write_ops);
}
return ret;
}
@@ -1472,7 +1487,8 @@ EXPORT_SYMBOL_GPL(iomap_zero_range);
int
iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
- const struct iomap_ops *ops, void *private)
+ const struct iomap_ops *ops,
+ const struct iomap_write_ops *write_ops, void *private)
{
unsigned int blocksize = i_blocksize(inode);
unsigned int off = pos & (blocksize - 1);
@@ -1481,7 +1497,7 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
if (!off)
return 0;
return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops,
- private);
+ write_ops, private);
}
EXPORT_SYMBOL_GPL(iomap_truncate_page);
@@ -1535,278 +1551,54 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
-static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
+void iomap_start_folio_write(struct inode *inode, struct folio *folio,
size_t len)
{
struct iomap_folio_state *ifs = folio->private;
WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
- WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
-
- if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending))
- folio_end_writeback(folio);
-}
-
-/*
- * We're now finished for good with this ioend structure. Update the page
- * state, release holds on bios, and finally free up memory. Do not use the
- * ioend after this.
- */
-u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
-{
- struct inode *inode = ioend->io_inode;
- struct bio *bio = &ioend->io_bio;
- struct folio_iter fi;
- u32 folio_count = 0;
-
- if (ioend->io_error) {
- mapping_set_error(inode->i_mapping, ioend->io_error);
- if (!bio_flagged(bio, BIO_QUIET)) {
- pr_err_ratelimited(
-"%s: writeback error on inode %lu, offset %lld, sector %llu",
- inode->i_sb->s_id, inode->i_ino,
- ioend->io_offset, ioend->io_sector);
- }
- }
-
- /* walk all folios in bio, ending page IO on them */
- bio_for_each_folio_all(fi, bio) {
- iomap_finish_folio_write(inode, fi.folio, fi.length);
- folio_count++;
- }
-
- bio_put(bio); /* frees the ioend */
- return folio_count;
-}
-
-static void iomap_writepage_end_bio(struct bio *bio)
-{
- struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
-
- ioend->io_error = blk_status_to_errno(bio->bi_status);
- iomap_finish_ioend_buffered(ioend);
-}
-
-/*
- * Submit an ioend.
- *
- * If @error is non-zero, it means that we have a situation where some part of
- * the submission process has failed after we've marked pages for writeback.
- * We cannot cancel ioend directly in that case, so call the bio end I/O handler
- * with the error status here to run the normal I/O completion handler to clear
- * the writeback bit and let the file system proess the errors.
- */
-static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
-{
- if (!wpc->ioend)
- return error;
-
- /*
- * Let the file systems prepare the I/O submission and hook in an I/O
- * comletion handler. This also needs to happen in case after a
- * failure happened so that the file system end I/O handler gets called
- * to clean up.
- */
- if (wpc->ops->submit_ioend) {
- error = wpc->ops->submit_ioend(wpc, error);
- } else {
- if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE))
- error = -EIO;
- if (!error)
- submit_bio(&wpc->ioend->io_bio);
- }
-
- if (error) {
- wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
- bio_endio(&wpc->ioend->io_bio);
- }
-
- wpc->ioend = NULL;
- return error;
-}
-
-static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
- struct writeback_control *wbc, struct inode *inode, loff_t pos,
- u16 ioend_flags)
-{
- struct bio *bio;
-
- bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
- REQ_OP_WRITE | wbc_to_write_flags(wbc),
- GFP_NOFS, &iomap_ioend_bioset);
- bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
- bio->bi_end_io = iomap_writepage_end_bio;
- bio->bi_write_hint = inode->i_write_hint;
- wbc_init_bio(wbc, bio);
- wpc->nr_folios = 0;
- return iomap_init_ioend(inode, bio, pos, ioend_flags);
-}
-
-static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
- u16 ioend_flags)
-{
- if (ioend_flags & IOMAP_IOEND_BOUNDARY)
- return false;
- if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
- (wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
- return false;
- if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
- return false;
- if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) &&
- iomap_sector(&wpc->iomap, pos) !=
- bio_end_sector(&wpc->ioend->io_bio))
- return false;
- /*
- * Limit ioend bio chain lengths to minimise IO completion latency. This
- * also prevents long tight loops ending page writeback on all the
- * folios in the ioend.
- */
- if (wpc->nr_folios >= IOEND_BATCH_SIZE)
- return false;
- return true;
+ if (ifs)
+ atomic_add(len, &ifs->write_bytes_pending);
}
+EXPORT_SYMBOL_GPL(iomap_start_folio_write);
-/*
- * Test to see if we have an existing ioend structure that we could append to
- * first; otherwise finish off the current ioend and start another.
- *
- * If a new ioend is created and cached, the old ioend is submitted to the block
- * layer instantly. Batching optimisations are provided by higher level block
- * plugging.
- *
- * At the end of a writeback pass, there will be a cached ioend remaining on the
- * writepage context that the caller will need to submit.
- */
-static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
- struct writeback_control *wbc, struct folio *folio,
- struct inode *inode, loff_t pos, loff_t end_pos,
- unsigned len)
+void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
+ size_t len)
{
struct iomap_folio_state *ifs = folio->private;
- size_t poff = offset_in_folio(folio, pos);
- unsigned int ioend_flags = 0;
- int error;
-
- if (wpc->iomap.type == IOMAP_UNWRITTEN)
- ioend_flags |= IOMAP_IOEND_UNWRITTEN;
- if (wpc->iomap.flags & IOMAP_F_SHARED)
- ioend_flags |= IOMAP_IOEND_SHARED;
- if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
- ioend_flags |= IOMAP_IOEND_BOUNDARY;
- if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
-new_ioend:
- error = iomap_submit_ioend(wpc, 0);
- if (error)
- return error;
- wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos,
- ioend_flags);
- }
-
- if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
- goto new_ioend;
-
- if (ifs)
- atomic_add(len, &ifs->write_bytes_pending);
-
- /*
- * Clamp io_offset and io_size to the incore EOF so that ondisk
- * file size updates in the ioend completion are byte-accurate.
- * This avoids recovering files with zeroed tail regions when
- * writeback races with appending writes:
- *
- * Thread 1: Thread 2:
- * ------------ -----------
- * write [A, A+B]
- * update inode size to A+B
- * submit I/O [A, A+BS]
- * write [A+B, A+B+C]
- * update inode size to A+B+C
- * <I/O completes, updates disk size to min(A+B+C, A+BS)>
- * <power failure>
- *
- * After reboot:
- * 1) with A+B+C < A+BS, the file has zero padding in range
- * [A+B, A+B+C]
- *
- * |< Block Size (BS) >|
- * |DDDDDDDDDDDD0000000000000|
- * ^ ^ ^
- * A A+B A+B+C
- * (EOF)
- *
- * 2) with A+B+C > A+BS, the file has zero padding in range
- * [A+B, A+BS]
- *
- * |< Block Size (BS) >|< Block Size (BS) >|
- * |DDDDDDDDDDDD0000000000000|00000000000000000000000000|
- * ^ ^ ^ ^
- * A A+B A+BS A+B+C
- * (EOF)
- *
- * D = Valid Data
- * 0 = Zero Padding
- *
- * Note that this defeats the ability to chain the ioends of
- * appending writes.
- */
- wpc->ioend->io_size += len;
- if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos)
- wpc->ioend->io_size = end_pos - wpc->ioend->io_offset;
+ WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
+ WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
- wbc_account_cgroup_owner(wbc, folio, len);
- return 0;
+ if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending))
+ folio_end_writeback(folio);
}
+EXPORT_SYMBOL_GPL(iomap_finish_folio_write);
-static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
- struct writeback_control *wbc, struct folio *folio,
- struct inode *inode, u64 pos, u64 end_pos,
- unsigned dirty_len, unsigned *count)
+static int iomap_writeback_range(struct iomap_writepage_ctx *wpc,
+ struct folio *folio, u64 pos, u32 rlen, u64 end_pos,
+ bool *wb_pending)
{
- int error;
-
do {
- unsigned map_len;
-
- error = wpc->ops->map_blocks(wpc, inode, pos, dirty_len);
- if (error)
- break;
- trace_iomap_writepage_map(inode, pos, dirty_len, &wpc->iomap);
+ ssize_t ret;
- map_len = min_t(u64, dirty_len,
- wpc->iomap.offset + wpc->iomap.length - pos);
- WARN_ON_ONCE(!folio->private && map_len < dirty_len);
+ ret = wpc->ops->writeback_range(wpc, folio, pos, rlen, end_pos);
+ if (WARN_ON_ONCE(ret == 0 || ret > rlen))
+ return -EIO;
+ if (ret < 0)
+ return ret;
+ rlen -= ret;
+ pos += ret;
- switch (wpc->iomap.type) {
- case IOMAP_INLINE:
- WARN_ON_ONCE(1);
- error = -EIO;
- break;
- case IOMAP_HOLE:
- break;
- default:
- error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos,
- end_pos, map_len);
- if (!error)
- (*count)++;
- break;
- }
- dirty_len -= map_len;
- pos += map_len;
- } while (dirty_len && !error);
+ /*
+	 * Holes are not written back by ->writeback_range, so track
+	 * whether we handled anything that is not a hole here.
+ */
+ if (wpc->iomap.type != IOMAP_HOLE)
+ *wb_pending = true;
+ } while (rlen);
- /*
- * We cannot cancel the ioend directly here on error. We may have
- * already set other pages under writeback and hence we have to run I/O
- * completion to mark the error state of the pages under writeback
- * appropriately.
- *
- * Just let the file system know what portion of the folio failed to
- * map.
- */
- if (error && wpc->ops->discard_folio)
- wpc->ops->discard_folio(folio, pos);
- return error;
+ return 0;
}
/*
@@ -1815,7 +1607,7 @@ static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
* If the folio is entirely beyond i_size, return false. If it straddles
* i_size, adjust end_pos and zero all data beyond i_size.
*/
-static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode,
+static bool iomap_writeback_handle_eof(struct folio *folio, struct inode *inode,
u64 *end_pos)
{
u64 isize = i_size_read(inode);
@@ -1867,15 +1659,14 @@ static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode,
return true;
}
-static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
- struct writeback_control *wbc, struct folio *folio)
+int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio)
{
struct iomap_folio_state *ifs = folio->private;
- struct inode *inode = folio->mapping->host;
+ struct inode *inode = wpc->inode;
u64 pos = folio_pos(folio);
u64 end_pos = pos + folio_size(folio);
u64 end_aligned = 0;
- unsigned count = 0;
+ bool wb_pending = false;
int error = 0;
u32 rlen;
@@ -1883,12 +1674,10 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
WARN_ON_ONCE(folio_test_dirty(folio));
WARN_ON_ONCE(folio_test_writeback(folio));
- trace_iomap_writepage(inode, pos, folio_size(folio));
+ trace_iomap_writeback_folio(inode, pos, folio_size(folio));
- if (!iomap_writepage_handle_eof(folio, inode, &end_pos)) {
- folio_unlock(folio);
+ if (!iomap_writeback_handle_eof(folio, inode, &end_pos))
return 0;
- }
WARN_ON_ONCE(end_pos <= pos);
if (i_blocks_per_folio(inode, folio) > 1) {
@@ -1904,7 +1693,7 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
* all blocks.
*/
WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
- atomic_inc(&ifs->write_bytes_pending);
+ iomap_start_folio_write(inode, folio, 1);
}
/*
@@ -1918,14 +1707,14 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
*/
end_aligned = round_up(end_pos, i_blocksize(inode));
while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) {
- error = iomap_writepage_map_blocks(wpc, wbc, folio, inode,
- pos, end_pos, rlen, &count);
+ error = iomap_writeback_range(wpc, folio, pos, rlen, end_pos,
+ &wb_pending);
if (error)
break;
pos += rlen;
}
- if (count)
+ if (wb_pending)
wpc->nr_folios++;
/*
@@ -1942,23 +1731,22 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
* already at this point. In that case we need to clear the writeback
* bit ourselves right after unlocking the page.
*/
- folio_unlock(folio);
if (ifs) {
if (atomic_dec_and_test(&ifs->write_bytes_pending))
folio_end_writeback(folio);
} else {
- if (!count)
+ if (!wb_pending)
folio_end_writeback(folio);
}
mapping_set_error(inode->i_mapping, error);
return error;
}
+EXPORT_SYMBOL_GPL(iomap_writeback_folio);
int
-iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
- struct iomap_writepage_ctx *wpc,
- const struct iomap_writeback_ops *ops)
+iomap_writepages(struct iomap_writepage_ctx *wpc)
{
+ struct address_space *mapping = wpc->inode->i_mapping;
struct folio *folio = NULL;
int error;
@@ -1970,9 +1758,22 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
PF_MEMALLOC))
return -EIO;
- wpc->ops = ops;
- while ((folio = writeback_iter(mapping, wbc, folio, &error)))
- error = iomap_writepage_map(wpc, wbc, folio);
- return iomap_submit_ioend(wpc, error);
+ while ((folio = writeback_iter(mapping, wpc->wbc, folio, &error))) {
+ error = iomap_writeback_folio(wpc, folio);
+ folio_unlock(folio);
+ }
+
+ /*
+ * If @error is non-zero, it means that we have a situation where some
+ * part of the submission process has failed after we've marked pages
+ * for writeback.
+ *
+ * We cannot cancel the writeback directly in that case, so always call
+ * ->writeback_submit to run the I/O completion handler to clear the
+	 * writeback bit and let the file system process the errors.
+ */
+ if (wpc->wb_ctx)
+ return wpc->ops->writeback_submit(wpc, error);
+ return error;
}
EXPORT_SYMBOL_GPL(iomap_writepages);
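
Taken together, the buffered-io changes fold the old ->map_blocks/iomap-type dispatch into a single ->writeback_range method that both maps and queues I/O and returns the number of bytes it consumed. A hedged sketch of a filesystem-side implementation under the new contract; myfs_map_blocks is a hypothetical mapping helper, while iomap_add_to_ioend() and iomap_ioend_writeback_submit() are the helpers added by this series:

static ssize_t myfs_writeback_range(struct iomap_writepage_ctx *wpc,
		struct folio *folio, u64 pos, unsigned int dirty_len,
		u64 end_pos)
{
	int error;

	/* Resolve the iomap covering pos, as ->map_blocks used to do. */
	error = myfs_map_blocks(wpc, pos, dirty_len);
	if (error)
		return error;

	/*
	 * iomap_add_to_ioend() returns the number of bytes covered
	 * (for IOMAP_HOLE simply the mapped length), so the loop in
	 * iomap_writeback_range() can advance over the remainder.
	 */
	return iomap_add_to_ioend(wpc, folio, pos, end_pos, dirty_len);
}

static const struct iomap_writeback_ops myfs_writeback_ops = {
	.writeback_range	= myfs_writeback_range,
	.writeback_submit	= iomap_ioend_writeback_submit,
};
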
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 844261a31156..6f25d4cfea9f 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -3,14 +3,9 @@
* Copyright (C) 2010 Red Hat, Inc.
* Copyright (c) 2016-2025 Christoph Hellwig.
*/
-#include <linux/module.h>
-#include <linux/compiler.h>
-#include <linux/fs.h>
#include <linux/fscrypt.h>
#include <linux/pagemap.h>
#include <linux/iomap.h>
-#include <linux/backing-dev.h>
-#include <linux/uio.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"
#include "trace.h"
diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c
index 80675c42e94e..d11dadff8286 100644
--- a/fs/iomap/fiemap.c
+++ b/fs/iomap/fiemap.c
@@ -2,9 +2,6 @@
/*
* Copyright (c) 2016-2021 Christoph Hellwig.
*/
-#include <linux/module.h>
-#include <linux/compiler.h>
-#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/fiemap.h>
#include <linux/pagemap.h>
diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h
index f6992a3bf66a..d05cb3aed96e 100644
--- a/fs/iomap/internal.h
+++ b/fs/iomap/internal.h
@@ -4,7 +4,6 @@
#define IOEND_BATCH_SIZE 4096
-u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend);
u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend);
#endif /* _IOMAP_INTERNAL_H */
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
index 18894ebba6db..b49fa75eab26 100644
--- a/fs/iomap/ioend.c
+++ b/fs/iomap/ioend.c
@@ -1,10 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Copyright (c) 2024-2025 Christoph Hellwig.
+ * Copyright (c) 2016-2025 Christoph Hellwig.
*/
#include <linux/iomap.h>
#include <linux/list_sort.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
#include "internal.h"
+#include "trace.h"
struct bio_set iomap_ioend_bioset;
EXPORT_SYMBOL_GPL(iomap_ioend_bioset);
@@ -28,6 +31,221 @@ struct iomap_ioend *iomap_init_ioend(struct inode *inode,
}
EXPORT_SYMBOL_GPL(iomap_init_ioend);
+/*
+ * We're now finished for good with this ioend structure. Update the folio
+ * state, release holds on bios, and finally free up memory. Do not use the
+ * ioend after this.
+ */
+static u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
+{
+ struct inode *inode = ioend->io_inode;
+ struct bio *bio = &ioend->io_bio;
+ struct folio_iter fi;
+ u32 folio_count = 0;
+
+ if (ioend->io_error) {
+ mapping_set_error(inode->i_mapping, ioend->io_error);
+ if (!bio_flagged(bio, BIO_QUIET)) {
+ pr_err_ratelimited(
+"%s: writeback error on inode %lu, offset %lld, sector %llu",
+ inode->i_sb->s_id, inode->i_ino,
+ ioend->io_offset, ioend->io_sector);
+ }
+ }
+
+ /* walk all folios in bio, ending page IO on them */
+ bio_for_each_folio_all(fi, bio) {
+ iomap_finish_folio_write(inode, fi.folio, fi.length);
+ folio_count++;
+ }
+
+ bio_put(bio); /* frees the ioend */
+ return folio_count;
+}
+
+static void ioend_writeback_end_bio(struct bio *bio)
+{
+ struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
+
+ ioend->io_error = blk_status_to_errno(bio->bi_status);
+ iomap_finish_ioend_buffered(ioend);
+}
+
+/*
+ * We cannot cancel the ioend directly in case of an error, so call the bio end
+ * I/O handler with the error status here to run the normal I/O completion
+ * handler.
+ */
+int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error)
+{
+ struct iomap_ioend *ioend = wpc->wb_ctx;
+
+ if (!ioend->io_bio.bi_end_io)
+ ioend->io_bio.bi_end_io = ioend_writeback_end_bio;
+
+ if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE))
+ error = -EIO;
+
+ if (error) {
+ ioend->io_bio.bi_status = errno_to_blk_status(error);
+ bio_endio(&ioend->io_bio);
+ return error;
+ }
+
+ submit_bio(&ioend->io_bio);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_ioend_writeback_submit);
+
+static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
+ loff_t pos, u16 ioend_flags)
+{
+ struct bio *bio;
+
+ bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
+ REQ_OP_WRITE | wbc_to_write_flags(wpc->wbc),
+ GFP_NOFS, &iomap_ioend_bioset);
+ bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
+ bio->bi_write_hint = wpc->inode->i_write_hint;
+ wbc_init_bio(wpc->wbc, bio);
+ wpc->nr_folios = 0;
+ return iomap_init_ioend(wpc->inode, bio, pos, ioend_flags);
+}
+
+static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
+ u16 ioend_flags)
+{
+ struct iomap_ioend *ioend = wpc->wb_ctx;
+
+ if (ioend_flags & IOMAP_IOEND_BOUNDARY)
+ return false;
+ if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
+ (ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
+ return false;
+ if (pos != ioend->io_offset + ioend->io_size)
+ return false;
+ if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) &&
+ iomap_sector(&wpc->iomap, pos) != bio_end_sector(&ioend->io_bio))
+ return false;
+ /*
+ * Limit ioend bio chain lengths to minimise IO completion latency. This
+ * also prevents long tight loops ending page writeback on all the
+ * folios in the ioend.
+ */
+ if (wpc->nr_folios >= IOEND_BATCH_SIZE)
+ return false;
+ return true;
+}
+
+/*
+ * Test to see if we have an existing ioend structure that we could append to
+ * first; otherwise finish off the current ioend and start another.
+ *
+ * If a new ioend is created and cached, the old ioend is submitted to the block
+ * layer instantly. Batching optimisations are provided by higher level block
+ * plugging.
+ *
+ * At the end of a writeback pass, there will be a cached ioend remaining on the
+ * writepage context that the caller will need to submit.
+ */
+ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
+ loff_t pos, loff_t end_pos, unsigned int dirty_len)
+{
+ struct iomap_ioend *ioend = wpc->wb_ctx;
+ size_t poff = offset_in_folio(folio, pos);
+ unsigned int ioend_flags = 0;
+ unsigned int map_len = min_t(u64, dirty_len,
+ wpc->iomap.offset + wpc->iomap.length - pos);
+ int error;
+
+ trace_iomap_add_to_ioend(wpc->inode, pos, dirty_len, &wpc->iomap);
+
+ WARN_ON_ONCE(!folio->private && map_len < dirty_len);
+
+ switch (wpc->iomap.type) {
+ case IOMAP_INLINE:
+ WARN_ON_ONCE(1);
+ return -EIO;
+ case IOMAP_HOLE:
+ return map_len;
+ default:
+ break;
+ }
+
+ if (wpc->iomap.type == IOMAP_UNWRITTEN)
+ ioend_flags |= IOMAP_IOEND_UNWRITTEN;
+ if (wpc->iomap.flags & IOMAP_F_SHARED)
+ ioend_flags |= IOMAP_IOEND_SHARED;
+ if (folio_test_dropbehind(folio))
+ ioend_flags |= IOMAP_IOEND_DONTCACHE;
+ if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
+ ioend_flags |= IOMAP_IOEND_BOUNDARY;
+
+ if (!ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
+new_ioend:
+ if (ioend) {
+ error = wpc->ops->writeback_submit(wpc, 0);
+ if (error)
+ return error;
+ }
+ wpc->wb_ctx = ioend = iomap_alloc_ioend(wpc, pos, ioend_flags);
+ }
+
+ if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff))
+ goto new_ioend;
+
+ iomap_start_folio_write(wpc->inode, folio, map_len);
+
+ /*
+ * Clamp io_offset and io_size to the incore EOF so that ondisk
+ * file size updates in the ioend completion are byte-accurate.
+ * This avoids recovering files with zeroed tail regions when
+ * writeback races with appending writes:
+ *
+ * Thread 1: Thread 2:
+ * ------------ -----------
+ * write [A, A+B]
+ * update inode size to A+B
+ * submit I/O [A, A+BS]
+ * write [A+B, A+B+C]
+ * update inode size to A+B+C
+ * <I/O completes, updates disk size to min(A+B+C, A+BS)>
+ * <power failure>
+ *
+ * After reboot:
+ * 1) with A+B+C < A+BS, the file has zero padding in range
+ * [A+B, A+B+C]
+ *
+ * |< Block Size (BS) >|
+ * |DDDDDDDDDDDD0000000000000|
+ * ^ ^ ^
+ * A A+B A+B+C
+ * (EOF)
+ *
+ * 2) with A+B+C > A+BS, the file has zero padding in range
+ * [A+B, A+BS]
+ *
+ * |< Block Size (BS) >|< Block Size (BS) >|
+ * |DDDDDDDDDDDD0000000000000|00000000000000000000000000|
+ * ^ ^ ^ ^
+ * A A+B A+BS A+B+C
+ * (EOF)
+ *
+ * D = Valid Data
+ * 0 = Zero Padding
+ *
+ * Note that this defeats the ability to chain the ioends of
+ * appending writes.
+ */
+ ioend->io_size += map_len;
+ if (ioend->io_offset + ioend->io_size > end_pos)
+ ioend->io_size = end_pos - ioend->io_offset;
+
+ wbc_account_cgroup_owner(wpc->wbc, folio, map_len);
+ return map_len;
+}
+EXPORT_SYMBOL_GPL(iomap_add_to_ioend);
+
static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error)
{
if (ioend->io_parent) {
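
Note that iomap_ioend_writeback_submit() only installs its default end_io handler when the bio does not already have one, so a filesystem that needs extra completion work can hook in before delegating. A minimal sketch, with myfs_end_bio as a hypothetical bi_end_io handler:

static int myfs_writeback_submit(struct iomap_writepage_ctx *wpc, int error)
{
	struct iomap_ioend *ioend = wpc->wb_ctx;

	/* Run our own completion; it must eventually finish the ioend. */
	ioend->io_bio.bi_end_io = myfs_end_bio;
	return iomap_ioend_writeback_submit(wpc, error);
}
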
diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c
index 6ffc6a7b9ba5..cef77ca0c20b 100644
--- a/fs/iomap/iter.c
+++ b/fs/iomap/iter.c
@@ -3,7 +3,6 @@
* Copyright (C) 2010 Red Hat, Inc.
* Copyright (c) 2016-2021 Christoph Hellwig.
*/
-#include <linux/fs.h>
#include <linux/iomap.h>
#include "trace.h"
diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c
index 04d7919636c1..56db2dd4b10d 100644
--- a/fs/iomap/seek.c
+++ b/fs/iomap/seek.c
@@ -3,12 +3,8 @@
* Copyright (C) 2017 Red Hat, Inc.
* Copyright (c) 2018-2021 Christoph Hellwig.
*/
-#include <linux/module.h>
-#include <linux/compiler.h>
-#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/pagemap.h>
-#include <linux/pagevec.h>
static int iomap_seek_hole_iter(struct iomap_iter *iter,
loff_t *hole_pos)
diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c
index c1a762c10ce4..0db77c449467 100644
--- a/fs/iomap/swapfile.c
+++ b/fs/iomap/swapfile.c
@@ -3,9 +3,6 @@
* Copyright (C) 2018 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <darrick.wong@oracle.com>
*/
-#include <linux/module.h>
-#include <linux/compiler.h>
-#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/swap.h>
diff --git a/fs/iomap/trace.c b/fs/iomap/trace.c
index 728d5443daf5..da217246b1a9 100644
--- a/fs/iomap/trace.c
+++ b/fs/iomap/trace.c
@@ -3,7 +3,6 @@
* Copyright (c) 2019 Christoph Hellwig
*/
#include <linux/iomap.h>
-#include <linux/uio.h>
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 455cc6f90be0..6ad66e6ba653 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -79,7 +79,7 @@ DECLARE_EVENT_CLASS(iomap_range_class,
DEFINE_EVENT(iomap_range_class, name, \
TP_PROTO(struct inode *inode, loff_t off, u64 len),\
TP_ARGS(inode, off, len))
-DEFINE_RANGE_EVENT(iomap_writepage);
+DEFINE_RANGE_EVENT(iomap_writeback_folio);
DEFINE_RANGE_EVENT(iomap_release_folio);
DEFINE_RANGE_EVENT(iomap_invalidate_folio);
DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail);
@@ -169,7 +169,7 @@ DEFINE_EVENT(iomap_class, name, \
DEFINE_IOMAP_EVENT(iomap_iter_dstmap);
DEFINE_IOMAP_EVENT(iomap_iter_srcmap);
-TRACE_EVENT(iomap_writepage_map,
+TRACE_EVENT(iomap_add_to_ioend,
TP_PROTO(struct inode *inode, u64 pos, unsigned int dirty_len,
struct iomap *iomap),
TP_ARGS(inode, pos, dirty_len, iomap),
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index d5da9817df9b..6f0e6b19383c 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -939,7 +939,7 @@ root_found:
sbi->s_check = opt->check;
if (table)
- s->s_d_op = &isofs_dentry_ops[table - 1];
+ set_default_d_op(s, &isofs_dentry_ops[table - 1]);
/* get the root dentry */
s->s_root = d_make_root(inode);
@@ -1440,9 +1440,16 @@ static int isofs_read_inode(struct inode *inode, int relocated)
inode->i_op = &page_symlink_inode_operations;
inode_nohighmem(inode);
inode->i_data.a_ops = &isofs_symlink_aops;
- } else
+ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
/* XXX - parse_rock_ridge_inode() had already set i_rdev. */
init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ } else {
+ printk(KERN_DEBUG "ISOFS: Invalid file type 0%04o for inode %lu.\n",
+ inode->i_mode, inode->i_ino);
+ ret = -EIO;
+ goto fail;
+ }
ret = 0;
out:
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 6d5e76848733..d480b94117cd 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -134,7 +134,7 @@ static __be32 jbd2_superblock_csum(journal_superblock_t *sb)
static void commit_timeout(struct timer_list *t)
{
- journal_t *journal = from_timer(journal, t, j_commit_timer);
+ journal_t *journal = timer_container_of(journal, t, j_commit_timer);
wake_up_process(journal->j_task);
}
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index ef3a1e1b6cb0..fda9f4d6093f 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -425,7 +425,9 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
.totlen = cpu_to_je32(c->cleanmarker_size)
};
- jffs2_prealloc_raw_node_refs(c, jeb, 1);
+ ret = jffs2_prealloc_raw_node_refs(c, jeb, 1);
+ if (ret)
+ goto filebad;
marker.hdr_crc = cpu_to_je32(crc32(0, &marker, sizeof(struct jffs2_unknown_node)-4));
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 13c18ccc13b0..dd3dff95cb24 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -21,12 +21,14 @@
#include <linux/jffs2.h>
#include "nodelist.h"
-static int jffs2_write_end(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata);
-static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata);
+static int jffs2_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata);
+static int jffs2_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata);
static int jffs2_read_folio(struct file *filp, struct folio *folio);
int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
@@ -54,7 +56,7 @@ const struct file_operations jffs2_file_operations =
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
.unlocked_ioctl=jffs2_ioctl,
- .mmap = generic_file_readonly_mmap,
+ .mmap_prepare = generic_file_readonly_mmap_prepare,
.fsync = jffs2_fsync,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
@@ -121,9 +123,10 @@ static int jffs2_read_folio(struct file *file, struct folio *folio)
return ret;
}
-static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int jffs2_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
struct folio *folio;
struct inode *inode = mapping->host;
@@ -235,9 +238,10 @@ out_err:
return ret;
}
-static int jffs2_write_end(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+static int jffs2_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
/* Actually commit the write from the page cache page we're looking at.
* For now, we write the full page out each time. It sucks, but it's simple
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 29671e33a171..62879c218d4b 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -256,7 +256,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
jffs2_dbg(1, "%s(): Skipping %d bytes in nextblock to ensure page alignment\n",
__func__, skip);
- jffs2_prealloc_raw_node_refs(c, c->nextblock, 1);
+ ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1);
+ if (ret)
+ goto out;
jffs2_scan_dirty_space(c, c->nextblock, skip);
}
#endif
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 4fe64519870f..d83372d3e1a0 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -858,7 +858,10 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c)
spin_unlock(&c->erase_completion_lock);
jeb = c->nextblock;
- jffs2_prealloc_raw_node_refs(c, jeb, 1);
+ ret = jffs2_prealloc_raw_node_refs(c, jeb, 1);
+
+ if (ret)
+ goto out;
if (!c->summary->sum_num || !c->summary->sum_list_head) {
JFFS2_WARNING("Empty summary info!!!\n");
@@ -872,6 +875,8 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c)
datasize += padsize;
ret = jffs2_sum_write_data(c, jeb, infosize, datasize, padsize);
+
+out:
spin_lock(&c->erase_completion_lock);
return ret;
}
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 01b6912e60f8..2a4a288b821c 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -44,6 +44,9 @@ static int jfs_open(struct inode *inode, struct file *file)
{
int rc;
+ if (S_ISREG(inode->i_mode) && inode->i_size < 0)
+ return -EIO;
+
if ((rc = dquot_file_open(inode, file)))
return rc;
@@ -143,7 +146,7 @@ const struct file_operations jfs_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
.fsync = jfs_fsync,
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 60fc92dee24d..fcedeb514e14 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -145,9 +145,9 @@ void jfs_evict_inode(struct inode *inode)
if (!inode->i_nlink && !is_bad_inode(inode)) {
dquot_initialize(inode);
+ truncate_inode_pages_final(&inode->i_data);
if (JFS_IP(inode)->fileset == FILESYSTEM_I) {
struct inode *ipimap = JFS_SBI(inode->i_sb)->ipimap;
- truncate_inode_pages_final(&inode->i_data);
if (test_cflag(COMMIT_Freewmap, inode))
jfs_free_zero_link(inode);
@@ -290,9 +290,10 @@ static void jfs_write_failed(struct address_space *mapping, loff_t to)
}
}
-static int jfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int jfs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
int ret;
@@ -303,13 +304,14 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping,
return ret;
}
-static int jfs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied, struct folio *folio,
- void *fsdata)
+static int jfs_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
int ret;
- ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
if (ret < len)
jfs_write_failed(mapping, pos + len);
return ret;
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index f7bd7e8f5be4..563f148be8af 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -57,7 +57,7 @@ static long jfs_map_ext2(unsigned long flags, int from)
return mapped;
}
-int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int jfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct jfs_inode_info *jfs_inode = JFS_IP(d_inode(dentry));
unsigned int flags = jfs_inode->mode2 & JFS_FL_USER_VISIBLE;
@@ -71,7 +71,7 @@ int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
int jfs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct jfs_inode_info *jfs_inode = JFS_IP(inode);
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 35e063c9f3a4..cdfa699cd7c8 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -1389,6 +1389,12 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
(1 << (L2LPERCTL - (bmp->db_agheight << 1))) / bmp->db_agwidth;
ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1));
+ if (ti < 0 || ti >= le32_to_cpu(dcp->nleafs)) {
+ jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n");
+ release_metapage(mp);
+ return -EIO;
+ }
+
/* dmap control page trees fan-out by 4 and a single allocation
* group may be described by 1 or 2 subtrees within the ag level
* dmap control page, depending upon the ag size. examine the ag's
@@ -1809,8 +1815,10 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
return -EIO;
dp = (struct dmap *) mp->data;
- if (dp->tree.budmin < 0)
+ if (dp->tree.budmin < 0) {
+ release_metapage(mp);
return -EIO;
+ }
/* try to allocate the blocks.
*/
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index ea80661597ac..2c6c81c8cb9f 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -9,9 +9,9 @@ struct fid;
extern struct inode *ialloc(struct inode *, umode_t);
extern int jfs_fsync(struct file *, loff_t, loff_t, int);
-extern int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+extern int jfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
extern int jfs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
extern struct inode *jfs_iget(struct super_block *, unsigned long);
extern int jfs_commit_inode(struct inode *, int);
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index df575a873ec6..b98cf3bb6c1f 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -15,6 +15,7 @@
#include <linux/mempool.h>
#include <linux/seq_file.h>
#include <linux/writeback.h>
+#include <linux/migrate.h>
#include "jfs_incore.h"
#include "jfs_superblock.h"
#include "jfs_filsys.h"
@@ -151,7 +152,59 @@ static inline void dec_io(struct folio *folio, blk_status_t status,
handler(folio, anchor->status);
}
+#ifdef CONFIG_MIGRATION
+static int __metapage_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src,
+ enum migrate_mode mode)
+{
+ struct meta_anchor *src_anchor = src->private;
+ struct metapage *mps[MPS_PER_PAGE] = {0};
+ struct metapage *mp;
+ int i, rc;
+
+ for (i = 0; i < MPS_PER_PAGE; i++) {
+ mp = src_anchor->mp[i];
+ if (mp && metapage_locked(mp))
+ return -EAGAIN;
+ }
+
+ rc = filemap_migrate_folio(mapping, dst, src, mode);
+ if (rc != MIGRATEPAGE_SUCCESS)
+ return rc;
+
+ for (i = 0; i < MPS_PER_PAGE; i++) {
+ mp = src_anchor->mp[i];
+ if (!mp)
+ continue;
+ if (unlikely(insert_metapage(dst, mp))) {
+			/* If error, roll back previously inserted pages */
+ for (int j = 0 ; j < i; j++) {
+ if (mps[j])
+ remove_metapage(dst, mps[j]);
+ }
+ return -EAGAIN;
+ }
+ mps[i] = mp;
+ }
+
+ /* Update the metapage and remove it from src */
+ for (i = 0; i < MPS_PER_PAGE; i++) {
+ mp = mps[i];
+ if (mp) {
+ int page_offset = mp->data - folio_address(src);
+
+ mp->data = folio_address(dst) + page_offset;
+ mp->folio = dst;
+ remove_metapage(src, mp);
+ }
+ }
+
+ return MIGRATEPAGE_SUCCESS;
+}
+#endif /* CONFIG_MIGRATION */
+
#else
+
static inline struct metapage *folio_to_mp(struct folio *folio, int offset)
{
return folio->private;
@@ -175,6 +228,35 @@ static inline void remove_metapage(struct folio *folio, struct metapage *mp)
#define inc_io(folio) do {} while(0)
#define dec_io(folio, status, handler) handler(folio, status)
+#ifdef CONFIG_MIGRATION
+static int __metapage_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src,
+ enum migrate_mode mode)
+{
+ struct metapage *mp;
+ int page_offset;
+ int rc;
+
+ mp = folio_to_mp(src, 0);
+ if (metapage_locked(mp))
+ return -EAGAIN;
+
+ rc = filemap_migrate_folio(mapping, dst, src, mode);
+ if (rc != MIGRATEPAGE_SUCCESS)
+ return rc;
+
+ if (unlikely(insert_metapage(dst, mp)))
+ return -EAGAIN;
+
+ page_offset = mp->data - folio_address(src);
+ mp->data = folio_address(dst) + page_offset;
+ mp->folio = dst;
+ remove_metapage(src, mp);
+
+ return MIGRATEPAGE_SUCCESS;
+}
+#endif /* CONFIG_MIGRATION */
+
#endif
static inline struct metapage *alloc_metapage(gfp_t gfp_mask)
@@ -339,7 +421,7 @@ static void metapage_write_end_io(struct bio *bio)
}
static int metapage_write_folio(struct folio *folio,
- struct writeback_control *wbc, void *unused)
+ struct writeback_control *wbc)
{
struct bio *bio = NULL;
int block_offset; /* block offset of mp within page */
@@ -468,10 +550,12 @@ static int metapage_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct blk_plug plug;
+ struct folio *folio = NULL;
int err;
blk_start_plug(&plug);
- err = write_cache_pages(mapping, wbc, metapage_write_folio, NULL);
+ while ((folio = writeback_iter(mapping, wbc, folio, &err)))
+ err = metapage_write_folio(folio, wbc);
blk_finish_plug(&plug);
return err;
@@ -554,6 +638,29 @@ static bool metapage_release_folio(struct folio *folio, gfp_t gfp_mask)
return ret;
}
+#ifdef CONFIG_MIGRATION
+/*
+ * metapage_migrate_folio - Migration function for JFS metapages
+ */
+static int metapage_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src,
+ enum migrate_mode mode)
+{
+ int expected_count;
+
+ if (!src->private)
+ return filemap_migrate_folio(mapping, dst, src, mode);
+
+	/* Check that the folio has no extra refs before we do more work */
+ expected_count = folio_expected_ref_count(src) + 1;
+ if (folio_ref_count(src) != expected_count)
+ return -EAGAIN;
+ return __metapage_migrate_folio(mapping, dst, src, mode);
+}
+#else
+#define metapage_migrate_folio NULL
+#endif /* CONFIG_MIGRATION */
+
static void metapage_invalidate_folio(struct folio *folio, size_t offset,
size_t length)
{
@@ -570,6 +677,7 @@ const struct address_space_operations jfs_metapage_aops = {
.release_folio = metapage_release_folio,
.invalidate_folio = metapage_invalidate_folio,
.dirty_folio = filemap_dirty_folio,
+ .migrate_folio = metapage_migrate_folio,
};
struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
@@ -707,7 +815,7 @@ static int metapage_write_one(struct folio *folio)
if (folio_clear_dirty_for_io(folio)) {
folio_get(folio);
- ret = metapage_write_folio(folio, &wbc, NULL);
+ ret = metapage_write_folio(folio, &wbc);
if (ret == 0)
folio_wait_writeback(folio);
folio_put(folio);
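
metapage_writepages() is one of several conversions here from the write_cache_pages() callback style to an open-coded writeback_iter() loop; iomap_writepages() above follows the same shape. The bare pattern, assuming the per-folio writer unlocks the folio as metapage_write_folio() does (my_write_folio is hypothetical):

	struct folio *folio = NULL;
	int err = 0;

	while ((folio = writeback_iter(mapping, wbc, folio, &err)))
		err = my_write_folio(folio, wbc);	/* hypothetical writer */
	return err;
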
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 5ee618d17e77..28c3cf960c6f 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -49,26 +49,6 @@
#define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot)
-/* get page buffer for specified block address */
-/* ToDo: Replace this ugly macro with a function */
-#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC) \
-do { \
- BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot); \
- if (!(RC)) { \
- if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) || \
- (le16_to_cpu((P)->header.nextindex) > \
- le16_to_cpu((P)->header.maxentry)) || \
- (le16_to_cpu((P)->header.maxentry) > \
- (((BN) == 0) ? XTROOTMAXSLOT : PSIZE >> L2XTSLOTSIZE))) { \
- jfs_error((IP)->i_sb, \
- "XT_GETPAGE: xtree page corrupt\n"); \
- BT_PUTPAGE(MP); \
- MP = NULL; \
- RC = -EIO; \
- } \
- } \
-} while (0)
-
/* for consistency */
#define XT_PUTPAGE(MP) BT_PUTPAGE(MP)
@@ -115,6 +95,42 @@ static int xtSplitRoot(tid_t tid, struct inode *ip,
struct xtsplit * split, struct metapage ** rmpp);
/*
+ * xt_getpage()
+ *
+ * function: get the page buffer for a specified block address.
+ *
+ * parameters:
+ * ip - pointer to the inode
+ * bn - block number (s64) of the xtree page to be retrieved;
+ * mp - pointer to a metapage pointer where the page buffer is returned;
+ *
+ * returns:
+ *	A pointer to the xtree page (xtpage_t) on success, or an ERR_PTR() on error.
+ */
+
+static inline xtpage_t *xt_getpage(struct inode *ip, s64 bn, struct metapage **mp)
+{
+ xtpage_t *p;
+ int rc;
+
+ BT_GETPAGE(ip, bn, *mp, xtpage_t, PSIZE, p, rc, i_xtroot);
+
+ if (rc)
+ return ERR_PTR(rc);
+ if ((le16_to_cpu(p->header.nextindex) < XTENTRYSTART) ||
+ (le16_to_cpu(p->header.nextindex) >
+ le16_to_cpu(p->header.maxentry)) ||
+ (le16_to_cpu(p->header.maxentry) >
+ ((bn == 0) ? XTROOTMAXSLOT : PSIZE >> L2XTSLOTSIZE))) {
+ jfs_error(ip->i_sb, "xt_getpage: xtree page corrupt\n");
+ BT_PUTPAGE(*mp);
+ *mp = NULL;
+ return ERR_PTR(-EIO);
+ }
+ return p;
+}
+
+/*
* xtLookup()
*
* function: map a single page into a physical extent;
@@ -216,7 +232,6 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
int *cmpp, struct btstack * btstack, int flag)
{
struct jfs_inode_info *jfs_ip = JFS_IP(ip);
- int rc = 0;
int cmp = 1; /* init for empty page */
s64 bn; /* block number */
struct metapage *mp; /* page buffer */
@@ -252,9 +267,9 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
*/
for (bn = 0;;) {
/* get/pin the page to search */
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
+ p = xt_getpage(ip, bn, &mp);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
/* try sequential access heuristics with the previous
* access entry in target leaf page:
@@ -807,10 +822,10 @@ xtSplitUp(tid_t tid,
* insert router entry in parent for new right child page <rp>
*/
/* get/pin the parent page <sp> */
- XT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc);
- if (rc) {
+ sp = xt_getpage(ip, parent->bn, &smp);
+ if (IS_ERR(sp)) {
XT_PUTPAGE(rcmp);
- return rc;
+ return PTR_ERR(sp);
}
/*
@@ -1062,10 +1077,10 @@ xtSplitPage(tid_t tid, struct inode *ip,
* update previous pointer of old next/right page of <sp>
*/
if (nextbn != 0) {
- XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
- if (rc) {
+ p = xt_getpage(ip, nextbn, &mp);
+ if (IS_ERR(p)) {
XT_PUTPAGE(rmp);
- goto clean_up;
+ return PTR_ERR(p);
}
BT_MARK_DIRTY(mp, ip);
@@ -1417,9 +1432,9 @@ int xtExtend(tid_t tid, /* transaction id */
return rc;
/* get back old page */
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
+ p = xt_getpage(ip, bn, &mp);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
/*
* if leaf root has been split, original root has been
* copied to new child page, i.e., original entry now
@@ -1433,9 +1448,9 @@ int xtExtend(tid_t tid, /* transaction id */
XT_PUTPAGE(mp);
/* get new child page */
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
+ p = xt_getpage(ip, bn, &mp);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
BT_MARK_DIRTY(mp, ip);
if (!test_cflag(COMMIT_Nolink, ip)) {
@@ -1711,9 +1726,9 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
return rc;
/* get back old page */
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
+ p = xt_getpage(ip, bn, &mp);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
/*
* if leaf root has been split, original root has been
* copied to new child page, i.e., original entry now
@@ -1727,9 +1742,9 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
XT_PUTPAGE(mp);
/* get new child page */
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
+ p = xt_getpage(ip, bn, &mp);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
BT_MARK_DIRTY(mp, ip);
if (!test_cflag(COMMIT_Nolink, ip)) {
@@ -1788,9 +1803,9 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
XT_PUTPAGE(mp);
/* get new right page */
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
+ p = xt_getpage(ip, bn, &mp);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
BT_MARK_DIRTY(mp, ip);
if (!test_cflag(COMMIT_Nolink, ip)) {
@@ -1864,9 +1879,9 @@ printf("xtUpdate.updateLeft.split p:0x%p\n", p);
return rc;
/* get back old page */
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
+ p = xt_getpage(ip, bn, &mp);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
/*
* if leaf root has been split, original root has been
@@ -1881,9 +1896,9 @@ printf("xtUpdate.updateLeft.split p:0x%p\n", p);
XT_PUTPAGE(mp);
/* get new child page */
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
+ p = xt_getpage(ip, bn, &mp);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
BT_MARK_DIRTY(mp, ip);
if (!test_cflag(COMMIT_Nolink, ip)) {
@@ -2187,7 +2202,6 @@ void xtInitRoot(tid_t tid, struct inode *ip)
*/
s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
{
- int rc = 0;
s64 teof;
struct metapage *mp;
xtpage_t *p;
@@ -2268,9 +2282,9 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
* first access of each page:
*/
getPage:
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
+ p = xt_getpage(ip, bn, &mp);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
/* process entries backward from last index */
index = le16_to_cpu(p->header.nextindex) - 1;
@@ -2506,9 +2520,9 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
/* get back the parent page */
bn = parent->bn;
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
+ p = xt_getpage(ip, bn, &mp);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
index = parent->index;
@@ -2791,9 +2805,9 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
* first access of each page:
*/
getPage:
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
+ p = xt_getpage(ip, bn, &mp);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
/* process entries backward from last index */
index = le16_to_cpu(p->header.nextindex) - 1;
@@ -2836,9 +2850,9 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
/* get back the parent page */
bn = parent->bn;
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
+ p = xt_getpage(ip, bn, &mp);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
index = parent->index;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 10368c188c5e..3cfb86c5a36e 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -542,7 +542,7 @@ static int jfs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_magic = JFS_SUPER_MAGIC;
if (sbi->mntflag & JFS_OS2)
- sb->s_d_op = &jfs_ci_dentry_operations;
+ set_default_d_op(sb, &jfs_ci_dentry_operations);
inode = jfs_iget(sb, ROOT_I);
if (IS_ERR(inode)) {
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index b83054da68b3..3c293a5a21b1 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -24,45 +24,46 @@ static const struct inode_operations kernfs_iops = {
.listxattr = kernfs_iop_listxattr,
};
-static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, int alloc)
+static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, bool alloc)
{
- static DEFINE_MUTEX(iattr_mutex);
- struct kernfs_iattrs *ret;
+ struct kernfs_iattrs *ret __free(kfree) = NULL;
+ struct kernfs_iattrs *attr;
- mutex_lock(&iattr_mutex);
+ attr = READ_ONCE(kn->iattr);
+ if (attr || !alloc)
+ return attr;
- if (kn->iattr || !alloc)
- goto out_unlock;
-
- kn->iattr = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL);
- if (!kn->iattr)
- goto out_unlock;
+ ret = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL);
+ if (!ret)
+ return NULL;
/* assign default attributes */
- kn->iattr->ia_uid = GLOBAL_ROOT_UID;
- kn->iattr->ia_gid = GLOBAL_ROOT_GID;
-
- ktime_get_real_ts64(&kn->iattr->ia_atime);
- kn->iattr->ia_mtime = kn->iattr->ia_atime;
- kn->iattr->ia_ctime = kn->iattr->ia_atime;
-
- simple_xattrs_init(&kn->iattr->xattrs);
- atomic_set(&kn->iattr->nr_user_xattrs, 0);
- atomic_set(&kn->iattr->user_xattr_size, 0);
-out_unlock:
- ret = kn->iattr;
- mutex_unlock(&iattr_mutex);
- return ret;
+ ret->ia_uid = GLOBAL_ROOT_UID;
+ ret->ia_gid = GLOBAL_ROOT_GID;
+
+ ktime_get_real_ts64(&ret->ia_atime);
+ ret->ia_mtime = ret->ia_atime;
+ ret->ia_ctime = ret->ia_atime;
+
+ simple_xattrs_init(&ret->xattrs);
+ atomic_set(&ret->nr_user_xattrs, 0);
+ atomic_set(&ret->user_xattr_size, 0);
+
+ /* If someone raced us, recognize it. */
+ if (!try_cmpxchg(&kn->iattr, &attr, ret))
+ return READ_ONCE(kn->iattr);
+
+ return no_free_ptr(ret);
}
static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
{
- return __kernfs_iattrs(kn, 1);
+ return __kernfs_iattrs(kn, true);
}
static struct kernfs_iattrs *kernfs_iattrs_noalloc(struct kernfs_node *kn)
{
- return __kernfs_iattrs(kn, 0);
+ return __kernfs_iattrs(kn, false);
}
int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
@@ -141,9 +142,9 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
struct kernfs_node *kn = kernfs_dentry_node(dentry);
struct kernfs_iattrs *attrs;
- attrs = kernfs_iattrs(kn);
+ attrs = kernfs_iattrs_noalloc(kn);
if (!attrs)
- return -ENOMEM;
+ return -ENODATA;
return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size);
}
@@ -166,9 +167,10 @@ static inline void set_inode_attr(struct inode *inode,
static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode)
{
- struct kernfs_iattrs *attrs = kn->iattr;
+ struct kernfs_iattrs *attrs;
inode->i_mode = kn->mode;
+ attrs = kernfs_iattrs_noalloc(kn);
if (attrs)
/*
* kernfs_node has non-default attributes get them from
@@ -306,7 +308,9 @@ int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
const void *value, size_t size, int flags)
{
struct simple_xattr *old_xattr;
- struct kernfs_iattrs *attrs = kernfs_iattrs(kn);
+ struct kernfs_iattrs *attrs;
+
+ attrs = kernfs_iattrs(kn);
if (!attrs)
return -ENOMEM;
@@ -345,8 +349,9 @@ static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn,
struct simple_xattrs *xattrs,
const void *value, size_t size, int flags)
{
- atomic_t *sz = &kn->iattr->user_xattr_size;
- atomic_t *nr = &kn->iattr->nr_user_xattrs;
+ struct kernfs_iattrs *attr = kernfs_iattrs_noalloc(kn);
+ atomic_t *sz = &attr->user_xattr_size;
+ atomic_t *nr = &attr->nr_user_xattrs;
struct simple_xattr *old_xattr;
int ret;
@@ -384,8 +389,9 @@ static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn,
struct simple_xattrs *xattrs,
const void *value, size_t size, int flags)
{
- atomic_t *sz = &kn->iattr->user_xattr_size;
- atomic_t *nr = &kn->iattr->nr_user_xattrs;
+ struct kernfs_iattrs *attr = kernfs_iattrs_noalloc(kn);
+ atomic_t *sz = &attr->user_xattr_size;
+ atomic_t *nr = &attr->nr_user_xattrs;
struct simple_xattr *old_xattr;
old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags);
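
The __kernfs_iattrs() rework replaces the global iattr_mutex with an allocate-then-publish scheme: build a fully initialised candidate off to the side, publish it with try_cmpxchg(), and discard it when another thread wins the race (the patch uses the __free(kfree)/no_free_ptr() cleanup helpers for the discard). The pattern in isolation, with illustrative names rather than kernfs API:

struct foo { /* payload */ };
struct obj { struct foo *foo; };	/* illustrative container */

struct foo *foo_get_or_alloc(struct obj *o)
{
	struct foo *old = READ_ONCE(o->foo);
	struct foo *new;

	if (old)
		return old;

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return NULL;
	/* ... initialise *new before it becomes visible ... */

	/* On failure, try_cmpxchg() rewrites 'old' with the winner. */
	if (!try_cmpxchg(&o->foo, &old, new)) {
		kfree(new);
		return old;
	}
	return new;
}
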
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index c1719b5778a1..e384a69fbece 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -318,7 +318,7 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k
return -ENOMEM;
}
sb->s_root = root;
- sb->s_d_op = &kernfs_dops;
+ set_default_d_op(sb, &kernfs_dops);
return 0;
}
diff --git a/fs/libfs.c b/fs/libfs.c
index 9ea0ecc325a8..ce8c496a6940 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -62,11 +62,6 @@ int always_delete_dentry(const struct dentry *dentry)
}
EXPORT_SYMBOL(always_delete_dentry);
-const struct dentry_operations simple_dentry_operations = {
- .d_delete = always_delete_dentry,
-};
-EXPORT_SYMBOL(simple_dentry_operations);
-
/*
* Lookup the data. This is trivial - if the dentry didn't already
* exist, we know it is negative. Set d_op to delete negative dentries.
@@ -75,9 +70,11 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned
{
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
- if (!dentry->d_sb->s_d_op)
- d_set_d_op(dentry, &simple_dentry_operations);
-
+ if (!dentry->d_op && !(dentry->d_flags & DCACHE_DONTCACHE)) {
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags |= DCACHE_DONTCACHE;
+ spin_unlock(&dentry->d_lock);
+ }
if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
return NULL;
@@ -605,15 +602,16 @@ struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
}
EXPORT_SYMBOL(find_next_child);
-void simple_recursive_removal(struct dentry *dentry,
- void (*callback)(struct dentry *))
+static void __simple_recursive_removal(struct dentry *dentry,
+ void (*callback)(struct dentry *),
+ bool locked)
{
struct dentry *this = dget(dentry);
while (true) {
struct dentry *victim = NULL, *child;
struct inode *inode = this->d_inode;
- inode_lock(inode);
+ inode_lock_nested(inode, I_MUTEX_CHILD);
if (d_is_dir(this))
inode->i_flags |= S_DEAD;
while ((child = find_next_child(this, victim)) == NULL) {
@@ -625,15 +623,13 @@ void simple_recursive_removal(struct dentry *dentry,
victim = this;
this = this->d_parent;
inode = this->d_inode;
- inode_lock(inode);
+ if (!locked || victim != dentry)
+ inode_lock_nested(inode, I_MUTEX_CHILD);
if (simple_positive(victim)) {
d_invalidate(victim); // avoid lost mounts
- if (d_is_dir(victim))
- fsnotify_rmdir(inode, victim);
- else
- fsnotify_unlink(inode, victim);
if (callback)
callback(victim);
+ fsnotify_delete(inode, d_inode(victim), victim);
dput(victim); // unpin it
}
if (victim == dentry) {
@@ -641,7 +637,8 @@ void simple_recursive_removal(struct dentry *dentry,
inode_set_ctime_current(inode));
if (d_is_dir(dentry))
drop_nlink(inode);
- inode_unlock(inode);
+ if (!locked)
+ inode_unlock(inode);
dput(dentry);
return;
}
@@ -650,8 +647,22 @@ void simple_recursive_removal(struct dentry *dentry,
this = child;
}
}
+
+void simple_recursive_removal(struct dentry *dentry,
+ void (*callback)(struct dentry *))
+{
+ return __simple_recursive_removal(dentry, callback, false);
+}
EXPORT_SYMBOL(simple_recursive_removal);
+/* caller holds parent directory with I_MUTEX_PARENT */
+void locked_recursive_removal(struct dentry *dentry,
+ void (*callback)(struct dentry *))
+{
+ return __simple_recursive_removal(dentry, callback, true);
+}
+EXPORT_SYMBOL(locked_recursive_removal);
+
static const struct super_operations simple_super_operations = {
.statfs = simple_statfs,
};
@@ -684,7 +695,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
s->s_root = d_make_root(root);
if (!s->s_root)
return -ENOMEM;
- s->s_d_op = ctx->dops;
+ set_default_d_op(s, ctx->dops);
return 0;
}
@@ -910,7 +921,7 @@ static int simple_read_folio(struct file *file, struct folio *folio)
return 0;
}
-int simple_write_begin(struct file *file, struct address_space *mapping,
+int simple_write_begin(const struct kiocb *iocb, struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
@@ -935,7 +946,7 @@ EXPORT_SYMBOL(simple_write_begin);
/**
* simple_write_end - .write_end helper for non-block-device FSes
- * @file: See .write_end of address_space_operations
+ * @iocb: kernel I/O control block
* @mapping: "
* @pos: "
* @len: "
@@ -946,7 +957,8 @@ EXPORT_SYMBOL(simple_write_begin);
* simple_write_end does the minimum needed for updating a folio after
* writing is done. It has the same API signature as the .write_end of
* address_space_operations vector. So it can just be set onto .write_end for
- * FSes that don't need any other processing. i_mutex is assumed to be held.
+ * FSes that don't need any other processing. i_rwsem is assumed to be held
+ * exclusively.
* Block based filesystems should use generic_write_end().
* NOTE: Even though i_size might get updated by this function, mark_inode_dirty
* is not called, so a filesystem that actually does store data in .write_inode
@@ -955,9 +967,10 @@ EXPORT_SYMBOL(simple_write_begin);
*
* Use *ONLY* with simple_read_folio()
*/
-static int simple_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+static int simple_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
struct inode *inode = folio->mapping->host;
loff_t last_pos = pos + copied;
@@ -973,7 +986,7 @@ static int simple_write_end(struct file *file, struct address_space *mapping,
}
/*
* No need to use i_size_read() here, the i_size
- * cannot change under us because we hold the i_mutex.
+ * cannot change under us because we hold the i_rwsem.
*/
if (last_pos > inode->i_size)
i_size_write(inode, last_pos);
@@ -1583,13 +1596,17 @@ EXPORT_SYMBOL(generic_file_fsync);
int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
{
u64 last_fs_block = num_blocks - 1;
- u64 last_fs_page =
- last_fs_block >> (PAGE_SHIFT - blocksize_bits);
+ u64 last_fs_page, max_bytes;
+
+ if (check_shl_overflow(num_blocks, blocksize_bits, &max_bytes))
+ return -EFBIG;
+
+ last_fs_page = (max_bytes >> PAGE_SHIFT) - 1;
if (unlikely(num_blocks == 0))
return 0;
- if ((blocksize_bits < 9) || (blocksize_bits > PAGE_SHIFT))
+ if (blocksize_bits < 9)
return -EINVAL;
if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
@@ -1649,12 +1666,10 @@ struct inode *alloc_anon_inode(struct super_block *s)
*/
inode->i_state = I_DIRTY;
/*
- * Historically anonymous inodes didn't have a type at all and
- * userspace has come to rely on this. Internally they're just
- * regular files but S_IFREG is masked off when reporting
- * information to userspace.
+ * Historically anonymous inodes don't have a type at all and
+ * userspace has come to rely on this.
*/
- inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
+ inode->i_mode = S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_flags |= S_PRIVATE | S_ANON_INODE;
@@ -1950,22 +1965,22 @@ static const struct dentry_operations generic_encrypted_dentry_ops = {
* @sb: superblock to be configured
*
* Filesystems supporting casefolding and/or fscrypt can call this
- * helper at mount-time to configure sb->s_d_op to best set of dentry
- * operations required for the enabled features. The helper must be
- * called after these have been configured, but before the root dentry
- * is created.
+ * helper at mount-time to configure the default dentry_operations to
+ * the best set of dentry operations required for the enabled features.
+ * The helper must be called after these have been configured, but
+ * before the root dentry is created.
*/
void generic_set_sb_d_ops(struct super_block *sb)
{
#if IS_ENABLED(CONFIG_UNICODE)
if (sb->s_encoding) {
- sb->s_d_op = &generic_ci_dentry_ops;
+ set_default_d_op(sb, &generic_ci_dentry_ops);
return;
}
#endif
#ifdef CONFIG_FS_ENCRYPTION
if (sb->s_cop) {
- sb->s_d_op = &generic_encrypted_dentry_ops;
+ set_default_d_op(sb, &generic_encrypted_dentry_ops);
return;
}
#endif
@@ -2128,6 +2143,8 @@ struct dentry *stashed_dentry_get(struct dentry **stashed)
dentry = rcu_dereference(*stashed);
if (!dentry)
return NULL;
+ if (IS_ERR(dentry))
+ return dentry;
if (!lockref_get_not_dead(&dentry->d_lockref))
return NULL;
return dentry;
@@ -2160,7 +2177,6 @@ static struct dentry *prepare_anon_dentry(struct dentry **stashed,
/* Notice when this is changed. */
WARN_ON_ONCE(!S_ISREG(inode->i_mode));
- WARN_ON_ONCE(!IS_IMMUTABLE(inode));
dentry = d_alloc_anon(sb);
if (!dentry) {
@@ -2176,8 +2192,7 @@ static struct dentry *prepare_anon_dentry(struct dentry **stashed,
return dentry;
}
-static struct dentry *stash_dentry(struct dentry **stashed,
- struct dentry *dentry)
+struct dentry *stash_dentry(struct dentry **stashed, struct dentry *dentry)
{
guard(rcu)();
for (;;) {
@@ -2218,14 +2233,16 @@ static struct dentry *stash_dentry(struct dentry **stashed,
int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
struct path *path)
{
- struct dentry *dentry;
+ struct dentry *dentry, *res;
const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info;
/* See if dentry can be reused. */
- path->dentry = stashed_dentry_get(stashed);
- if (path->dentry) {
+ res = stashed_dentry_get(stashed);
+ if (IS_ERR(res))
+ return PTR_ERR(res);
+ if (res) {
sops->put_data(data);
- goto out_path;
+ goto make_path;
}
/* Allocate a new dentry. */
@@ -2234,14 +2251,22 @@ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
return PTR_ERR(dentry);
/* Added a new dentry. @data is now owned by the filesystem. */
- path->dentry = stash_dentry(stashed, dentry);
- if (path->dentry != dentry)
+ if (sops->stash_dentry)
+ res = sops->stash_dentry(stashed, dentry);
+ else
+ res = stash_dentry(stashed, dentry);
+ if (IS_ERR(res)) {
+ dput(dentry);
+ return PTR_ERR(res);
+ }
+ if (res != dentry)
dput(dentry);
-out_path:
- WARN_ON_ONCE(path->dentry->d_fsdata != stashed);
- WARN_ON_ONCE(d_inode(path->dentry)->i_private != data);
+make_path:
+ path->dentry = res;
path->mnt = mntget(mnt);
+ VFS_WARN_ON_ONCE(path->dentry->d_fsdata != stashed);
+ VFS_WARN_ON_ONCE(d_inode(path->dentry)->i_private != data);
return 0;
}
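
For reference, roughly what the stash_dentry() loop above has to do: publish the new dentry unless a live one is already stashed, reviving the latter via its lockref. This is a sketch reconstructed from the helpers visible in this hunk, not the exact body:

static struct dentry *stash_dentry_sketch(struct dentry **stashed,
					  struct dentry *dentry)
{
	guard(rcu)();
	for (;;) {
		struct dentry *old = NULL;

		if (try_cmpxchg(stashed, &old, dentry))
			return dentry;		/* slot was empty; ours is in */
		if (!IS_ERR(old) && lockref_get_not_dead(&old->d_lockref))
			return old;		/* reuse the live occupant */
		/* occupant was dead (or an error pointer): replace it */
		if (try_cmpxchg(stashed, &old, dentry))
			return dentry;
	}
}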
@@ -2263,3 +2288,28 @@ void stashed_dentry_prune(struct dentry *dentry)
*/
cmpxchg(stashed, dentry, NULL);
}
+
+/*
+ * On success, returns the new (negative) dentry with the parent's i_rwsem
+ * held exclusive; the caller is responsible for unlocking it. On failure
+ * the lock has already been dropped.
+ */
+struct dentry *simple_start_creating(struct dentry *parent, const char *name)
+{
+ struct dentry *dentry;
+ struct inode *dir = d_inode(parent);
+
+ inode_lock(dir);
+ if (unlikely(IS_DEADDIR(dir))) {
+ inode_unlock(dir);
+ return ERR_PTR(-ENOENT);
+ }
+ dentry = lookup_noperm(&QSTR(name), parent);
+ if (IS_ERR(dentry)) {
+ inode_unlock(dir);
+ return dentry;
+ }
+ if (dentry->d_inode) {
+ dput(dentry);
+ inode_unlock(dir);
+ return ERR_PTR(-EEXIST);
+ }
+ return dentry;
+}
+EXPORT_SYMBOL(simple_start_creating);
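
A hedged usage sketch for the new helper, with "example" as a hypothetical in-kernel filesystem: on success the parent comes back locked with a negative dentry, and the caller instantiates and unlocks.

static int example_create_file(struct dentry *parent, const char *name)
{
	struct dentry *dentry = simple_start_creating(parent, name);
	struct inode *inode;

	if (IS_ERR(dentry))
		return PTR_ERR(dentry);	/* parent already unlocked */

	inode = new_inode(parent->d_sb);
	if (!inode) {
		dput(dentry);
		inode_unlock(d_inode(parent));
		return -ENOMEM;
	}
	inode->i_mode = S_IFREG | 0600;
	d_instantiate(dentry, inode);
	inode_unlock(d_inode(parent));
	return 0;
}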
diff --git a/fs/locks.c b/fs/locks.c
index 1619cddfa7a4..559f02aa4172 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -712,7 +712,7 @@ static void __locks_wake_up_blocks(struct file_lock_core *blocker)
fl->fl_lmops && fl->fl_lmops->lm_notify)
fl->fl_lmops->lm_notify(fl);
else
- locks_wake_up(fl);
+ locks_wake_up_waiter(waiter);
/*
* The setting of flc_blocker to NULL marks the "done"
@@ -1794,7 +1794,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **pr
/*
* In the delegation case we need mutual exclusion with
- * a number of operations that take the i_mutex. We trylock
+ * a number of operations that take the i_rwsem. We trylock
* because delegations are an optional optimization, and if
* there's some chance of a conflict--we'd rather not
* bother, maybe that's a sign this just isn't a good file to
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index dd2a425b41f0..19052fc47e9e 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -45,7 +45,7 @@ static void dir_commit_chunk(struct folio *folio, loff_t pos, unsigned len)
struct address_space *mapping = folio->mapping;
struct inode *dir = mapping->host;
- block_write_end(NULL, mapping, pos, len, len, folio, NULL);
+ block_write_end(pos, len, len, folio);
if (pos+len > dir->i_size) {
i_size_write(dir, pos+len);
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 906d192ab7f3..dca7ac71f049 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -17,7 +17,7 @@ const struct file_operations minix_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.fsync = generic_file_fsync,
.splice_read = filemap_splice_read,
};
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index f007e389d5d2..df9d11479caf 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -442,9 +442,10 @@ static void minix_write_failed(struct address_space *mapping, loff_t to)
}
}
-static int minix_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int minix_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
int ret;
diff --git a/fs/mount.h b/fs/mount.h
index ad7173037924..97737051a8b9 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -44,7 +44,6 @@ struct mountpoint {
struct hlist_node m_hash;
struct dentry *m_dentry;
struct hlist_head m_list;
- int m_count;
};
struct mount {
@@ -70,8 +69,8 @@ struct mount {
struct list_head mnt_list;
struct list_head mnt_expire; /* link in fs-specific expiry list */
struct list_head mnt_share; /* circular list of shared mounts */
- struct list_head mnt_slave_list;/* list of slave mounts */
- struct list_head mnt_slave; /* slave list entry */
+ struct hlist_head mnt_slave_list;/* list of slave mounts */
+ struct hlist_node mnt_slave; /* slave list entry */
struct mount *mnt_master; /* slave is on master->mnt_slave_list */
struct mnt_namespace *mnt_ns; /* containing namespace */
struct mountpoint *mnt_mp; /* where is it mounted */
@@ -79,21 +78,38 @@ struct mount {
struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */
struct hlist_node mnt_umount;
};
- struct list_head mnt_umounting; /* list entry for umount propagation */
#ifdef CONFIG_FSNOTIFY
struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
__u32 mnt_fsnotify_mask;
struct list_head to_notify; /* need to queue notification */
struct mnt_namespace *prev_ns; /* previous namespace (NULL if none) */
#endif
+ int mnt_t_flags; /* namespace_sem-protected flags */
int mnt_id; /* mount identifier, reused */
u64 mnt_id_unique; /* mount ID unique until reboot */
int mnt_group_id; /* peer group identifier */
int mnt_expiry_mark; /* true if marked for expiry */
struct hlist_head mnt_pins;
struct hlist_head mnt_stuck_children;
+ struct mount *overmount; /* mounted on ->mnt_root */
} __randomize_layout;
+enum {
+ T_SHARED = 1, /* mount is shared */
+ T_UNBINDABLE = 2, /* mount is unbindable */
+ T_MARKED = 4, /* internal mark for propagate_... */
+ T_UMOUNT_CANDIDATE = 8, /* for propagate_umount */
+
+ /*
+ * T_SHARED_MASK is the set of flags that should be cleared when a
+ * mount becomes shared. Currently, this is only the flag that says a
+ * mount cannot be bind mounted, since this is how we create a mount
+ * that shares events with another mount. If you add a new T_*
+ * flag, consider how it interacts with shared mounts.
+ */
+ T_SHARED_MASK = T_UNBINDABLE,
+};
+
#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
static inline struct mount *real_mount(struct vfsmount *mnt)
@@ -101,7 +117,7 @@ static inline struct mount *real_mount(struct vfsmount *mnt)
return container_of(mnt, struct mount, mnt);
}
-static inline int mnt_has_parent(struct mount *mnt)
+static inline int mnt_has_parent(const struct mount *mnt)
{
return mnt != mnt->mnt_parent;
}
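
A sketch of how the new namespace_sem-protected flag word is presumably manipulated when a mount becomes shared, per the T_SHARED_MASK comment above (the real helper lives elsewhere in the series; this is illustrative only):

static inline void mark_shared(struct mount *m)
{
	m->mnt_t_flags &= ~T_SHARED_MASK;	/* e.g. drop T_UNBINDABLE */
	m->mnt_t_flags |= T_SHARED;
}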
@@ -146,8 +162,8 @@ struct proc_mounts {
extern const struct seq_operations mounts_op;
-extern bool __is_local_mountpoint(struct dentry *dentry);
-static inline bool is_local_mountpoint(struct dentry *dentry)
+extern bool __is_local_mountpoint(const struct dentry *dentry);
+static inline bool is_local_mountpoint(const struct dentry *dentry)
{
if (!d_mountpoint(dentry))
return false;
@@ -160,6 +176,13 @@ static inline bool is_anon_ns(struct mnt_namespace *ns)
return ns->seq == 0;
}
+static inline bool anon_ns_root(const struct mount *m)
+{
+ struct mnt_namespace *ns = READ_ONCE(m->mnt_ns);
+
+ return !IS_ERR_OR_NULL(ns) && is_anon_ns(ns) && m == ns->root;
+}
+
static inline bool mnt_ns_attached(const struct mount *mnt)
{
return !RB_EMPTY_NODE(&mnt->mnt_node);
@@ -170,7 +193,7 @@ static inline bool mnt_ns_empty(const struct mnt_namespace *ns)
return RB_EMPTY_ROOT(&ns->mounts);
}
-static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
+static inline void move_from_ns(struct mount *mnt)
{
struct mnt_namespace *ns = mnt->mnt_ns;
WARN_ON(!mnt_ns_attached(mnt));
@@ -180,7 +203,6 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
ns->mnt_first_node = rb_next(&mnt->mnt_node);
rb_erase(&mnt->mnt_node, &ns->mounts);
RB_CLEAR_NODE(&mnt->mnt_node);
- list_add_tail(&mnt->mnt_list, dt_list);
}
bool has_locked_children(struct mount *mnt, struct dentry *dentry);
diff --git a/fs/namei.c b/fs/namei.c
index 4bb889fc980b..cd43ff89fbaa 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1012,10 +1012,10 @@ static int set_root(struct nameidata *nd)
unsigned seq;
do {
- seq = read_seqcount_begin(&fs->seq);
+ seq = read_seqbegin(&fs->seq);
nd->root = fs->root;
nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
- } while (read_seqcount_retry(&fs->seq, seq));
+ } while (read_seqretry(&fs->seq, seq));
} else {
get_fs_root(fs, &nd->root);
nd->state |= ND_ROOT_GRABBED;
@@ -1469,7 +1469,7 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
int ret = 0;
while (flags & DCACHE_MANAGED_DENTRY) {
- /* Allow the filesystem to manage the transit without i_mutex
+ /* Allow the filesystem to manage the transit without i_rwsem
* being held. */
if (flags & DCACHE_MANAGE_TRANSIT) {
ret = path->dentry->d_op->d_manage(path, false);
@@ -1665,9 +1665,17 @@ static struct dentry *lookup_dcache(const struct qstr *name,
return dentry;
}
-static struct dentry *lookup_one_qstr_excl_raw(const struct qstr *name,
- struct dentry *base,
- unsigned int flags)
+/*
+ * The parent directory has its inode locked exclusive. This is the one
+ * and only case when ->lookup() gets called on non-in-lookup
+ * dentries - as a matter of fact, this only gets called
+ * when the directory is guaranteed to have no in-lookup children
+ * at all.
+ * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed.
+ * Will return -EEXIST if name is found and LOOKUP_EXCL was passed.
+ */
+struct dentry *lookup_one_qstr_excl(const struct qstr *name,
+ struct dentry *base, unsigned int flags)
{
struct dentry *dentry;
struct dentry *old;
@@ -1675,7 +1683,7 @@ static struct dentry *lookup_one_qstr_excl_raw(const struct qstr *name,
dentry = lookup_dcache(name, base, flags);
if (dentry)
- return dentry;
+ goto found;
/* Don't create child dentry for a dead directory. */
dir = base->d_inode;
@@ -1691,24 +1699,7 @@ static struct dentry *lookup_one_qstr_excl_raw(const struct qstr *name,
dput(dentry);
dentry = old;
}
- return dentry;
-}
-
-/*
- * Parent directory has inode locked exclusive. This is one
- * and only case when ->lookup() gets called on non in-lookup
- * dentries - as the matter of fact, this only gets called
- * when directory is guaranteed to have no in-lookup children
- * at all.
- * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed.
- * Will return -EEXIST if name is found and LOOKUP_EXCL was passed.
- */
-struct dentry *lookup_one_qstr_excl(const struct qstr *name,
- struct dentry *base, unsigned int flags)
-{
- struct dentry *dentry;
-
- dentry = lookup_one_qstr_excl_raw(name, base, flags);
+found:
if (IS_ERR(dentry))
return dentry;
if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) {
@@ -2580,11 +2571,11 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
unsigned seq;
do {
- seq = read_seqcount_begin(&fs->seq);
+ seq = read_seqbegin(&fs->seq);
nd->path = fs->pwd;
nd->inode = nd->path.dentry->d_inode;
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
- } while (read_seqcount_retry(&fs->seq, seq));
+ } while (read_seqretry(&fs->seq, seq));
} else {
get_fs_pwd(current->fs, &nd->path);
nd->inode = nd->path.dentry->d_inode;
@@ -2790,7 +2781,7 @@ struct dentry *kern_path_locked_negative(const char *name, struct path *path)
if (unlikely(type != LAST_NORM))
return ERR_PTR(-EINVAL);
inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT);
- d = lookup_one_qstr_excl_raw(&last, parent_path.dentry, 0);
+ d = lookup_one_qstr_excl(&last, parent_path.dentry, LOOKUP_CREATE);
if (IS_ERR(d)) {
inode_unlock(parent_path.dentry->d_inode);
return d;
@@ -2917,7 +2908,8 @@ static int lookup_one_common(struct mnt_idmap *idmap,
* @base: base directory to lookup from
*
* Look up a dentry by name in the dcache, returning NULL if it does not
- * currently exist. The function does not try to create a dentry.
+ * currently exist. The function does not try to create a dentry and,
+ * if one is found, it doesn't try to revalidate it.
*
* Note that this routine is purely a helper for filesystem usage and should
* not be called by generic code. It does no permission checking.
@@ -2933,7 +2925,7 @@ struct dentry *try_lookup_noperm(struct qstr *name, struct dentry *base)
if (err)
return ERR_PTR(err);
- return lookup_dcache(name, base, 0);
+ return d_lookup(base, name);
}
EXPORT_SYMBOL(try_lookup_noperm);
@@ -2945,7 +2937,7 @@ EXPORT_SYMBOL(try_lookup_noperm);
* Note that this routine is purely a helper for filesystem usage and should
* not be called by generic code. It does no permission checking.
*
- * The caller must hold base->i_mutex.
+ * The caller must hold base->i_rwsem.
*/
struct dentry *lookup_noperm(struct qstr *name, struct dentry *base)
{
@@ -2971,7 +2963,7 @@ EXPORT_SYMBOL(lookup_noperm);
*
* This can be used for in-kernel filesystem clients such as file servers.
*
- * The caller must hold base->i_mutex.
+ * The caller must hold base->i_rwsem.
*/
struct dentry *lookup_one(struct mnt_idmap *idmap, struct qstr *name,
struct dentry *base)
@@ -3057,14 +3049,22 @@ EXPORT_SYMBOL(lookup_one_positive_unlocked);
* Note that this routine is purely a helper for filesystem usage and should
* not be called by generic code. It does no permission checking.
*
- * Unlike lookup_noperm, it should be called without the parent
+ * Unlike lookup_noperm(), it should be called without the parent
* i_rwsem held, and will take the i_rwsem itself if necessary.
+ *
+ * Unlike try_lookup_noperm() it *does* revalidate the dentry if it already
+ * existed.
*/
struct dentry *lookup_noperm_unlocked(struct qstr *name, struct dentry *base)
{
struct dentry *ret;
+ int err;
- ret = try_lookup_noperm(name, base);
+ err = lookup_noperm_common(name, base);
+ if (err)
+ return ERR_PTR(err);
+
+ ret = lookup_dcache(name, base, 0);
if (!ret)
ret = lookup_slow(name, base, 0);
return ret;
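
To make the division of labour concrete: try_lookup_noperm() is now a pure dcache probe, while lookup_noperm_unlocked() both revalidates and falls back to a slow lookup. A hedged sketch of a caller that wants a revalidated, positive dentry without holding the parent's i_rwsem:

static struct dentry *find_positive_child(struct dentry *parent,
					  const char *name)
{
	struct dentry *child = lookup_noperm_unlocked(&QSTR(name), parent);

	if (IS_ERR(child))
		return child;
	if (d_is_negative(child)) {	/* name does not exist */
		dput(child);
		return ERR_PTR(-ENOENT);
	}
	return child;
}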
@@ -3471,7 +3471,7 @@ static int may_open(struct mnt_idmap *idmap, const struct path *path,
return -EACCES;
break;
default:
- VFS_BUG_ON_INODE(1, inode);
+ VFS_BUG_ON_INODE(!IS_ANON_FILE(inode), inode);
}
error = inode_permission(idmap, inode, MAY_OPEN | acc_mode);
@@ -4542,13 +4542,13 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
* @dentry: victim
* @delegated_inode: returns victim inode, if the inode is delegated.
*
- * The caller must hold dir->i_mutex.
+ * The caller must hold dir->i_rwsem exclusively.
*
* If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
* return a reference to the inode in delegated_inode. The caller
* should then break the delegation on that inode and retry. Because
* breaking a delegation may take a long time, the caller should drop
- * dir->i_mutex before doing so.
+ * dir->i_rwsem before doing so.
*
* Alternatively, a caller may pass NULL for delegated_inode. This may
* be appropriate for callers that expect the underlying filesystem not
@@ -4607,7 +4607,7 @@ EXPORT_SYMBOL(vfs_unlink);
/*
* Make sure that the actual truncation of the file will occur outside its
- * directory's i_mutex. Truncate can take a long time if there is a lot of
+ * directory's i_rwsem. Truncate can take a long time if there is a lot of
* writeout happening, and we don't want to prevent access to the directory
* while waiting on the I/O.
*/
@@ -4785,13 +4785,13 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
* @new_dentry: where to create the new link
* @delegated_inode: returns inode needing a delegation break
*
- * The caller must hold dir->i_mutex
+ * The caller must hold dir->i_rwsem exclusively.
*
* If vfs_link discovers a delegation on the to-be-linked file in need
* of breaking, it will return -EWOULDBLOCK and return a reference to the
* inode in delegated_inode. The caller should then break the delegation
* and retry. Because breaking a delegation may take a long time, the
- * caller should drop the i_mutex before doing so.
+ * caller should drop the i_rwsem before doing so.
*
* Alternatively, a caller may pass NULL for delegated_inode. This may
* be appropriate for callers that expect the underlying filesystem not
@@ -4987,7 +4987,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
* c) we may have to lock up to _four_ objects - parents and victim (if it exists),
* and source (if it's a non-directory or a subdirectory that moves to
* different parent).
- * And that - after we got ->i_mutex on parents (until then we don't know
+ * And that - after we got ->i_rwsem on parents (until then we don't know
* whether the target exists). Solution: try to be smart with locking
* order for inodes. We rely on the fact that tree topology may change
* only under ->s_vfs_rename_mutex _and_ that parent of the object we
@@ -4999,15 +4999,16 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
* has no more than 1 dentry. If "hybrid" objects will ever appear,
* we'd better make sure that there's no link(2) for them.
* d) conversion from fhandle to dentry may come in the wrong moment - when
- * we are removing the target. Solution: we will have to grab ->i_mutex
+ * we are removing the target. Solution: we will have to grab ->i_rwsem
* in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
- * ->i_mutex on parents, which works but leads to some truly excessive
+ * ->i_rwsem on parents, which works but leads to some truly excessive
* locking].
*/
int vfs_rename(struct renamedata *rd)
{
int error;
- struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir;
+ struct inode *old_dir = d_inode(rd->old_parent);
+ struct inode *new_dir = d_inode(rd->new_parent);
struct dentry *old_dentry = rd->old_dentry;
struct dentry *new_dentry = rd->new_dentry;
struct inode **delegated_inode = rd->delegated_inode;
@@ -5266,10 +5267,10 @@ retry_deleg:
if (error)
goto exit5;
- rd.old_dir = old_path.dentry->d_inode;
+ rd.old_parent = old_path.dentry;
rd.old_dentry = old_dentry;
rd.old_mnt_idmap = mnt_idmap(old_path.mnt);
- rd.new_dir = new_path.dentry->d_inode;
+ rd.new_parent = new_path.dentry;
rd.new_dentry = new_dentry;
rd.new_mnt_idmap = mnt_idmap(new_path.mnt);
rd.delegated_inode = &delegated_inode;
diff --git a/fs/namespace.c b/fs/namespace.c
index 2f2e93927f46..ddfd4457d338 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -79,6 +79,7 @@ static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
+static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */
static DEFINE_SEQLOCK(mnt_ns_tree_lock);
#ifdef CONFIG_FSNOTIFY
@@ -380,10 +381,9 @@ static struct mount *alloc_vfsmnt(const char *name)
INIT_LIST_HEAD(&mnt->mnt_list);
INIT_LIST_HEAD(&mnt->mnt_expire);
INIT_LIST_HEAD(&mnt->mnt_share);
- INIT_LIST_HEAD(&mnt->mnt_slave_list);
- INIT_LIST_HEAD(&mnt->mnt_slave);
+ INIT_HLIST_HEAD(&mnt->mnt_slave_list);
+ INIT_HLIST_NODE(&mnt->mnt_slave);
INIT_HLIST_NODE(&mnt->mnt_mp_list);
- INIT_LIST_HEAD(&mnt->mnt_umounting);
INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
RB_CLEAR_NODE(&mnt->mnt_node);
mnt->mnt.mnt_idmap = &nop_mnt_idmap;
@@ -894,7 +894,7 @@ struct vfsmount *lookup_mnt(const struct path *path)
* namespace not just a mount that happens to have some specified
* parent mount.
*/
-bool __is_local_mountpoint(struct dentry *dentry)
+bool __is_local_mountpoint(const struct dentry *dentry)
{
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
struct mount *mnt, *n;
@@ -911,42 +911,48 @@ bool __is_local_mountpoint(struct dentry *dentry)
return is_covered;
}
-static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
+struct pinned_mountpoint {
+ struct hlist_node node;
+ struct mountpoint *mp;
+};
+
+static bool lookup_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m)
{
struct hlist_head *chain = mp_hash(dentry);
struct mountpoint *mp;
hlist_for_each_entry(mp, chain, m_hash) {
if (mp->m_dentry == dentry) {
- mp->m_count++;
- return mp;
+ hlist_add_head(&m->node, &mp->m_list);
+ m->mp = mp;
+ return true;
}
}
- return NULL;
+ return false;
}
-static struct mountpoint *get_mountpoint(struct dentry *dentry)
+static int get_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m)
{
- struct mountpoint *mp, *new = NULL;
+ struct mountpoint *mp __free(kfree) = NULL;
+ bool found;
int ret;
if (d_mountpoint(dentry)) {
/* might be worth a WARN_ON() */
if (d_unlinked(dentry))
- return ERR_PTR(-ENOENT);
+ return -ENOENT;
mountpoint:
read_seqlock_excl(&mount_lock);
- mp = lookup_mountpoint(dentry);
+ found = lookup_mountpoint(dentry, m);
read_sequnlock_excl(&mount_lock);
- if (mp)
- goto done;
+ if (found)
+ return 0;
}
- if (!new)
- new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
- if (!new)
- return ERR_PTR(-ENOMEM);
-
+ if (!mp)
+ mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
+ if (!mp)
+ return -ENOMEM;
/* Exactly one process may set d_mounted */
ret = d_set_mounted(dentry);
@@ -956,34 +962,28 @@ mountpoint:
goto mountpoint;
/* The dentry is not available as a mountpoint? */
- mp = ERR_PTR(ret);
if (ret)
- goto done;
+ return ret;
/* Add the new mountpoint to the hash table */
read_seqlock_excl(&mount_lock);
- new->m_dentry = dget(dentry);
- new->m_count = 1;
- hlist_add_head(&new->m_hash, mp_hash(dentry));
- INIT_HLIST_HEAD(&new->m_list);
+ mp->m_dentry = dget(dentry);
+ hlist_add_head(&mp->m_hash, mp_hash(dentry));
+ INIT_HLIST_HEAD(&mp->m_list);
+ hlist_add_head(&m->node, &mp->m_list);
+ m->mp = no_free_ptr(mp);
read_sequnlock_excl(&mount_lock);
-
- mp = new;
- new = NULL;
-done:
- kfree(new);
- return mp;
+ return 0;
}
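
The refcount (m_count) is gone: a mountpoint now lives exactly as long as something is linked on its m_list, and a struct pinned_mountpoint is just such a link plus the pointer. A sketch of the pin/unpin pairing under the locks the comments above call for (mount_lock via read_seqlock_excl; namespace_sem assumed held by the caller):

static void with_pinned_mountpoint(struct dentry *dentry)
{
	struct pinned_mountpoint pin = {};

	read_seqlock_excl(&mount_lock);
	if (lookup_mountpoint(dentry, &pin)) {
		/* pin.node is on pin.mp->m_list, so pin.mp can't be freed */
		/* ... inspect pin.mp ... */
		unpin_mountpoint(&pin);	/* may free mp if list went empty */
	}
	read_sequnlock_excl(&mount_lock);
}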
/*
* vfsmount lock must be held. Additionally, the caller is responsible
* for serializing calls for given disposal list.
*/
-static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
+static void maybe_free_mountpoint(struct mountpoint *mp, struct list_head *list)
{
- if (!--mp->m_count) {
+ if (hlist_empty(&mp->m_list)) {
struct dentry *dentry = mp->m_dentry;
- BUG_ON(!hlist_empty(&mp->m_list));
spin_lock(&dentry->d_lock);
dentry->d_flags &= ~DCACHE_MOUNTED;
spin_unlock(&dentry->d_lock);
@@ -993,10 +993,15 @@ static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
}
}
-/* called with namespace_lock and vfsmount lock */
-static void put_mountpoint(struct mountpoint *mp)
+/*
+ * locks: mount_lock [read_seqlock_excl], namespace_sem [excl]
+ */
+static void unpin_mountpoint(struct pinned_mountpoint *m)
{
- __put_mountpoint(mp, &ex_mountpoints);
+ if (m->mp) {
+ hlist_del(&m->node);
+ maybe_free_mountpoint(m->mp, &ex_mountpoints);
+ }
}
static inline int check_mnt(struct mount *mnt)
@@ -1038,11 +1043,14 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
}
/*
- * vfsmount lock must be held for write
+ * locks: mount_lock[write_seqlock]
*/
-static struct mountpoint *unhash_mnt(struct mount *mnt)
+static void __umount_mnt(struct mount *mnt, struct list_head *shrink_list)
{
struct mountpoint *mp;
+ struct mount *parent = mnt->mnt_parent;
+ if (unlikely(parent->overmount == mnt))
+ parent->overmount = NULL;
mnt->mnt_parent = mnt;
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
list_del_init(&mnt->mnt_child);
@@ -1050,15 +1058,15 @@ static struct mountpoint *unhash_mnt(struct mount *mnt)
hlist_del_init(&mnt->mnt_mp_list);
mp = mnt->mnt_mp;
mnt->mnt_mp = NULL;
- return mp;
+ maybe_free_mountpoint(mp, shrink_list);
}
/*
- * vfsmount lock must be held for write
+ * locks: mount_lock[write_seqlock], namespace_sem[excl] (for ex_mountpoints)
*/
static void umount_mnt(struct mount *mnt)
{
- put_mountpoint(unhash_mnt(mnt));
+ __umount_mnt(mnt, &ex_mountpoints);
}
/*
@@ -1068,43 +1076,17 @@ void mnt_set_mountpoint(struct mount *mnt,
struct mountpoint *mp,
struct mount *child_mnt)
{
- mp->m_count++;
- mnt_add_count(mnt, 1); /* essentially, that's mntget */
child_mnt->mnt_mountpoint = mp->m_dentry;
child_mnt->mnt_parent = mnt;
child_mnt->mnt_mp = mp;
hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
}
-/**
- * mnt_set_mountpoint_beneath - mount a mount beneath another one
- *
- * @new_parent: the source mount
- * @top_mnt: the mount beneath which @new_parent is mounted
- * @new_mp: the new mountpoint of @top_mnt on @new_parent
- *
- * Remove @top_mnt from its current mountpoint @top_mnt->mnt_mp and
- * parent @top_mnt->mnt_parent and mount it on top of @new_parent at
- * @new_mp. And mount @new_parent on the old parent and old
- * mountpoint of @top_mnt.
- *
- * Context: This function expects namespace_lock() and lock_mount_hash()
- * to have been acquired in that order.
- */
-static void mnt_set_mountpoint_beneath(struct mount *new_parent,
- struct mount *top_mnt,
- struct mountpoint *new_mp)
-{
- struct mount *old_top_parent = top_mnt->mnt_parent;
- struct mountpoint *old_top_mp = top_mnt->mnt_mp;
-
- mnt_set_mountpoint(old_top_parent, old_top_mp, new_parent);
- mnt_change_mountpoint(new_parent, new_mp, top_mnt);
-}
-
-
-static void __attach_mnt(struct mount *mnt, struct mount *parent)
+static void make_visible(struct mount *mnt)
{
+ struct mount *parent = mnt->mnt_parent;
+ if (unlikely(mnt->mnt_mountpoint == parent->mnt.mnt_root))
+ parent->overmount = mnt;
hlist_add_head_rcu(&mnt->mnt_hash,
m_hash(&parent->mnt, mnt->mnt_mountpoint));
list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
@@ -1116,51 +1098,34 @@ static void __attach_mnt(struct mount *mnt, struct mount *parent)
* @parent: the parent
* @mnt: the new mount
* @mp: the new mountpoint
- * @beneath: whether to mount @mnt beneath or on top of @parent
*
- * If @beneath is false, mount @mnt at @mp on @parent. Then attach @mnt
+ * Mount @mnt at @mp on @parent. Then attach @mnt
* to @parent's child mount list and to @mount_hashtable.
*
- * If @beneath is true, remove @mnt from its current parent and
- * mountpoint and mount it on @mp on @parent, and mount @parent on the
- * old parent and old mountpoint of @mnt. Finally, attach @parent to
- * @mnt_hashtable and @parent->mnt_parent->mnt_mounts.
- *
- * Note, when __attach_mnt() is called @mnt->mnt_parent already points
+ * Note, when make_visible() is called @mnt->mnt_parent already points
* to the correct parent.
*
* Context: This function expects namespace_lock() and lock_mount_hash()
* to have been acquired in that order.
*/
static void attach_mnt(struct mount *mnt, struct mount *parent,
- struct mountpoint *mp, bool beneath)
+ struct mountpoint *mp)
{
- if (beneath)
- mnt_set_mountpoint_beneath(mnt, parent, mp);
- else
- mnt_set_mountpoint(parent, mp, mnt);
- /*
- * Note, @mnt->mnt_parent has to be used. If @mnt was mounted
- * beneath @parent then @mnt will need to be attached to
- * @parent's old parent, not @parent. IOW, @mnt->mnt_parent
- * isn't the same mount as @parent.
- */
- __attach_mnt(mnt, mnt->mnt_parent);
+ mnt_set_mountpoint(parent, mp, mnt);
+ make_visible(mnt);
}
void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
{
struct mountpoint *old_mp = mnt->mnt_mp;
- struct mount *old_parent = mnt->mnt_parent;
list_del_init(&mnt->mnt_child);
hlist_del_init(&mnt->mnt_mp_list);
hlist_del_init_rcu(&mnt->mnt_hash);
- attach_mnt(mnt, parent, mp, false);
+ attach_mnt(mnt, parent, mp);
- put_mountpoint(old_mp);
- mnt_add_count(old_parent, -1);
+ maybe_free_mountpoint(old_mp, &ex_mountpoints);
}
static inline struct mount *node_to_mount(struct rb_node *node)
@@ -1197,32 +1162,6 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
mnt_notify_add(mnt);
}
-/*
- * vfsmount lock must be held for write
- */
-static void commit_tree(struct mount *mnt)
-{
- struct mount *parent = mnt->mnt_parent;
- struct mount *m;
- LIST_HEAD(head);
- struct mnt_namespace *n = parent->mnt_ns;
-
- BUG_ON(parent == mnt);
-
- list_add_tail(&head, &mnt->mnt_list);
- while (!list_empty(&head)) {
- m = list_first_entry(&head, typeof(*m), mnt_list);
- list_del(&m->mnt_list);
-
- mnt_add_to_ns(n, m);
- }
- n->nr_mounts += n->pending_mounts;
- n->pending_mounts = 0;
-
- __attach_mnt(mnt, parent);
- touch_mnt_namespace(n);
-}
-
static struct mount *next_mnt(struct mount *p, struct mount *root)
{
struct list_head *next = p->mnt_mounts.next;
@@ -1249,6 +1188,27 @@ static struct mount *skip_mnt_tree(struct mount *p)
return p;
}
+/*
+ * vfsmount lock must be held for write
+ */
+static void commit_tree(struct mount *mnt)
+{
+ struct mnt_namespace *n = mnt->mnt_parent->mnt_ns;
+
+ if (!mnt_ns_attached(mnt)) {
+ for (struct mount *m = mnt; m; m = next_mnt(m, mnt))
+ if (unlikely(mnt_ns_attached(m)))
+ m = skip_mnt_tree(m);
+ else
+ mnt_add_to_ns(n, m);
+ n->nr_mounts += n->pending_mounts;
+ n->pending_mounts = 0;
+ }
+
+ make_visible(mnt);
+ touch_mnt_namespace(n);
+}
+
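
The rewritten commit_tree() leans on the usual preorder walk: next_mnt() steps through the subtree under a root, and skip_mnt_tree() jumps past the subtree of the current mount. A small illustrative counter using the same idiom:

static unsigned int count_unattached(struct mount *root)
{
	unsigned int n = 0;

	for (struct mount *m = root; m; m = next_mnt(m, root)) {
		if (mnt_ns_attached(m)) {
			/* whole subtree is already in a namespace */
			m = skip_mnt_tree(m);
			continue;
		}
		n++;
	}
	return n;
}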
/**
* vfs_create_mount - Create a mount for a configured superblock
* @fc: The configuration context with the superblock attached
@@ -1296,6 +1256,15 @@ struct vfsmount *fc_mount(struct fs_context *fc)
}
EXPORT_SYMBOL(fc_mount);
+struct vfsmount *fc_mount_longterm(struct fs_context *fc)
+{
+ struct vfsmount *mnt = fc_mount(fc);
+ if (!IS_ERR(mnt))
+ real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
+ return mnt;
+}
+EXPORT_SYMBOL(fc_mount_longterm);
+
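
A hedged sketch of the intended caller: kernel-internal mounts that never join a mount namespace get their ->mnt_ns set to MNT_NS_INTERNAL up front. fs_context_for_mount() and put_fs_context() are the standard fs_context API; "type" is whatever file_system_type the caller owns.

static struct vfsmount *mount_internal(struct file_system_type *type)
{
	struct fs_context *fc = fs_context_for_mount(type, 0);
	struct vfsmount *mnt;

	if (IS_ERR(fc))
		return ERR_CAST(fc);
	mnt = fc_mount_longterm(fc);	/* marks the mount MNT_NS_INTERNAL */
	put_fs_context(fc);
	return mnt;
}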
struct vfsmount *vfs_kern_mount(struct file_system_type *type,
int flags, const char *name,
void *data)
@@ -1337,7 +1306,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
if (!mnt)
return ERR_PTR(-ENOMEM);
- if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
+ mnt->mnt.mnt_flags = READ_ONCE(old->mnt.mnt_flags) &
+ ~MNT_INTERNAL_FLAGS;
+
+ if (flag & (CL_SLAVE | CL_PRIVATE))
mnt->mnt_group_id = 0; /* not a peer of original */
else
mnt->mnt_group_id = old->mnt_group_id;
@@ -1348,8 +1320,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
goto out_free;
}
- mnt->mnt.mnt_flags = old->mnt.mnt_flags;
- mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
+ if (mnt->mnt_group_id)
+ set_mnt_shared(mnt);
atomic_inc(&sb->s_active);
mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
@@ -1362,30 +1334,19 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
unlock_mount_hash();
- if ((flag & CL_SLAVE) ||
- ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
- list_add(&mnt->mnt_slave, &old->mnt_slave_list);
+ if (flag & CL_PRIVATE) // we are done with it
+ return mnt;
+
+ if (peers(mnt, old))
+ list_add(&mnt->mnt_share, &old->mnt_share);
+
+ if ((flag & CL_SLAVE) && old->mnt_group_id) {
+ hlist_add_head(&mnt->mnt_slave, &old->mnt_slave_list);
mnt->mnt_master = old;
- CLEAR_MNT_SHARED(mnt);
- } else if (!(flag & CL_PRIVATE)) {
- if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
- list_add(&mnt->mnt_share, &old->mnt_share);
- if (IS_MNT_SLAVE(old))
- list_add(&mnt->mnt_slave, &old->mnt_slave);
+ } else if (IS_MNT_SLAVE(old)) {
+ hlist_add_behind(&mnt->mnt_slave, &old->mnt_slave);
mnt->mnt_master = old->mnt_master;
- } else {
- CLEAR_MNT_SHARED(mnt);
}
- if (flag & CL_MAKE_SHARED)
- set_mnt_shared(mnt);
-
- /* stick the duplicate mount on the same expiry list
- * as the original if that was on one */
- if (flag & CL_EXPIRE) {
- if (!list_empty(&old->mnt_expire))
- list_add(&mnt->mnt_expire, &old->mnt_expire);
- }
-
return mnt;
out_free:
@@ -1478,11 +1439,13 @@ static void mntput_no_expire(struct mount *mnt)
rcu_read_unlock();
list_del(&mnt->mnt_instance);
+ if (unlikely(!list_empty(&mnt->mnt_expire)))
+ list_del(&mnt->mnt_expire);
if (unlikely(!list_empty(&mnt->mnt_mounts))) {
struct mount *p, *tmp;
list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
- __put_mountpoint(unhash_mnt(p), &list);
+ __umount_mnt(p, &list);
hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
}
}
@@ -1679,23 +1642,19 @@ const struct seq_operations mounts_op = {
int may_umount_tree(struct vfsmount *m)
{
struct mount *mnt = real_mount(m);
- int actual_refs = 0;
- int minimum_refs = 0;
- struct mount *p;
- BUG_ON(!m);
+ bool busy = false;
/* write lock needed for mnt_get_count */
lock_mount_hash();
- for (p = mnt; p; p = next_mnt(p, mnt)) {
- actual_refs += mnt_get_count(p);
- minimum_refs += 2;
+ for (struct mount *p = mnt; p; p = next_mnt(p, mnt)) {
+ if (mnt_get_count(p) > (p == mnt ? 2 : 1)) {
+ busy = true;
+ break;
+ }
}
unlock_mount_hash();
- if (actual_refs > minimum_refs)
- return 0;
-
- return 1;
+ return !busy;
}
EXPORT_SYMBOL(may_umount_tree);
@@ -1771,15 +1730,18 @@ static bool need_notify_mnt_list(void)
}
#endif
+static void free_mnt_ns(struct mnt_namespace *);
static void namespace_unlock(void)
{
struct hlist_head head;
struct hlist_node *p;
struct mount *m;
+ struct mnt_namespace *ns = emptied_ns;
LIST_HEAD(list);
hlist_move_list(&unmounted, &head);
list_splice_init(&ex_mountpoints, &list);
+ emptied_ns = NULL;
if (need_notify_mnt_list()) {
/*
@@ -1793,6 +1755,11 @@ static void namespace_unlock(void)
} else {
up_write(&namespace_sem);
}
+ if (unlikely(ns)) {
+ /* Make sure we notice when we leak mounts. */
+ VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
+ free_mnt_ns(ns);
+ }
shrink_dentry_list(&list);
@@ -1865,9 +1832,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
for (p = mnt; p; p = next_mnt(p, mnt)) {
p->mnt.mnt_flags |= MNT_UMOUNT;
if (mnt_ns_attached(p))
- move_from_ns(p, &tmp_list);
- else
- list_move(&p->mnt_list, &tmp_list);
+ move_from_ns(p);
+ list_add_tail(&p->mnt_list, &tmp_list);
}
/* Hide the mounts from mnt_mounts */
@@ -1896,7 +1862,6 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
disconnect = disconnect_mount(p, how);
if (mnt_has_parent(p)) {
- mnt_add_count(p->mnt_parent, -1);
if (!disconnect) {
/* Don't forget about p */
list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
@@ -1973,7 +1938,7 @@ static int do_umount(struct mount *mnt, int flags)
* all race cases, but it's a slowpath.
*/
lock_mount_hash();
- if (mnt_get_count(mnt) != 2) {
+ if (!list_empty(&mnt->mnt_mounts) || mnt_get_count(mnt) != 2) {
unlock_mount_hash();
return -EBUSY;
}
@@ -2019,23 +1984,27 @@ static int do_umount(struct mount *mnt, int flags)
namespace_lock();
lock_mount_hash();
- /* Recheck MNT_LOCKED with the locks held */
+ /* Repeat the earlier racy checks, now that we are holding the locks */
retval = -EINVAL;
+ if (!check_mnt(mnt))
+ goto out;
+
if (mnt->mnt.mnt_flags & MNT_LOCKED)
goto out;
+ if (!mnt_has_parent(mnt)) /* not the absolute root */
+ goto out;
+
event++;
if (flags & MNT_DETACH) {
- if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list))
- umount_tree(mnt, UMOUNT_PROPAGATE);
+ umount_tree(mnt, UMOUNT_PROPAGATE);
retval = 0;
} else {
smp_mb(); // paired with __legitimize_mnt()
shrink_submounts(mnt);
retval = -EBUSY;
if (!propagate_mount_busy(mnt, 2)) {
- if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list))
- umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
+ umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
retval = 0;
}
}
@@ -2053,29 +2022,28 @@ out:
* detach_mounts allows lazily unmounting those mounts instead of
* leaking them.
*
- * The caller may hold dentry->d_inode->i_mutex.
+ * The caller may hold dentry->d_inode->i_rwsem.
*/
void __detach_mounts(struct dentry *dentry)
{
- struct mountpoint *mp;
+ struct pinned_mountpoint mp = {};
struct mount *mnt;
namespace_lock();
lock_mount_hash();
- mp = lookup_mountpoint(dentry);
- if (!mp)
+ if (!lookup_mountpoint(dentry, &mp))
goto out_unlock;
event++;
- while (!hlist_empty(&mp->m_list)) {
- mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
+ while (mp.node.next) {
+ mnt = hlist_entry(mp.node.next, struct mount, mnt_mp_list);
if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
umount_mnt(mnt);
hlist_add_head(&mnt->mnt_umount, &unmounted);
}
else umount_tree(mnt, UMOUNT_CONNECTED);
}
- put_mountpoint(mp);
+ unpin_mountpoint(&mp);
out_unlock:
unlock_mount_hash();
namespace_unlock();
@@ -2259,7 +2227,6 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
return dst_mnt;
src_parent = src_root;
- dst_mnt->mnt_mountpoint = src_root->mnt_mountpoint;
list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) {
if (!is_subdir(src_root_child->mnt_mountpoint, dentry))
@@ -2294,8 +2261,16 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
if (IS_ERR(dst_mnt))
goto out;
lock_mount_hash();
- list_add_tail(&dst_mnt->mnt_list, &res->mnt_list);
- attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp, false);
+ if (src_mnt->mnt.mnt_flags & MNT_LOCKED)
+ dst_mnt->mnt.mnt_flags |= MNT_LOCKED;
+ if (unlikely(flag & CL_EXPIRE)) {
+ /* stick the duplicate mount on the same expiry
+			 * list as the original if the original was on one */
+ if (!list_empty(&src_mnt->mnt_expire))
+ list_add(&dst_mnt->mnt_expire,
+ &src_mnt->mnt_expire);
+ }
+ attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp);
unlock_mount_hash();
}
}
@@ -2310,107 +2285,97 @@ out:
return dst_mnt;
}
-/* Caller should check returned pointer for errors */
-
-struct vfsmount *collect_mounts(const struct path *path)
+static inline bool extend_array(struct path **res, struct path **to_free,
+ unsigned n, unsigned *count, unsigned new_count)
{
- struct mount *tree;
- namespace_lock();
- if (!check_mnt(real_mount(path->mnt)))
- tree = ERR_PTR(-EINVAL);
- else
- tree = copy_tree(real_mount(path->mnt), path->dentry,
- CL_COPY_ALL | CL_PRIVATE);
- namespace_unlock();
- if (IS_ERR(tree))
- return ERR_CAST(tree);
- return &tree->mnt;
-}
+ struct path *p;
-static void free_mnt_ns(struct mnt_namespace *);
-static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
+ if (likely(n < *count))
+ return true;
+ p = kmalloc_array(new_count, sizeof(struct path), GFP_KERNEL);
+ if (p && *count)
+ memcpy(p, *res, *count * sizeof(struct path));
+ *count = new_count;
+ kfree(*to_free);
+ *to_free = *res = p;
+ return p;
+}
-static inline bool must_dissolve(struct mnt_namespace *mnt_ns)
+struct path *collect_paths(const struct path *path,
+ struct path *prealloc, unsigned count)
{
- /*
- * This mount belonged to an anonymous mount namespace
- * but was moved to a non-anonymous mount namespace and
- * then unmounted.
- */
- if (unlikely(!mnt_ns))
- return false;
+ struct mount *root = real_mount(path->mnt);
+ struct mount *child;
+ struct path *res = prealloc, *to_free = NULL;
+ unsigned n = 0;
- /*
- * This mount belongs to a non-anonymous mount namespace
- * and we know that such a mount can never transition to
- * an anonymous mount namespace again.
- */
- if (!is_anon_ns(mnt_ns)) {
- /*
- * A detached mount either belongs to an anonymous mount
- * namespace or a non-anonymous mount namespace. It
- * should never belong to something purely internal.
- */
- VFS_WARN_ON_ONCE(mnt_ns == MNT_NS_INTERNAL);
- return false;
+ guard(rwsem_read)(&namespace_sem);
+
+ if (!check_mnt(root))
+ return ERR_PTR(-EINVAL);
+ if (!extend_array(&res, &to_free, 0, &count, 32))
+ return ERR_PTR(-ENOMEM);
+ res[n++] = *path;
+ list_for_each_entry(child, &root->mnt_mounts, mnt_child) {
+ if (!is_subdir(child->mnt_mountpoint, path->dentry))
+ continue;
+ for (struct mount *m = child; m; m = next_mnt(m, child)) {
+ if (!extend_array(&res, &to_free, n, &count, 2 * count))
+ return ERR_PTR(-ENOMEM);
+ res[n].mnt = &m->mnt;
+ res[n].dentry = m->mnt.mnt_root;
+ n++;
+ }
}
+ if (!extend_array(&res, &to_free, n, &count, count + 1))
+ return ERR_PTR(-ENOMEM);
+ memset(res + n, 0, (count - n) * sizeof(struct path));
+ for (struct path *p = res; p->mnt; p++)
+ path_get(p);
+ return res;
+}
- return true;
+void drop_collected_paths(struct path *paths, struct path *prealloc)
+{
+ for (struct path *p = paths; p->mnt; p++)
+ path_put(p);
+ if (paths != prealloc)
+ kfree(paths);
}
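
Sketch of the calling convention implied by the pair above: a small on-stack array avoids allocation in the common case, the returned vector is terminated by a zeroed entry (p->mnt == NULL), and drop_collected_paths() both drops the references and frees the vector if it outgrew the preallocation.

static void for_each_collected(const struct path *path)
{
	struct path prealloc[16];
	struct path *paths = collect_paths(path, prealloc, ARRAY_SIZE(prealloc));

	if (IS_ERR(paths))
		return;
	for (struct path *p = paths; p->mnt; p++) {
		/* ... look at p->mnt / p->dentry ... */
	}
	drop_collected_paths(paths, prealloc);
}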
+static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
+
void dissolve_on_fput(struct vfsmount *mnt)
{
- struct mnt_namespace *ns;
struct mount *m = real_mount(mnt);
+ /*
+	 * m used to be the root of an anon namespace; if it still is one,
+	 * we need to dissolve the mount tree and free that namespace.
+	 * Let's try to avoid taking namespace_sem if we can determine
+	 * that there's nothing to do without it - rcu_read_lock() is
+	 * enough to make anon_ns_root() memory-safe, and once m has
+	 * left its namespace, it's no longer our concern, since it will
+	 * never become the root of an anon ns again.
+ */
+
scoped_guard(rcu) {
- if (!must_dissolve(READ_ONCE(m->mnt_ns)))
+ if (!anon_ns_root(m))
return;
}
scoped_guard(namespace_lock, &namespace_sem) {
- ns = m->mnt_ns;
- if (!must_dissolve(ns))
- return;
-
- /*
- * After must_dissolve() we know that this is a detached
- * mount in an anonymous mount namespace.
- *
- * Now when mnt_has_parent() reports that this mount
- * tree has a parent, we know that this anonymous mount
- * tree has been moved to another anonymous mount
- * namespace.
- *
- * So when closing this file we cannot unmount the mount
- * tree. This will be done when the file referring to
- * the root of the anonymous mount namespace will be
- * closed (It could already be closed but it would sync
- * on @namespace_sem and wait for us to finish.).
- */
- if (mnt_has_parent(m))
+ if (!anon_ns_root(m))
return;
+ emptied_ns = m->mnt_ns;
lock_mount_hash();
umount_tree(m, UMOUNT_CONNECTED);
unlock_mount_hash();
}
-
- /* Make sure we notice when we leak mounts. */
- VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
- free_mnt_ns(ns);
}
-void drop_collected_mounts(struct vfsmount *mnt)
-{
- namespace_lock();
- lock_mount_hash();
- umount_tree(real_mount(mnt), 0);
- unlock_mount_hash();
- namespace_unlock();
-}
-
-bool has_locked_children(struct mount *mnt, struct dentry *dentry)
+static bool __has_locked_children(struct mount *mnt, struct dentry *dentry)
{
struct mount *child;
@@ -2424,6 +2389,16 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry)
return false;
}
+bool has_locked_children(struct mount *mnt, struct dentry *dentry)
+{
+ bool res;
+
+ read_seqlock_excl(&mount_lock);
+ res = __has_locked_children(mnt, dentry);
+ read_sequnlock_excl(&mount_lock);
+ return res;
+}
+
/*
* Check that there aren't references to earlier/same mount namespaces in the
* specified subtree. Such references can act as pins for mount namespaces
@@ -2468,23 +2443,25 @@ struct vfsmount *clone_private_mount(const struct path *path)
if (IS_MNT_UNBINDABLE(old_mnt))
return ERR_PTR(-EINVAL);
- if (mnt_has_parent(old_mnt)) {
- if (!check_mnt(old_mnt))
- return ERR_PTR(-EINVAL);
- } else {
- if (!is_mounted(&old_mnt->mnt))
- return ERR_PTR(-EINVAL);
-
- /* Make sure this isn't something purely kernel internal. */
- if (!is_anon_ns(old_mnt->mnt_ns))
+ /*
+ * Make sure the source mount is acceptable.
+ * Anything mounted in our mount namespace is allowed.
+ * Otherwise, it must be the root of an anonymous mount
+ * namespace, and we need to make sure no namespace
+ * loops get created.
+ */
+ if (!check_mnt(old_mnt)) {
+ if (!anon_ns_root(old_mnt))
return ERR_PTR(-EINVAL);
- /* Make sure we don't create mount namespace loops. */
if (!check_for_nsfs_mounts(old_mnt))
return ERR_PTR(-EINVAL);
}
- if (has_locked_children(old_mnt, path->dentry))
+ if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ if (__has_locked_children(old_mnt, path->dentry))
return ERR_PTR(-EINVAL);
new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
@@ -2497,21 +2474,6 @@ struct vfsmount *clone_private_mount(const struct path *path)
}
EXPORT_SYMBOL_GPL(clone_private_mount);
-int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
- struct vfsmount *root)
-{
- struct mount *mnt;
- int res = f(root, arg);
- if (res)
- return res;
- list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
- res = f(&mnt->mnt, arg);
- if (res)
- return res;
- }
- return 0;
-}
-
static void lock_mnt_tree(struct mount *mnt)
{
struct mount *p;
@@ -2533,7 +2495,7 @@ static void lock_mnt_tree(struct mount *mnt)
if (flags & MNT_NOEXEC)
flags |= MNT_LOCK_NOEXEC;
/* Don't allow unprivileged users to reveal what is under a mount */
- if (list_empty(&p->mnt_expire))
+ if (list_empty(&p->mnt_expire) && p != mnt)
flags |= MNT_LOCKED;
p->mnt.mnt_flags = flags;
}
@@ -2554,7 +2516,7 @@ static int invent_group_ids(struct mount *mnt, bool recurse)
struct mount *p;
for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
- if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
+ if (!p->mnt_group_id) {
int err = mnt_alloc_group_id(p);
if (err) {
cleanup_group_ids(mnt, p);
@@ -2590,17 +2552,15 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
}
enum mnt_tree_flags_t {
- MNT_TREE_MOVE = BIT(0),
- MNT_TREE_BENEATH = BIT(1),
- MNT_TREE_PROPAGATION = BIT(2),
+ MNT_TREE_BENEATH = BIT(0),
+ MNT_TREE_PROPAGATION = BIT(1),
};
/**
* attach_recursive_mnt - attach a source mount tree
* @source_mnt: mount tree to be attached
- * @top_mnt: mount that @source_mnt will be mounted on or mounted beneath
+ * @dest_mnt: mount that @source_mnt will be mounted on
* @dest_mp: the mountpoint @source_mnt will be mounted at
- * @flags: modify how @source_mnt is supposed to be attached
*
 * NOTE: the table below explains the semantics when a source mount
* of a given type is attached to a destination mount of a given type.
@@ -2663,26 +2623,31 @@ enum mnt_tree_flags_t {
* Otherwise a negative error code is returned.
*/
static int attach_recursive_mnt(struct mount *source_mnt,
- struct mount *top_mnt,
- struct mountpoint *dest_mp,
- enum mnt_tree_flags_t flags)
+ struct mount *dest_mnt,
+ struct mountpoint *dest_mp)
{
struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
HLIST_HEAD(tree_list);
- struct mnt_namespace *ns = top_mnt->mnt_ns;
- struct mountpoint *smp;
- struct mount *child, *dest_mnt, *p;
+ struct mnt_namespace *ns = dest_mnt->mnt_ns;
+ struct pinned_mountpoint root = {};
+ struct mountpoint *shorter = NULL;
+ struct mount *child, *p;
+ struct mount *top;
struct hlist_node *n;
int err = 0;
- bool moving = flags & MNT_TREE_MOVE, beneath = flags & MNT_TREE_BENEATH;
+ bool moving = mnt_has_parent(source_mnt);
/*
* Preallocate a mountpoint in case the new mounts need to be
* mounted beneath mounts on the same mountpoint.
*/
- smp = get_mountpoint(source_mnt->mnt.mnt_root);
- if (IS_ERR(smp))
- return PTR_ERR(smp);
+ for (top = source_mnt; unlikely(top->overmount); top = top->overmount) {
+ if (!shorter && is_mnt_ns_file(top->mnt.mnt_root))
+ shorter = top->mnt_mp;
+ }
+ err = get_mountpoint(top->mnt.mnt_root, &root);
+ if (err)
+ return err;
/* Is there space to add these mounts to the mount namespace? */
if (!moving) {
@@ -2691,11 +2656,6 @@ static int attach_recursive_mnt(struct mount *source_mnt,
goto out;
}
- if (beneath)
- dest_mnt = top_mnt->mnt_parent;
- else
- dest_mnt = top_mnt;
-
if (IS_MNT_SHARED(dest_mnt)) {
err = invent_group_ids(source_mnt, true);
if (err)
@@ -2712,42 +2672,50 @@ static int attach_recursive_mnt(struct mount *source_mnt,
}
if (moving) {
- if (beneath)
- dest_mp = smp;
- unhash_mnt(source_mnt);
- attach_mnt(source_mnt, top_mnt, dest_mp, beneath);
+ umount_mnt(source_mnt);
mnt_notify_add(source_mnt);
- touch_mnt_namespace(source_mnt->mnt_ns);
+ /* if the mount is moved, it should no longer be expired
+ * automatically */
+ list_del_init(&source_mnt->mnt_expire);
} else {
if (source_mnt->mnt_ns) {
- LIST_HEAD(head);
-
/* move from anon - the caller will destroy */
+ emptied_ns = source_mnt->mnt_ns;
for (p = source_mnt; p; p = next_mnt(p, source_mnt))
- move_from_ns(p, &head);
- list_del_init(&head);
+ move_from_ns(p);
}
- if (beneath)
- mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp);
- else
- mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
- commit_tree(source_mnt);
}
+ mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
+ /*
+ * Now the original copy is in the same state as the secondaries -
+	 * its root attached to the mountpoint but not hashed, and all mounts
+ * in it are either in our namespace or in no namespace at all.
+ * Add the original to the list of copies and deal with the
+ * rest of work for all of them uniformly.
+ */
+ hlist_add_head(&source_mnt->mnt_hash, &tree_list);
+
hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
struct mount *q;
hlist_del_init(&child->mnt_hash);
- q = __lookup_mnt(&child->mnt_parent->mnt,
- child->mnt_mountpoint);
- if (q)
- mnt_change_mountpoint(child, smp, q);
/* Notice when we are propagating across user namespaces */
if (child->mnt_parent->mnt_ns->user_ns != user_ns)
lock_mnt_tree(child);
- child->mnt.mnt_flags &= ~MNT_LOCKED;
+ q = __lookup_mnt(&child->mnt_parent->mnt,
+ child->mnt_mountpoint);
+ if (q) {
+ struct mountpoint *mp = root.mp;
+ struct mount *r = child;
+ while (unlikely(r->overmount))
+ r = r->overmount;
+ if (unlikely(shorter) && child != source_mnt)
+ mp = shorter;
+ mnt_change_mountpoint(r, mp, q);
+ }
commit_tree(child);
}
- put_mountpoint(smp);
+ unpin_mountpoint(&root);
unlock_mount_hash();
return 0;
@@ -2764,7 +2732,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
ns->pending_mounts = 0;
read_seqlock_excl(&mount_lock);
- put_mountpoint(smp);
+ unpin_mountpoint(&root);
read_sequnlock_excl(&mount_lock);
return err;
@@ -2804,12 +2772,12 @@ static int attach_recursive_mnt(struct mount *source_mnt,
* Return: Either the target mountpoint on the top mount or the top
* mount's mountpoint.
*/
-static struct mountpoint *do_lock_mount(struct path *path, bool beneath)
+static int do_lock_mount(struct path *path, struct pinned_mountpoint *pinned, bool beneath)
{
struct vfsmount *mnt = path->mnt;
struct dentry *dentry;
- struct mountpoint *mp = ERR_PTR(-ENOENT);
struct path under = {};
+ int err = -ENOENT;
for (;;) {
struct mount *m = real_mount(mnt);
@@ -2847,8 +2815,8 @@ static struct mountpoint *do_lock_mount(struct path *path, bool beneath)
path->dentry = dget(mnt->mnt_root);
continue; // got overmounted
}
- mp = get_mountpoint(dentry);
- if (IS_ERR(mp))
+ err = get_mountpoint(dentry, pinned);
+ if (err)
break;
if (beneath) {
/*
@@ -2859,25 +2827,25 @@ static struct mountpoint *do_lock_mount(struct path *path, bool beneath)
*/
path_put(&under);
}
- return mp;
+ return 0;
}
namespace_unlock();
inode_unlock(dentry->d_inode);
if (beneath)
path_put(&under);
- return mp;
+ return err;
}
-static inline struct mountpoint *lock_mount(struct path *path)
+static inline int lock_mount(struct path *path, struct pinned_mountpoint *m)
{
- return do_lock_mount(path, false);
+ return do_lock_mount(path, m, false);
}
-static void unlock_mount(struct mountpoint *where)
+static void unlock_mount(struct pinned_mountpoint *m)
{
- inode_unlock(where->m_dentry->d_inode);
+ inode_unlock(m->mp->m_dentry->d_inode);
read_seqlock_excl(&mount_lock);
- put_mountpoint(where);
+ unpin_mountpoint(m);
read_sequnlock_excl(&mount_lock);
namespace_unlock();
}
@@ -2891,7 +2859,7 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
d_is_dir(mnt->mnt.mnt_root))
return -ENOTDIR;
- return attach_recursive_mnt(mnt, p, mp, 0);
+ return attach_recursive_mnt(mnt, p, mp);
}
/*
@@ -2930,16 +2898,18 @@ static int do_change_type(struct path *path, int ms_flags)
return -EINVAL;
namespace_lock();
+ if (!check_mnt(mnt)) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
if (type == MS_SHARED) {
err = invent_group_ids(mnt, recurse);
if (err)
goto out_unlock;
}
- lock_mount_hash();
for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
change_mnt_propagation(m, type);
- unlock_mount_hash();
out_unlock:
namespace_unlock();
@@ -3013,26 +2983,21 @@ static inline bool may_copy_tree(struct path *path)
static struct mount *__do_loopback(struct path *old_path, int recurse)
{
- struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
+ struct mount *old = real_mount(old_path->mnt);
if (IS_MNT_UNBINDABLE(old))
- return mnt;
+ return ERR_PTR(-EINVAL);
if (!may_copy_tree(old_path))
- return mnt;
+ return ERR_PTR(-EINVAL);
- if (!recurse && has_locked_children(old, old_path->dentry))
- return mnt;
+ if (!recurse && __has_locked_children(old, old_path->dentry))
+ return ERR_PTR(-EINVAL);
if (recurse)
- mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
+ return copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
else
- mnt = clone_mnt(old, old_path->dentry, 0);
-
- if (!IS_ERR(mnt))
- mnt->mnt.mnt_flags &= ~MNT_LOCKED;
-
- return mnt;
+ return clone_mnt(old, old_path->dentry, 0);
}
/*
@@ -3043,7 +3008,7 @@ static int do_loopback(struct path *path, const char *old_name,
{
struct path old_path;
struct mount *mnt = NULL, *parent;
- struct mountpoint *mp;
+ struct pinned_mountpoint mp = {};
int err;
if (!old_name || !*old_name)
return -EINVAL;
@@ -3055,11 +3020,9 @@ static int do_loopback(struct path *path, const char *old_name,
if (mnt_ns_loop(old_path.dentry))
goto out;
- mp = lock_mount(path);
- if (IS_ERR(mp)) {
- err = PTR_ERR(mp);
+ err = lock_mount(path, &mp);
+ if (err)
goto out;
- }
parent = real_mount(path->mnt);
if (!check_mnt(parent))
@@ -3071,14 +3034,14 @@ static int do_loopback(struct path *path, const char *old_name,
goto out2;
}
- err = graft_tree(mnt, parent, mp);
+ err = graft_tree(mnt, parent, mp.mp);
if (err) {
lock_mount_hash();
umount_tree(mnt, UMOUNT_SYNC);
unlock_mount_hash();
}
out2:
- unlock_mount(mp);
+ unlock_mount(&mp);
out:
path_put(&old_path);
return err;
@@ -3414,7 +3377,7 @@ static int do_set_group(struct path *from_path, struct path *to_path)
goto out;
/* From mount should not have locked children in place of To's root */
- if (has_locked_children(from, to->mnt.mnt_root))
+ if (__has_locked_children(from, to->mnt.mnt_root))
goto out;
/* Setting sharing groups is only allowed on private mounts */
@@ -3426,18 +3389,14 @@ static int do_set_group(struct path *from_path, struct path *to_path)
goto out;
if (IS_MNT_SLAVE(from)) {
- struct mount *m = from->mnt_master;
-
- list_add(&to->mnt_slave, &m->mnt_slave_list);
- to->mnt_master = m;
+ hlist_add_behind(&to->mnt_slave, &from->mnt_slave);
+ to->mnt_master = from->mnt_master;
}
if (IS_MNT_SHARED(from)) {
to->mnt_group_id = from->mnt_group_id;
list_add(&to->mnt_share, &from->mnt_share);
- lock_mount_hash();
set_mnt_shared(to);
- unlock_mount_hash();
}
err = 0;
@@ -3453,18 +3412,36 @@ out:
* Check if path is overmounted, i.e., if there's a mount on top of
* @path->mnt with @path->dentry as mountpoint.
*
- * Context: This function expects namespace_lock() to be held.
+ * Context: namespace_sem must be held at least shared.
+ * MUST NOT be called under lock_mount_hash() (in that case one should
+ * just call __lookup_mnt() and check if it returns NULL).
* Return: If path is overmounted true is returned, false if not.
*/
static inline bool path_overmounted(const struct path *path)
{
+ unsigned seq = read_seqbegin(&mount_lock);
+ bool no_child;
+
rcu_read_lock();
- if (unlikely(__lookup_mnt(path->mnt, path->dentry))) {
- rcu_read_unlock();
- return true;
- }
+ no_child = !__lookup_mnt(path->mnt, path->dentry);
rcu_read_unlock();
- return false;
+ if (need_seqretry(&mount_lock, seq)) {
+ read_seqlock_excl(&mount_lock);
+ no_child = !__lookup_mnt(path->mnt, path->dentry);
+ read_sequnlock_excl(&mount_lock);
+ }
+ return unlikely(!no_child);
+}
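+
+The rewritten helper is an instance of a standard seqlock pattern: do the lookup locklessly under RCU, and if a writer raced (need_seqretry()), redo it once under read_seqlock_excl() to get a stable answer instead of spinning. A generic sketch of the same shape, with lookup_rcu() as a hypothetical stand-in for __lookup_mnt():
+
+/* Generic form of the pattern above: optimistic RCU lookup first,
+ * then one stable retry excluding writers if the seqcount moved.
+ * lookup_rcu() is a hypothetical helper standing in for
+ * __lookup_mnt().
+ */
+static bool entry_present(seqlock_t *lock, int key)
+{
+	unsigned seq = read_seqbegin(lock);
+	bool found;
+
+	rcu_read_lock();
+	found = lookup_rcu(key) != NULL;
+	rcu_read_unlock();
+
+	if (need_seqretry(lock, seq)) {
+		read_seqlock_excl(lock);	/* writers held off */
+		found = lookup_rcu(key) != NULL;
+		read_sequnlock_excl(lock);
+	}
+	return found;
+}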
+
+/*
+ * Check if there is a possibly empty chain of descent from p1 to p2.
+ * Locks: namespace_sem (shared) or mount_lock (read_seqlock_excl).
+ */
+static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2)
+{
+ while (p2 != p1 && mnt_has_parent(p2))
+ p2 = p2->mnt_parent;
+ return p2 == p1;
}
/**
@@ -3518,9 +3495,8 @@ static int can_move_mount_beneath(const struct path *from,
if (parent_mnt_to == current->nsproxy->mnt_ns->root)
return -EINVAL;
- for (struct mount *p = mnt_from; mnt_has_parent(p); p = p->mnt_parent)
- if (p == mnt_to)
- return -EINVAL;
+ if (mount_is_ancestor(mnt_to, mnt_from))
+ return -EINVAL;
/*
* If the parent mount propagates to the child mount this would
@@ -3605,75 +3581,67 @@ static int do_move_mount(struct path *old_path,
struct mount *p;
struct mount *old;
struct mount *parent;
- struct mountpoint *mp, *old_mp;
+ struct pinned_mountpoint mp;
int err;
- bool attached, beneath = flags & MNT_TREE_BENEATH;
+ bool beneath = flags & MNT_TREE_BENEATH;
- mp = do_lock_mount(new_path, beneath);
- if (IS_ERR(mp))
- return PTR_ERR(mp);
+ err = do_lock_mount(new_path, &mp, beneath);
+ if (err)
+ return err;
old = real_mount(old_path->mnt);
p = real_mount(new_path->mnt);
parent = old->mnt_parent;
- attached = mnt_has_parent(old);
- if (attached)
- flags |= MNT_TREE_MOVE;
- old_mp = old->mnt_mp;
ns = old->mnt_ns;
err = -EINVAL;
- if (!may_use_mount(p))
- goto out;
- /* The thing moved must be mounted... */
- if (!is_mounted(&old->mnt))
- goto out;
-
- /* ... and either ours or the root of anon namespace */
- if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
- goto out;
-
- if (is_anon_ns(ns) && ns == p->mnt_ns) {
+ if (check_mnt(old)) {
+ /* if the source is in our namespace... */
+ /* ... it should be detachable from parent */
+ if (!mnt_has_parent(old) || IS_MNT_LOCKED(old))
+ goto out;
+ /* ... and the target should be in our namespace */
+ if (!check_mnt(p))
+ goto out;
+ /* parent of the source should not be shared */
+ if (IS_MNT_SHARED(parent))
+ goto out;
+ } else {
/*
- * Ending up with two files referring to the root of the
- * same anonymous mount namespace would cause an error
- * as this would mean trying to move the same mount
- * twice into the mount tree which would be rejected
- * later. But be explicit about it right here.
+ * otherwise the source must be the root of some anon namespace.
*/
- goto out;
- } else if (is_anon_ns(p->mnt_ns)) {
+ if (!anon_ns_root(old))
+ goto out;
/*
- * Don't allow moving an attached mount tree to an
- * anonymous mount tree.
+ * Bail out early if the target is within the same namespace -
+ * subsequent checks would've rejected that, but they miss
+ * some corner cases unless we check it early.
*/
- goto out;
+ if (ns == p->mnt_ns)
+ goto out;
+ /*
+ * Target should be either in our namespace or in an acceptable
+ * anon namespace, sensu check_anonymous_mnt().
+ */
+ if (!may_use_mount(p))
+ goto out;
}
- if (old->mnt.mnt_flags & MNT_LOCKED)
- goto out;
-
if (!path_mounted(old_path))
goto out;
if (d_is_dir(new_path->dentry) !=
d_is_dir(old_path->dentry))
goto out;
- /*
- * Don't move a mount residing in a shared parent.
- */
- if (attached && IS_MNT_SHARED(parent))
- goto out;
if (beneath) {
- err = can_move_mount_beneath(old_path, new_path, mp);
+ err = can_move_mount_beneath(old_path, new_path, mp.mp);
if (err)
goto out;
err = -EINVAL;
p = p->mnt_parent;
- flags |= MNT_TREE_BENEATH;
}
/*
@@ -3685,30 +3653,12 @@ static int do_move_mount(struct path *old_path,
err = -ELOOP;
if (!check_for_nsfs_mounts(old))
goto out;
- for (; mnt_has_parent(p); p = p->mnt_parent)
- if (p == old)
- goto out;
-
- err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, flags);
- if (err)
+ if (mount_is_ancestor(old, p))
goto out;
- /* if the mount is moved, it should no longer expire
- * automatically */
- list_del_init(&old->mnt_expire);
- if (attached)
- put_mountpoint(old_mp);
+ err = attach_recursive_mnt(old, p, mp.mp);
out:
- unlock_mount(mp);
- if (!err) {
- if (attached) {
- mntput_no_expire(parent);
- } else {
- /* Make sure we notice when we leak mounts. */
- VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
- free_mnt_ns(ns);
- }
- }
+ unlock_mount(&mp);
return err;
}
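
Condensed, the new source-side validation splits into two mutually exclusive cases. The restatement below is a reading aid, not kernel code; the semantics of anon_ns_root() (that the mount is the root of an anonymous mount namespace) are assumed from its use here:

/* Reading aid for the checks above (sketch, not the kernel's code):
 * a move source is acceptable iff either
 *
 *   check_mnt(old):    it is in our namespace, and
 *     - mnt_has_parent(old) && !IS_MNT_LOCKED(old)  (detachable)
 *     - check_mnt(p)                                (target ours too)
 *     - !IS_MNT_SHARED(parent)                      (no propagation
 *                                                    from its parent)
 * or
 *   anon_ns_root(old): root of an anonymous namespace (assumed), and
 *     - old->mnt_ns != p->mnt_ns                    (early bail-out)
 *     - may_use_mount(p)                            (ours or an
 *                                                    acceptable anon ns)
 */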
@@ -3769,7 +3719,7 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
unsigned int mnt_flags)
{
struct vfsmount *mnt;
- struct mountpoint *mp;
+ struct pinned_mountpoint mp = {};
struct super_block *sb = fc->root->d_sb;
int error;
@@ -3790,13 +3740,12 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
mnt_warn_timestamp_expiry(mountpoint, mnt);
- mp = lock_mount(mountpoint);
- if (IS_ERR(mp)) {
- mntput(mnt);
- return PTR_ERR(mp);
+ error = lock_mount(mountpoint, &mp);
+ if (!error) {
+ error = do_add_mount(real_mount(mnt), mp.mp,
+ mountpoint, mnt_flags);
+ unlock_mount(&mp);
}
- error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags);
- unlock_mount(mp);
if (error < 0)
mntput(mnt);
return error;
@@ -3864,7 +3813,7 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
int finish_automount(struct vfsmount *m, const struct path *path)
{
struct dentry *dentry = path->dentry;
- struct mountpoint *mp;
+ struct pinned_mountpoint mp = {};
struct mount *mnt;
int err;
@@ -3896,14 +3845,13 @@ int finish_automount(struct vfsmount *m, const struct path *path)
err = 0;
goto discard_locked;
}
- mp = get_mountpoint(dentry);
- if (IS_ERR(mp)) {
- err = PTR_ERR(mp);
+ err = get_mountpoint(dentry, &mp);
+ if (err)
goto discard_locked;
- }
- err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
- unlock_mount(mp);
+ err = do_add_mount(mnt, mp.mp, path,
+ path->mnt->mnt_flags | MNT_SHRINKABLE);
+ unlock_mount(&mp);
if (unlikely(err))
goto discard;
return 0;
@@ -3912,12 +3860,6 @@ discard_locked:
namespace_unlock();
inode_unlock(dentry->d_inode);
discard:
- /* remove m from any expiration list it may be on */
- if (!list_empty(&mnt->mnt_expire)) {
- namespace_lock();
- list_del_init(&mnt->mnt_expire);
- namespace_unlock();
- }
mntput(m);
return err;
}
@@ -3929,11 +3871,9 @@ discard:
*/
void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
{
- namespace_lock();
-
+ read_seqlock_excl(&mount_lock);
list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
-
- namespace_unlock();
+ read_sequnlock_excl(&mount_lock);
}
EXPORT_SYMBOL(mnt_set_expiry);
@@ -4287,7 +4227,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
/* First pass: copy the tree topology */
copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
if (user_ns != ns->user_ns)
- copy_flags |= CL_SHARED_TO_SLAVE;
+ copy_flags |= CL_SLAVE;
new = copy_tree(old, old->mnt.mnt_root, copy_flags);
if (IS_ERR(new)) {
namespace_unlock();
@@ -4712,7 +4652,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
{
struct path new, old, root;
struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
- struct mountpoint *old_mp, *root_mp;
+ struct pinned_mountpoint old_mp = {};
int error;
if (!may_mount())
@@ -4733,9 +4673,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
goto out2;
get_fs_root(current->fs, &root);
- old_mp = lock_mount(&old);
- error = PTR_ERR(old_mp);
- if (IS_ERR(old_mp))
+ error = lock_mount(&old, &old_mp);
+ if (error)
goto out3;
error = -EINVAL;
@@ -4762,11 +4701,11 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
if (!path_mounted(&root))
goto out4; /* not a mountpoint */
if (!mnt_has_parent(root_mnt))
- goto out4; /* not attached */
+ goto out4; /* absolute root */
if (!path_mounted(&new))
goto out4; /* not a mountpoint */
if (!mnt_has_parent(new_mnt))
- goto out4; /* not attached */
+ goto out4; /* absolute root */
/* make sure we can reach put_old from new_root */
if (!is_path_reachable(old_mnt, old.dentry, &new))
goto out4;
@@ -4775,29 +4714,25 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
goto out4;
lock_mount_hash();
umount_mnt(new_mnt);
- root_mp = unhash_mnt(root_mnt); /* we'll need its mountpoint */
if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
new_mnt->mnt.mnt_flags |= MNT_LOCKED;
root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
}
- /* mount old root on put_old */
- attach_mnt(root_mnt, old_mnt, old_mp, false);
/* mount new_root on / */
- attach_mnt(new_mnt, root_parent, root_mp, false);
- mnt_add_count(root_parent, -1);
+ attach_mnt(new_mnt, root_parent, root_mnt->mnt_mp);
+ umount_mnt(root_mnt);
+ /* mount old root on put_old */
+ attach_mnt(root_mnt, old_mnt, old_mp.mp);
touch_mnt_namespace(current->nsproxy->mnt_ns);
/* A moved mount should not expire automatically */
list_del_init(&new_mnt->mnt_expire);
- put_mountpoint(root_mp);
unlock_mount_hash();
mnt_notify_add(root_mnt);
mnt_notify_add(new_mnt);
chroot_fs_refs(&root, &new);
error = 0;
out4:
- unlock_mount(old_mp);
- if (!error)
- mntput_no_expire(ex_parent);
+ unlock_mount(&old_mp);
out3:
path_put(&root);
out2:
@@ -4999,22 +4934,7 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
err = -EINVAL;
lock_mount_hash();
- /* Ensure that this isn't anything purely vfs internal. */
- if (!is_mounted(&mnt->mnt))
- goto out;
-
- /*
- * If this is an attached mount make sure it's located in the callers
- * mount namespace. If it's not don't let the caller interact with it.
- *
- * If this mount doesn't have a parent it's most often simply a
- * detached mount with an anonymous mount namespace. IOW, something
- * that's simply not attached yet. But there are apparently also users
- * that do change mount properties on the rootfs itself. That obviously
- * neither has a parent nor is it a detached mount so we cannot
- * unconditionally check for detached mounts.
- */
- if ((mnt_has_parent(mnt) || !is_anon_ns(mnt->mnt_ns)) && !check_mnt(mnt))
+ if (!anon_ns_root(mnt) && !check_mnt(mnt))
goto out;
/*
@@ -5261,16 +5181,12 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename,
kattr.kflags |= MOUNT_KATTR_RECURSE;
ret = wants_mount_setattr(uattr, usize, &kattr);
- if (ret < 0)
- return ret;
-
- if (ret) {
+ if (ret > 0) {
ret = do_mount_setattr(&file->f_path, &kattr);
- if (ret)
- return ret;
-
finish_mount_kattr(&kattr);
}
+ if (ret)
+ return ret;
}
fd = get_unused_fd_flags(flags & O_CLOEXEC);
@@ -5382,7 +5298,7 @@ static void statmount_mnt_basic(struct kstatmount *s)
s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;
s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt);
s->sm.mnt_propagation = mnt_to_propagation_flags(m);
- s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0;
+ s->sm.mnt_peer_group = m->mnt_group_id;
s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0;
}
@@ -6174,9 +6090,11 @@ static void __init init_mount_tree(void)
if (IS_ERR(mnt))
panic("Can't create rootfs");
- ns = alloc_mnt_ns(&init_user_ns, false);
+ ns = alloc_mnt_ns(&init_user_ns, true);
if (IS_ERR(ns))
panic("Can't allocate initial namespace");
+ ns->seq = atomic64_inc_return(&mnt_ns_seq);
+ ns->ns.inum = PROC_MNT_INIT_INO;
m = real_mount(mnt);
ns->root = m;
ns->nr_mounts = 1;
@@ -6186,7 +6104,6 @@ static void __init init_mount_tree(void)
root.mnt = mnt;
root.dentry = mnt->mnt_root;
- mnt->mnt_flags |= MNT_LOCKED;
set_fs_pwd(current->fs, &root);
set_fs_root(current->fs, &root);
@@ -6233,8 +6150,12 @@ void put_mnt_ns(struct mnt_namespace *ns)
{
if (!refcount_dec_and_test(&ns->ns.count))
return;
- drop_collected_mounts(&ns->root->mnt);
- free_mnt_ns(ns);
+ namespace_lock();
+ emptied_ns = ns;
+ lock_mount_hash();
+ umount_tree(ns->root, 0);
+ unlock_mount_hash();
+ namespace_unlock();
}
struct vfsmount *kern_mount(struct file_system_type *type)
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 0d1b6d35ff3b..18b3dc74c70e 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -78,7 +78,8 @@ static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_in
* [!] NOTE: This must be run in the same thread as ->issue_read() was called
* in as we access the readahead_control struct.
*/
-static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq)
+static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq,
+ struct readahead_control *ractl)
{
struct netfs_io_request *rreq = subreq->rreq;
size_t rsize = subreq->len;
@@ -86,7 +87,7 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq)
if (subreq->source == NETFS_DOWNLOAD_FROM_SERVER)
rsize = umin(rsize, rreq->io_streams[0].sreq_max_len);
- if (rreq->ractl) {
+ if (ractl) {
/* If we don't have sufficient folios in the rolling buffer,
* extract a folioq's worth from the readahead region at a time
* into the buffer. Note that this acquires a ref on each page
@@ -99,7 +100,7 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq)
while (rreq->submitted < subreq->start + rsize) {
ssize_t added;
- added = rolling_buffer_load_from_ra(&rreq->buffer, rreq->ractl,
+ added = rolling_buffer_load_from_ra(&rreq->buffer, ractl,
&put_batch);
if (added < 0)
return added;
@@ -211,7 +212,8 @@ static void netfs_issue_read(struct netfs_io_request *rreq,
* slicing up the region to be read according to available cache blocks and
* network rsize.
*/
-static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
+static void netfs_read_to_pagecache(struct netfs_io_request *rreq,
+ struct readahead_control *ractl)
{
struct netfs_inode *ictx = netfs_inode(rreq->inode);
unsigned long long start = rreq->start;
@@ -262,9 +264,9 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
if (ret < 0) {
subreq->error = ret;
/* Not queued - release both refs. */
- netfs_put_subrequest(subreq, false,
+ netfs_put_subrequest(subreq,
netfs_sreq_trace_put_cancel);
- netfs_put_subrequest(subreq, false,
+ netfs_put_subrequest(subreq,
netfs_sreq_trace_put_cancel);
break;
}
@@ -291,14 +293,14 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
break;
issue:
- slice = netfs_prepare_read_iterator(subreq);
+ slice = netfs_prepare_read_iterator(subreq, ractl);
if (slice < 0) {
ret = slice;
subreq->error = ret;
trace_netfs_sreq(subreq, netfs_sreq_trace_cancel);
/* Not queued - release both refs. */
- netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
- netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
+ netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel);
+ netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel);
break;
}
size -= slice;
@@ -312,7 +314,7 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
if (unlikely(size > 0)) {
smp_wmb(); /* Write lists before ALL_QUEUED. */
set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
- netfs_wake_read_collector(rreq);
+ netfs_wake_collector(rreq);
}
/* Defer error return as we may need to wait for outstanding I/O. */
@@ -359,18 +361,15 @@ void netfs_readahead(struct readahead_control *ractl)
netfs_rreq_expand(rreq, ractl);
- rreq->ractl = ractl;
rreq->submitted = rreq->start;
if (rolling_buffer_init(&rreq->buffer, rreq->debug_id, ITER_DEST) < 0)
goto cleanup_free;
- netfs_read_to_pagecache(rreq);
+ netfs_read_to_pagecache(rreq, ractl);
- netfs_put_request(rreq, true, netfs_rreq_trace_put_return);
- return;
+ return netfs_put_request(rreq, netfs_rreq_trace_put_return);
cleanup_free:
- netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
- return;
+ return netfs_put_request(rreq, netfs_rreq_trace_put_failed);
}
EXPORT_SYMBOL(netfs_readahead);
@@ -389,7 +388,6 @@ static int netfs_create_singular_buffer(struct netfs_io_request *rreq, struct fo
if (added < 0)
return added;
rreq->submitted = rreq->start + added;
- rreq->ractl = (struct readahead_control *)1UL;
return 0;
}
@@ -459,7 +457,7 @@ static int netfs_read_gaps(struct file *file, struct folio *folio)
iov_iter_bvec(&rreq->buffer.iter, ITER_DEST, bvec, i, rreq->len);
rreq->submitted = rreq->start + flen;
- netfs_read_to_pagecache(rreq);
+ netfs_read_to_pagecache(rreq, NULL);
if (sink)
folio_put(sink);
@@ -470,11 +468,11 @@ static int netfs_read_gaps(struct file *file, struct folio *folio)
folio_mark_uptodate(folio);
}
folio_unlock(folio);
- netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ netfs_put_request(rreq, netfs_rreq_trace_put_return);
return ret < 0 ? ret : 0;
discard:
- netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
+ netfs_put_request(rreq, netfs_rreq_trace_put_discard);
alloc_error:
folio_unlock(folio);
return ret;
@@ -528,13 +526,13 @@ int netfs_read_folio(struct file *file, struct folio *folio)
if (ret < 0)
goto discard;
- netfs_read_to_pagecache(rreq);
+ netfs_read_to_pagecache(rreq, NULL);
ret = netfs_wait_for_read(rreq);
- netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ netfs_put_request(rreq, netfs_rreq_trace_put_return);
return ret < 0 ? ret : 0;
discard:
- netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
+ netfs_put_request(rreq, netfs_rreq_trace_put_discard);
alloc_error:
folio_unlock(folio);
return ret;
@@ -685,11 +683,11 @@ retry:
if (ret < 0)
goto error_put;
- netfs_read_to_pagecache(rreq);
+ netfs_read_to_pagecache(rreq, NULL);
ret = netfs_wait_for_read(rreq);
if (ret < 0)
goto error;
- netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ netfs_put_request(rreq, netfs_rreq_trace_put_return);
have_folio:
ret = folio_wait_private_2_killable(folio);
@@ -701,7 +699,7 @@ have_folio_no_wait:
return 0;
error_put:
- netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
+ netfs_put_request(rreq, netfs_rreq_trace_put_failed);
error:
if (folio) {
folio_unlock(folio);
@@ -750,13 +748,13 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio,
if (ret < 0)
goto error_put;
- netfs_read_to_pagecache(rreq);
+ netfs_read_to_pagecache(rreq, NULL);
ret = netfs_wait_for_read(rreq);
- netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ netfs_put_request(rreq, netfs_rreq_trace_put_return);
return ret < 0 ? ret : 0;
error_put:
- netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
+ netfs_put_request(rreq, netfs_rreq_trace_put_discard);
error:
_leave(" = %d", ret);
return ret;
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index b4826360a411..f27ea5099a68 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -53,30 +53,40 @@ static struct folio *netfs_grab_folio_for_write(struct address_space *mapping,
* data written into the pagecache until we can find out from the server what
* the values actually are.
*/
-static void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode,
- loff_t i_size, loff_t pos, size_t copied)
+void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode,
+ loff_t pos, size_t copied)
{
+ loff_t i_size, end = pos + copied;
blkcnt_t add;
size_t gap;
+ if (end <= i_size_read(inode))
+ return;
+
if (ctx->ops->update_i_size) {
- ctx->ops->update_i_size(inode, pos);
+ ctx->ops->update_i_size(inode, end);
return;
}
- i_size_write(inode, pos);
+ spin_lock(&inode->i_lock);
+
+ i_size = i_size_read(inode);
+ if (end > i_size) {
+ i_size_write(inode, end);
#if IS_ENABLED(CONFIG_FSCACHE)
- fscache_update_cookie(ctx->cache, NULL, &pos);
+ fscache_update_cookie(ctx->cache, NULL, &end);
#endif
- gap = SECTOR_SIZE - (i_size & (SECTOR_SIZE - 1));
- if (copied > gap) {
- add = DIV_ROUND_UP(copied - gap, SECTOR_SIZE);
+ gap = SECTOR_SIZE - (i_size & (SECTOR_SIZE - 1));
+ if (copied > gap) {
+ add = DIV_ROUND_UP(copied - gap, SECTOR_SIZE);
- inode->i_blocks = min_t(blkcnt_t,
- DIV_ROUND_UP(pos, SECTOR_SIZE),
- inode->i_blocks + add);
+ inode->i_blocks = min_t(blkcnt_t,
+ DIV_ROUND_UP(end, SECTOR_SIZE),
+ inode->i_blocks + add);
+ }
}
+ spin_unlock(&inode->i_lock);
}
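
A worked instance of the i_blocks estimate above may help; with SECTOR_SIZE == 512 (the kernel's definition), the numbers below are illustrative only:

/* Worked example of the arithmetic above, SECTOR_SIZE == 512:
 *
 *   old i_size = 1000: 1000 & 511 = 488 bytes used in its last sector,
 *                      gap = 512 - 488 = 24 bytes still free there.
 *   copied     = 100:  100 > 24, so 76 bytes spill over,
 *                      add = DIV_ROUND_UP(76, 512) = 1 new sector.
 *
 * i_blocks grows by 1, clamped to DIV_ROUND_UP(end, 512) so the
 * estimate can never exceed what the new EOF could occupy.
 */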
/**
@@ -111,12 +121,11 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
struct folio *folio = NULL, *writethrough = NULL;
unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0;
ssize_t written = 0, ret, ret2;
- loff_t i_size, pos = iocb->ki_pos;
+ loff_t pos = iocb->ki_pos;
size_t max_chunk = mapping_max_folio_size(mapping);
bool maybe_trouble = false;
- if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) ||
- iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC))
+ if (unlikely(iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC))
) {
wbc_attach_fdatawrite_inode(&wbc, mapping->host);
@@ -345,10 +354,8 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
flush_dcache_folio(folio);
/* Update the inode size if we moved the EOF marker */
+ netfs_update_i_size(ctx, inode, pos, copied);
pos += copied;
- i_size = i_size_read(inode);
- if (pos > i_size)
- netfs_update_i_size(ctx, inode, i_size, pos, copied);
written += copied;
if (likely(!wreq)) {
@@ -386,7 +393,7 @@ out:
wbc_detach_inode(&wbc);
if (ret2 == -EIOCBQUEUED)
return ret2;
- if (ret == 0)
+ if (ret == 0 && ret2 < 0)
ret = ret2;
}
diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c
index 5e3f0aeb51f3..a05e13472baf 100644
--- a/fs/netfs/direct_read.c
+++ b/fs/netfs/direct_read.c
@@ -85,7 +85,7 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq)
if (rreq->netfs_ops->prepare_read) {
ret = rreq->netfs_ops->prepare_read(subreq);
if (ret < 0) {
- netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
+ netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel);
break;
}
}
@@ -103,19 +103,16 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq)
rreq->netfs_ops->issue_read(subreq);
if (test_bit(NETFS_RREQ_PAUSE, &rreq->flags))
- netfs_wait_for_pause(rreq);
+ netfs_wait_for_paused_read(rreq);
if (test_bit(NETFS_RREQ_FAILED, &rreq->flags))
break;
- if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) &&
- test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags))
- break;
cond_resched();
} while (size > 0);
if (unlikely(size > 0)) {
smp_wmb(); /* Write lists before ALL_QUEUED. */
set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
- netfs_wake_read_collector(rreq);
+ netfs_wake_collector(rreq);
}
return ret;
@@ -144,7 +141,7 @@ static ssize_t netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync)
ret = netfs_dispatch_unbuffered_reads(rreq);
if (!rreq->submitted) {
- netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit);
+ netfs_put_request(rreq, netfs_rreq_trace_put_no_submit);
inode_dio_end(rreq->inode);
ret = 0;
goto out;
@@ -188,7 +185,8 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i
rreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp,
iocb->ki_pos, orig_count,
- NETFS_DIO_READ);
+ iocb->ki_flags & IOCB_DIRECT ?
+ NETFS_DIO_READ : NETFS_UNBUFFERED_READ);
if (IS_ERR(rreq))
return PTR_ERR(rreq);
@@ -236,7 +234,7 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i
}
out:
- netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ netfs_put_request(rreq, netfs_rreq_trace_put_return);
if (ret > 0)
orig_count -= ret;
return ret;
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index 42ce53cc216e..a16660ab7f83 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -9,20 +9,6 @@
#include <linux/uio.h>
#include "internal.h"
-static void netfs_cleanup_dio_write(struct netfs_io_request *wreq)
-{
- struct inode *inode = wreq->inode;
- unsigned long long end = wreq->start + wreq->transferred;
-
- if (!wreq->error &&
- i_size_read(inode) < end) {
- if (wreq->netfs_ops->update_i_size)
- wreq->netfs_ops->update_i_size(inode, end);
- else
- i_size_write(inode, end);
- }
-}
-
/*
* Perform an unbuffered write where we may have to do an RMW operation on an
* encrypted file. This can also be used for direct I/O writes.
@@ -87,6 +73,8 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
}
__set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags);
+ if (async)
+ __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags);
/* Copy the data into the bounce buffer and encrypt it. */
// TODO
@@ -96,7 +84,6 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
if (async)
wreq->iocb = iocb;
wreq->len = iov_iter_count(&wreq->buffer.iter);
- wreq->cleanup = netfs_cleanup_dio_write;
ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len);
if (ret < 0) {
_debug("begin = %zd", ret);
@@ -104,20 +91,15 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
}
if (!async) {
- trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip);
- wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
- TASK_UNINTERRUPTIBLE);
- ret = wreq->error;
- if (ret == 0) {
- ret = wreq->transferred;
+ ret = netfs_wait_for_write(wreq);
+ if (ret > 0)
iocb->ki_pos += ret;
- }
} else {
ret = -EIOCBQUEUED;
}
out:
- netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ netfs_put_request(wreq, netfs_rreq_trace_put_return);
return ret;
}
EXPORT_SYMBOL(netfs_unbuffered_write_iter_locked);
diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c
index b1722a82c03d..e4308457633c 100644
--- a/fs/netfs/fscache_io.c
+++ b/fs/netfs/fscache_io.c
@@ -192,8 +192,7 @@ EXPORT_SYMBOL(__fscache_clear_page_bits);
/*
* Deal with the completion of writing the data to the cache.
*/
-static void fscache_wreq_done(void *priv, ssize_t transferred_or_error,
- bool was_async)
+static void fscache_wreq_done(void *priv, ssize_t transferred_or_error)
{
struct fscache_write_request *wreq = priv;
@@ -202,8 +201,7 @@ static void fscache_wreq_done(void *priv, ssize_t transferred_or_error,
wreq->set_bits);
if (wreq->term_func)
- wreq->term_func(wreq->term_func_priv, transferred_or_error,
- was_async);
+ wreq->term_func(wreq->term_func_priv, transferred_or_error);
fscache_end_operation(&wreq->cache_resources);
kfree(wreq);
}
@@ -255,14 +253,14 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,
return;
abandon_end:
- return fscache_wreq_done(wreq, ret, false);
+ return fscache_wreq_done(wreq, ret);
abandon_free:
kfree(wreq);
abandon:
if (using_pgpriv2)
fscache_clear_page_bits(mapping, start, len, cond);
if (term_func)
- term_func(term_func_priv, ret, false);
+ term_func(term_func_priv, ret);
}
EXPORT_SYMBOL(__fscache_write_to_cache);
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 1c4f953c3d68..d4f16fefd965 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -23,11 +23,17 @@
/*
* buffered_read.c
*/
-void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async);
+void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error);
int netfs_prefetch_for_write(struct file *file, struct folio *folio,
size_t offset, size_t len);
/*
+ * buffered_write.c
+ */
+void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode,
+ loff_t pos, size_t copied);
+
+/*
* main.c
*/
extern unsigned int netfs_debug;
@@ -62,6 +68,14 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {}
struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq,
enum netfs_folioq_trace trace);
void netfs_reset_iter(struct netfs_io_subrequest *subreq);
+void netfs_wake_collector(struct netfs_io_request *rreq);
+void netfs_subreq_clear_in_progress(struct netfs_io_subrequest *subreq);
+void netfs_wait_for_in_progress_stream(struct netfs_io_request *rreq,
+ struct netfs_io_stream *stream);
+ssize_t netfs_wait_for_read(struct netfs_io_request *rreq);
+ssize_t netfs_wait_for_write(struct netfs_io_request *rreq);
+void netfs_wait_for_paused_read(struct netfs_io_request *rreq);
+void netfs_wait_for_paused_write(struct netfs_io_request *rreq);
/*
* objects.c
@@ -71,9 +85,8 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
loff_t start, size_t len,
enum netfs_io_origin origin);
void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what);
-void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async);
-void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
- enum netfs_rreq_ref_trace what);
+void netfs_clear_subrequests(struct netfs_io_request *rreq);
+void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what);
struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq);
static inline void netfs_see_request(struct netfs_io_request *rreq,
@@ -92,11 +105,9 @@ static inline void netfs_see_subrequest(struct netfs_io_subrequest *subreq,
/*
* read_collect.c
*/
+bool netfs_read_collection(struct netfs_io_request *rreq);
void netfs_read_collection_worker(struct work_struct *work);
-void netfs_wake_read_collector(struct netfs_io_request *rreq);
-void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async);
-ssize_t netfs_wait_for_read(struct netfs_io_request *rreq);
-void netfs_wait_for_pause(struct netfs_io_request *rreq);
+void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error);
/*
* read_pgpriv2.c
@@ -176,8 +187,8 @@ static inline void netfs_stat_d(atomic_t *stat)
* write_collect.c
*/
int netfs_folio_written_back(struct folio *folio);
+bool netfs_write_collection(struct netfs_io_request *wreq);
void netfs_write_collection_worker(struct work_struct *work);
-void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async);
/*
* write_issue.c
@@ -198,8 +209,8 @@ struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len
int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
struct folio *folio, size_t copied, bool to_page_end,
struct folio **writethrough_cache);
-int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
- struct folio *writethrough_cache);
+ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+ struct folio *writethrough_cache);
int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len);
/*
@@ -255,6 +266,39 @@ static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr)
}
/*
+ * Clear and wake up a NETFS_RREQ_* flag bit on a request.
+ */
+static inline void netfs_wake_rreq_flag(struct netfs_io_request *rreq,
+ unsigned int rreq_flag,
+ enum netfs_rreq_trace trace)
+{
+ if (test_bit(rreq_flag, &rreq->flags)) {
+ clear_bit_unlock(rreq_flag, &rreq->flags);
+ smp_mb__after_atomic(); /* Set flag before task state */
+ trace_netfs_rreq(rreq, trace);
+ wake_up(&rreq->waitq);
+ }
+}
+
+/*
+ * Test the NETFS_RREQ_IN_PROGRESS flag, inserting an appropriate barrier.
+ */
+static inline bool netfs_check_rreq_in_progress(const struct netfs_io_request *rreq)
+{
+ /* Order read of flags before read of anything else, such as error. */
+ return test_bit_acquire(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+}
+
+/*
+ * Test the NETFS_SREQ_IN_PROGRESS flag, inserting an appropriate barrier.
+ */
+static inline bool netfs_check_subreq_in_progress(const struct netfs_io_subrequest *subreq)
+{
+ /* Order read of flags before read of anything else, such as error. */
+ return test_bit_acquire(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+}
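+
+netfs_wake_rreq_flag() and the two _acquire tests are the halves of one handshake: clear_bit_unlock() plus the barrier publish the flag change before the wake-up, and test_bit_acquire() orders the flag read before reading the results. A sketch of the waiter loop they pair with (the real ones live in misc.c in this patch):
+
+/* Sketch of the waiter side pairing with netfs_wake_rreq_flag():
+ * test_bit_acquire() guarantees rreq->error/->transferred are read
+ * only after IN_PROGRESS is seen clear.
+ */
+static void example_wait(struct netfs_io_request *rreq)
+{
+	DEFINE_WAIT(myself);
+
+	for (;;) {
+		prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);
+		if (!netfs_check_rreq_in_progress(rreq))
+			break;
+		schedule();
+	}
+	finish_wait(&rreq->waitq, &myself);
+	/* rreq->error and rreq->transferred are now stable. */
+}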
+
+/*
* fscache-cache.c
*/
#ifdef CONFIG_PROC_FS
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 70ecc8f5f210..73da6c9f5777 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -39,6 +39,7 @@ static const char *netfs_origins[nr__netfs_io_origin] = {
[NETFS_READ_GAPS] = "RG",
[NETFS_READ_SINGLE] = "R1",
[NETFS_READ_FOR_WRITE] = "RW",
+ [NETFS_UNBUFFERED_READ] = "UR",
[NETFS_DIO_READ] = "DR",
[NETFS_WRITEBACK] = "WB",
[NETFS_WRITEBACK_SINGLE] = "W1",
@@ -57,15 +58,15 @@ static int netfs_requests_seq_show(struct seq_file *m, void *v)
if (v == &netfs_io_requests) {
seq_puts(m,
- "REQUEST OR REF FL ERR OPS COVERAGE\n"
- "======== == === == ==== === =========\n"
+ "REQUEST OR REF FLAG ERR OPS COVERAGE\n"
+ "======== == === ==== ==== === =========\n"
);
return 0;
}
rreq = list_entry(v, struct netfs_io_request, proc_link);
seq_printf(m,
- "%08x %s %3d %2lx %4ld %3d @%04llx %llx/%llx",
+ "%08x %s %3d %4lx %4ld %3d @%04llx %llx/%llx",
rreq->debug_id,
netfs_origins[rreq->origin],
refcount_read(&rreq->ref),
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index 7099aa07737a..20748bcfbf59 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -313,3 +313,234 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp)
return true;
}
EXPORT_SYMBOL(netfs_release_folio);
+
+/*
+ * Wake the collection work item.
+ */
+void netfs_wake_collector(struct netfs_io_request *rreq)
+{
+ if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) &&
+ !test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) {
+ queue_work(system_unbound_wq, &rreq->work);
+ } else {
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wake_queue);
+ wake_up(&rreq->waitq);
+ }
+}
+
+/*
+ * Mark a subrequest as no longer being in progress and, if need be, wake the
+ * collector.
+ */
+void netfs_subreq_clear_in_progress(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct netfs_io_stream *stream = &rreq->io_streams[subreq->stream_nr];
+
+ clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+ smp_mb__after_atomic(); /* Clear IN_PROGRESS before task state */
+
+ /* If we are at the head of the queue, wake up the collector. */
+ if (list_is_first(&subreq->rreq_link, &stream->subrequests) ||
+ test_bit(NETFS_RREQ_RETRYING, &rreq->flags))
+ netfs_wake_collector(rreq);
+}
+
+/*
+ * Wait for all outstanding I/O in a stream to quiesce.
+ */
+void netfs_wait_for_in_progress_stream(struct netfs_io_request *rreq,
+ struct netfs_io_stream *stream)
+{
+ struct netfs_io_subrequest *subreq;
+ DEFINE_WAIT(myself);
+
+ list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
+ if (!netfs_check_subreq_in_progress(subreq))
+ continue;
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wait_quiesce);
+ for (;;) {
+ prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);
+
+ if (!netfs_check_subreq_in_progress(subreq))
+ break;
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_wait_for);
+ schedule();
+ }
+ }
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_waited_quiesce);
+ finish_wait(&rreq->waitq, &myself);
+}
+
+/*
+ * Perform collection in app thread if not offloaded to workqueue.
+ */
+static int netfs_collect_in_app(struct netfs_io_request *rreq,
+ bool (*collector)(struct netfs_io_request *rreq))
+{
+ bool need_collect = false, inactive = true, done = true;
+
+ if (!netfs_check_rreq_in_progress(rreq)) {
+ trace_netfs_rreq(rreq, netfs_rreq_trace_recollect);
+ return 1; /* Done */
+ }
+
+ for (int i = 0; i < NR_IO_STREAMS; i++) {
+ struct netfs_io_subrequest *subreq;
+ struct netfs_io_stream *stream = &rreq->io_streams[i];
+
+ if (!stream->active)
+ continue;
+ inactive = false;
+ trace_netfs_collect_stream(rreq, stream);
+ subreq = list_first_entry_or_null(&stream->subrequests,
+ struct netfs_io_subrequest,
+ rreq_link);
+ if (subreq &&
+ (!netfs_check_subreq_in_progress(subreq) ||
+ test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) {
+ need_collect = true;
+ break;
+ }
+ if (subreq || !test_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags))
+ done = false;
+ }
+
+ if (!need_collect && !inactive && !done)
+ return 0; /* Sleep */
+
+ __set_current_state(TASK_RUNNING);
+ if (collector(rreq)) {
+ /* Drop the ref from the NETFS_RREQ_IN_PROGRESS flag. */
+ netfs_put_request(rreq, netfs_rreq_trace_put_work_ip);
+ return 1; /* Done */
+ }
+
+ if (inactive) {
+ WARN(true, "Failed to collect inactive req R=%08x\n",
+ rreq->debug_id);
+ cond_resched();
+ }
+ return 2; /* Again */
+}
+
+/*
+ * Wait for a request to complete, successfully or otherwise.
+ */
+static ssize_t netfs_wait_for_in_progress(struct netfs_io_request *rreq,
+ bool (*collector)(struct netfs_io_request *rreq))
+{
+ DEFINE_WAIT(myself);
+ ssize_t ret;
+
+ for (;;) {
+ prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);
+
+ if (!test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) {
+ switch (netfs_collect_in_app(rreq, collector)) {
+ case 0:
+ break;
+ case 1:
+ goto all_collected;
+ case 2:
+ if (!netfs_check_rreq_in_progress(rreq))
+ break;
+ cond_resched();
+ continue;
+ }
+ }
+
+ if (!netfs_check_rreq_in_progress(rreq))
+ break;
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip);
+ schedule();
+ }
+
+all_collected:
+ trace_netfs_rreq(rreq, netfs_rreq_trace_waited_ip);
+ finish_wait(&rreq->waitq, &myself);
+
+ ret = rreq->error;
+ if (ret == 0) {
+ ret = rreq->transferred;
+ switch (rreq->origin) {
+ case NETFS_DIO_READ:
+ case NETFS_DIO_WRITE:
+ case NETFS_READ_SINGLE:
+ case NETFS_UNBUFFERED_READ:
+ case NETFS_UNBUFFERED_WRITE:
+ break;
+ default:
+ if (rreq->submitted < rreq->len) {
+ trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
+ ret = -EIO;
+ }
+ break;
+ }
+ }
+
+ return ret;
+}
+
+ssize_t netfs_wait_for_read(struct netfs_io_request *rreq)
+{
+ return netfs_wait_for_in_progress(rreq, netfs_read_collection);
+}
+
+ssize_t netfs_wait_for_write(struct netfs_io_request *rreq)
+{
+ return netfs_wait_for_in_progress(rreq, netfs_write_collection);
+}
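+
+Put together with netfs_wake_collector() above, the issuing side follows a fixed handshake; the sketch below is assembled from the converted call sites in this patch, not a new API:
+
+/* Sketch of the submit/collect handshake as used by the call sites
+ * in this patch: publish the subrequest lists, set ALL_QUEUED, poke
+ * the collector, then park in the matching wait helper.
+ */
+static ssize_t example_submit_and_wait(struct netfs_io_request *rreq)
+{
+	/* ...issue subrequests; each completion path ends in
+	 * netfs_subreq_clear_in_progress()... */
+
+	smp_wmb();	/* Write lists before ALL_QUEUED. */
+	set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
+	netfs_wake_collector(rreq);
+
+	/* Collection runs in this thread unless
+	 * NETFS_RREQ_OFFLOAD_COLLECTION sends it to the workqueue. */
+	return netfs_wait_for_read(rreq);
+}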
+
+/*
+ * Wait for a paused operation to unpause or complete in some manner.
+ */
+static void netfs_wait_for_pause(struct netfs_io_request *rreq,
+ bool (*collector)(struct netfs_io_request *rreq))
+{
+ DEFINE_WAIT(myself);
+
+ for (;;) {
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wait_pause);
+ prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);
+
+ if (!test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) {
+ switch (netfs_collect_in_app(rreq, collector)) {
+ case 0:
+ break;
+ case 1:
+ goto all_collected;
+ case 2:
+ if (!netfs_check_rreq_in_progress(rreq) ||
+ !test_bit(NETFS_RREQ_PAUSE, &rreq->flags))
+ break;
+ cond_resched();
+ continue;
+ }
+ }
+
+ if (!netfs_check_rreq_in_progress(rreq) ||
+ !test_bit(NETFS_RREQ_PAUSE, &rreq->flags))
+ break;
+
+ schedule();
+ }
+
+all_collected:
+ trace_netfs_rreq(rreq, netfs_rreq_trace_waited_pause);
+ finish_wait(&rreq->waitq, &myself);
+}
+
+void netfs_wait_for_paused_read(struct netfs_io_request *rreq)
+{
+ return netfs_wait_for_pause(rreq, netfs_read_collection);
+}
+
+void netfs_wait_for_paused_write(struct netfs_io_request *rreq)
+{
+ return netfs_wait_for_pause(rreq, netfs_write_collection);
+}
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index dc6b41ef18b0..e8c99738b5bb 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -10,6 +10,8 @@
#include <linux/delay.h>
#include "internal.h"
+static void netfs_free_request(struct work_struct *work);
+
/*
* Allocate an I/O request and initialise it.
*/
@@ -34,6 +36,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
}
memset(rreq, 0, kmem_cache_size(cache));
+ INIT_WORK(&rreq->cleanup_work, netfs_free_request);
rreq->start = start;
rreq->len = len;
rreq->origin = origin;
@@ -49,13 +52,14 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
INIT_LIST_HEAD(&rreq->io_streams[0].subrequests);
INIT_LIST_HEAD(&rreq->io_streams[1].subrequests);
init_waitqueue_head(&rreq->waitq);
- refcount_set(&rreq->ref, 1);
+ refcount_set(&rreq->ref, 2);
if (origin == NETFS_READAHEAD ||
origin == NETFS_READPAGE ||
origin == NETFS_READ_GAPS ||
origin == NETFS_READ_SINGLE ||
origin == NETFS_READ_FOR_WRITE ||
+ origin == NETFS_UNBUFFERED_READ ||
origin == NETFS_DIO_READ) {
INIT_WORK(&rreq->work, netfs_read_collection_worker);
rreq->io_streams[0].avail = true;
@@ -64,8 +68,6 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
}
__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
- if (file && file->f_flags & O_NONBLOCK)
- __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags);
if (rreq->netfs_ops->init_request) {
ret = rreq->netfs_ops->init_request(rreq, file);
if (ret < 0) {
@@ -75,7 +77,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
}
atomic_inc(&ctx->io_count);
- trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new);
+ trace_netfs_rreq_ref(rreq->debug_id, refcount_read(&rreq->ref), netfs_rreq_trace_new);
netfs_proc_add_rreq(rreq);
netfs_stat(&netfs_n_rh_rreq);
return rreq;
@@ -89,7 +91,7 @@ void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace
trace_netfs_rreq_ref(rreq->debug_id, r + 1, what);
}
-void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async)
+void netfs_clear_subrequests(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
struct netfs_io_stream *stream;
@@ -101,8 +103,7 @@ void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async)
subreq = list_first_entry(&stream->subrequests,
struct netfs_io_subrequest, rreq_link);
list_del(&subreq->rreq_link);
- netfs_put_subrequest(subreq, was_async,
- netfs_sreq_trace_put_clear);
+ netfs_put_subrequest(subreq, netfs_sreq_trace_put_clear);
}
}
}
@@ -118,13 +119,19 @@ static void netfs_free_request_rcu(struct rcu_head *rcu)
static void netfs_free_request(struct work_struct *work)
{
struct netfs_io_request *rreq =
- container_of(work, struct netfs_io_request, work);
+ container_of(work, struct netfs_io_request, cleanup_work);
struct netfs_inode *ictx = netfs_inode(rreq->inode);
unsigned int i;
trace_netfs_rreq(rreq, netfs_rreq_trace_free);
+
+ /* Cancel/flush the result collection worker. That does not carry a
+ * ref of its own, so we must wait for it somewhere.
+ */
+ cancel_work_sync(&rreq->work);
+
netfs_proc_del_rreq(rreq);
- netfs_clear_subrequests(rreq, false);
+ netfs_clear_subrequests(rreq);
if (rreq->netfs_ops->free_request)
rreq->netfs_ops->free_request(rreq);
if (rreq->cache_resources.ops)
@@ -145,8 +152,7 @@ static void netfs_free_request(struct work_struct *work)
call_rcu(&rreq->rcu, netfs_free_request_rcu);
}
-void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
- enum netfs_rreq_ref_trace what)
+void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what)
{
unsigned int debug_id;
bool dead;
@@ -156,15 +162,8 @@ void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
debug_id = rreq->debug_id;
dead = __refcount_dec_and_test(&rreq->ref, &r);
trace_netfs_rreq_ref(debug_id, r - 1, what);
- if (dead) {
- if (was_async) {
- rreq->work.func = netfs_free_request;
- if (!queue_work(system_unbound_wq, &rreq->work))
- WARN_ON(1);
- } else {
- netfs_free_request(&rreq->work);
- }
- }
+ if (dead)
+ WARN_ON(!queue_work(system_unbound_wq, &rreq->cleanup_work));
}
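
This hunk, the refcount_set(&rreq->ref, 2) change and the cleanup_work added to netfs_free_request() form one lifecycle; the summary below is inferred from the comments in this patch:

/* Request lifetime after this change (sketch):
 *
 *   netfs_alloc_request():  ref = 2 - one for the caller, one owned
 *       by the NETFS_RREQ_IN_PROGRESS flag.
 *
 *   whoever clears IN_PROGRESS (collector or in-app waiter) inherits
 *       the flag's ref and drops it via netfs_rreq_trace_put_work_ip.
 *
 *   the final put queues rreq->cleanup_work; netfs_free_request()
 *       then cancel_work_sync()s rreq->work - the collector holds no
 *       ref of its own, so this is where it is waited for - before
 *       freeing via RCU.
 */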
}
@@ -206,8 +205,7 @@ void netfs_get_subrequest(struct netfs_io_subrequest *subreq,
what);
}
-static void netfs_free_subrequest(struct netfs_io_subrequest *subreq,
- bool was_async)
+static void netfs_free_subrequest(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
@@ -216,10 +214,10 @@ static void netfs_free_subrequest(struct netfs_io_subrequest *subreq,
rreq->netfs_ops->free_subrequest(subreq);
mempool_free(subreq, rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool);
netfs_stat_d(&netfs_n_rh_sreq);
- netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq);
+ netfs_put_request(rreq, netfs_rreq_trace_put_subreq);
}
-void netfs_put_subrequest(struct netfs_io_subrequest *subreq, bool was_async,
+void netfs_put_subrequest(struct netfs_io_subrequest *subreq,
enum netfs_sreq_ref_trace what)
{
unsigned int debug_index = subreq->debug_index;
@@ -230,5 +228,5 @@ void netfs_put_subrequest(struct netfs_io_subrequest *subreq, bool was_async,
dead = __refcount_dec_and_test(&subreq->ref, &r);
trace_netfs_sreq_ref(debug_id, debug_index, r - 1, what);
if (dead)
- netfs_free_subrequest(subreq, was_async);
+ netfs_free_subrequest(subreq);
}
diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c
index 23c75755ad4e..3e804da1e1eb 100644
--- a/fs/netfs/read_collect.c
+++ b/fs/netfs/read_collect.c
@@ -83,14 +83,12 @@ static void netfs_unlock_read_folio(struct netfs_io_request *rreq,
}
just_unlock:
- if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
- if (folio->index == rreq->no_unlock_folio &&
- test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) {
- _debug("no unlock");
- } else {
- trace_netfs_folio(folio, netfs_folio_trace_read_unlock);
- folio_unlock(folio);
- }
+ if (folio->index == rreq->no_unlock_folio &&
+ test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) {
+ _debug("no unlock");
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_read_unlock);
+ folio_unlock(folio);
}
folioq_clear(folioq, slot);
@@ -220,7 +218,7 @@ reassess:
stream->collected_to = front->start;
}
- if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags))
+ if (netfs_check_subreq_in_progress(front))
notes |= HIT_PENDING;
smp_rmb(); /* Read counters after IN_PROGRESS flag. */
transferred = READ_ONCE(front->transferred);
@@ -280,9 +278,13 @@ reassess:
stream->need_retry = true;
notes |= NEED_RETRY | MADE_PROGRESS;
break;
+ } else if (test_bit(NETFS_RREQ_SHORT_TRANSFER, &rreq->flags)) {
+ notes |= MADE_PROGRESS;
} else {
if (!stream->failed)
- stream->transferred = stream->collected_to - rreq->start;
+ stream->transferred += transferred;
+ if (front->transferred < front->len)
+ set_bit(NETFS_RREQ_SHORT_TRANSFER, &rreq->flags);
notes |= MADE_PROGRESS;
}
@@ -291,13 +293,15 @@ reassess:
spin_lock(&rreq->lock);
remove = front;
- trace_netfs_sreq(front, netfs_sreq_trace_discard);
+ trace_netfs_sreq(front,
+ notes & ABANDON_SREQ ?
+ netfs_sreq_trace_abandoned : netfs_sreq_trace_consumed);
list_del_init(&front->rreq_link);
front = list_first_entry_or_null(&stream->subrequests,
struct netfs_io_subrequest, rreq_link);
stream->front = front;
spin_unlock(&rreq->lock);
- netfs_put_subrequest(remove, false,
+ netfs_put_subrequest(remove,
notes & ABANDON_SREQ ?
netfs_sreq_trace_put_abandon :
netfs_sreq_trace_put_done);
@@ -311,14 +315,8 @@ reassess:
if (notes & NEED_RETRY)
goto need_retry;
- if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) {
- trace_netfs_rreq(rreq, netfs_rreq_trace_unpause);
- clear_bit_unlock(NETFS_RREQ_PAUSE, &rreq->flags);
- smp_mb__after_atomic(); /* Set PAUSE before task state */
- wake_up(&rreq->waitq);
- }
-
if (notes & MADE_PROGRESS) {
+ netfs_wake_rreq_flag(rreq, NETFS_RREQ_PAUSE, netfs_rreq_trace_unpause);
//cond_resched();
goto reassess;
}
@@ -342,24 +340,10 @@ need_retry:
*/
static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
{
- struct netfs_io_subrequest *subreq;
- struct netfs_io_stream *stream = &rreq->io_streams[0];
unsigned int i;
- /* Collect unbuffered reads and direct reads, adding up the transfer
- * sizes until we find the first short or failed subrequest.
- */
- list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
- rreq->transferred += subreq->transferred;
-
- if (subreq->transferred < subreq->len ||
- test_bit(NETFS_SREQ_FAILED, &subreq->flags)) {
- rreq->error = subreq->error;
- break;
- }
- }
-
- if (rreq->origin == NETFS_DIO_READ) {
+ if (rreq->origin == NETFS_UNBUFFERED_READ ||
+ rreq->origin == NETFS_DIO_READ) {
for (i = 0; i < rreq->direct_bv_count; i++) {
flush_dcache_page(rreq->direct_bv[i].bv_page);
// TODO: cifs marks pages in the destination buffer
@@ -371,13 +355,16 @@ static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
if (rreq->iocb) {
rreq->iocb->ki_pos += rreq->transferred;
- if (rreq->iocb->ki_complete)
+ if (rreq->iocb->ki_complete) {
+ trace_netfs_rreq(rreq, netfs_rreq_trace_ki_complete);
rreq->iocb->ki_complete(
rreq->iocb, rreq->error ? rreq->error : rreq->transferred);
+ }
}
if (rreq->netfs_ops->done)
rreq->netfs_ops->done(rreq);
- if (rreq->origin == NETFS_DIO_READ)
+ if (rreq->origin == NETFS_UNBUFFERED_READ ||
+ rreq->origin == NETFS_DIO_READ)
inode_dio_end(rreq->inode);
}
@@ -396,9 +383,11 @@ static void netfs_rreq_assess_single(struct netfs_io_request *rreq)
if (rreq->iocb) {
rreq->iocb->ki_pos += rreq->transferred;
- if (rreq->iocb->ki_complete)
+ if (rreq->iocb->ki_complete) {
+ trace_netfs_rreq(rreq, netfs_rreq_trace_ki_complete);
rreq->iocb->ki_complete(
rreq->iocb, rreq->error ? rreq->error : rreq->transferred);
+ }
}
if (rreq->netfs_ops->done)
rreq->netfs_ops->done(rreq);
@@ -410,7 +399,7 @@ static void netfs_rreq_assess_single(struct netfs_io_request *rreq)
* Note that we're in normal kernel thread context at this point, possibly
* running on a workqueue.
*/
-static void netfs_read_collection(struct netfs_io_request *rreq)
+bool netfs_read_collection(struct netfs_io_request *rreq)
{
struct netfs_io_stream *stream = &rreq->io_streams[0];
@@ -420,11 +409,11 @@ static void netfs_read_collection(struct netfs_io_request *rreq)
* queue is empty.
*/
if (!test_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags))
- return;
+ return false;
smp_rmb(); /* Read ALL_QUEUED before subreq lists. */
if (!list_empty(&stream->subrequests))
- return;
+ return false;
/* Okay, declare that all I/O is complete. */
rreq->transferred = stream->transferred;
@@ -433,6 +422,7 @@ static void netfs_read_collection(struct netfs_io_request *rreq)
//netfs_rreq_is_still_valid(rreq);
switch (rreq->origin) {
+ case NETFS_UNBUFFERED_READ:
case NETFS_DIO_READ:
case NETFS_READ_GAPS:
netfs_rreq_assess_dio(rreq);
@@ -445,14 +435,15 @@ static void netfs_read_collection(struct netfs_io_request *rreq)
}
task_io_account_read(rreq->transferred);
- trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
- clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+ netfs_wake_rreq_flag(rreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip);
+ /* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */
trace_netfs_rreq(rreq, netfs_rreq_trace_done);
- netfs_clear_subrequests(rreq, false);
+ netfs_clear_subrequests(rreq);
netfs_unlock_abandoned_read_pages(rreq);
if (unlikely(rreq->copy_to_cache))
netfs_pgpriv2_end_copy_to_cache(rreq);
+ return true;
}
void netfs_read_collection_worker(struct work_struct *work)
@@ -460,26 +451,12 @@ void netfs_read_collection_worker(struct work_struct *work)
struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work);
netfs_see_request(rreq, netfs_rreq_trace_see_work);
- if (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags))
- netfs_read_collection(rreq);
- netfs_put_request(rreq, false, netfs_rreq_trace_put_work);
-}
-
-/*
- * Wake the collection work item.
- */
-void netfs_wake_read_collector(struct netfs_io_request *rreq)
-{
- if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) &&
- !test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) {
- if (!work_pending(&rreq->work)) {
- netfs_get_request(rreq, netfs_rreq_trace_get_work);
- if (!queue_work(system_unbound_wq, &rreq->work))
- netfs_put_request(rreq, true, netfs_rreq_trace_put_work_nq);
- }
- } else {
- trace_netfs_rreq(rreq, netfs_rreq_trace_wake_queue);
- wake_up(&rreq->waitq);
+ if (netfs_check_rreq_in_progress(rreq)) {
+ if (netfs_read_collection(rreq))
+ /* Drop the ref from the IN_PROGRESS flag. */
+ netfs_put_request(rreq, netfs_rreq_trace_put_work_ip);
+ else
+ netfs_see_request(rreq, netfs_rreq_trace_see_work_complete);
}
}
@@ -511,7 +488,7 @@ void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq)
list_is_first(&subreq->rreq_link, &stream->subrequests)
) {
__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
- netfs_wake_read_collector(rreq);
+ netfs_wake_collector(rreq);
}
}
EXPORT_SYMBOL(netfs_read_subreq_progress);
@@ -535,7 +512,6 @@ EXPORT_SYMBOL(netfs_read_subreq_progress);
void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
- struct netfs_io_stream *stream = &rreq->io_streams[0];
switch (subreq->source) {
case NETFS_READ_FROM_CACHE:
@@ -582,23 +558,15 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq)
}
trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
-
- clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
- smp_mb__after_atomic(); /* Clear IN_PROGRESS before task state */
-
- /* If we are at the head of the queue, wake up the collector. */
- if (list_is_first(&subreq->rreq_link, &stream->subrequests) ||
- test_bit(NETFS_RREQ_RETRYING, &rreq->flags))
- netfs_wake_read_collector(rreq);
-
- netfs_put_subrequest(subreq, true, netfs_sreq_trace_put_terminated);
+ netfs_subreq_clear_in_progress(subreq);
+ netfs_put_subrequest(subreq, netfs_sreq_trace_put_terminated);
}
EXPORT_SYMBOL(netfs_read_subreq_terminated);
/*
* Handle termination of a read from the cache.
*/
-void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async)
+void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error)
{
struct netfs_io_subrequest *subreq = priv;
@@ -613,94 +581,3 @@ void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool
}
netfs_read_subreq_terminated(subreq);
}
-
-/*
- * Wait for the read operation to complete, successfully or otherwise.
- */
-ssize_t netfs_wait_for_read(struct netfs_io_request *rreq)
-{
- struct netfs_io_subrequest *subreq;
- struct netfs_io_stream *stream = &rreq->io_streams[0];
- DEFINE_WAIT(myself);
- ssize_t ret;
-
- for (;;) {
- trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue);
- prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);
-
- subreq = list_first_entry_or_null(&stream->subrequests,
- struct netfs_io_subrequest, rreq_link);
- if (subreq &&
- (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) ||
- test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) {
- __set_current_state(TASK_RUNNING);
- netfs_read_collection(rreq);
- continue;
- }
-
- if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags))
- break;
-
- schedule();
- trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue);
- }
-
- finish_wait(&rreq->waitq, &myself);
-
- ret = rreq->error;
- if (ret == 0) {
- ret = rreq->transferred;
- switch (rreq->origin) {
- case NETFS_DIO_READ:
- case NETFS_READ_SINGLE:
- ret = rreq->transferred;
- break;
- default:
- if (rreq->submitted < rreq->len) {
- trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
- ret = -EIO;
- }
- break;
- }
- }
-
- return ret;
-}
-
-/*
- * Wait for a paused read operation to unpause or complete in some manner.
- */
-void netfs_wait_for_pause(struct netfs_io_request *rreq)
-{
- struct netfs_io_subrequest *subreq;
- struct netfs_io_stream *stream = &rreq->io_streams[0];
- DEFINE_WAIT(myself);
-
- trace_netfs_rreq(rreq, netfs_rreq_trace_wait_pause);
-
- for (;;) {
- trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue);
- prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);
-
- if (!test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) {
- subreq = list_first_entry_or_null(&stream->subrequests,
- struct netfs_io_subrequest, rreq_link);
- if (subreq &&
- (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) ||
- test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) {
- __set_current_state(TASK_RUNNING);
- netfs_read_collection(rreq);
- continue;
- }
- }
-
- if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags) ||
- !test_bit(NETFS_RREQ_PAUSE, &rreq->flags))
- break;
-
- schedule();
- trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue);
- }
-
- finish_wait(&rreq->waitq, &myself);
-}
diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c
index cf7727060215..8097bc069c1d 100644
--- a/fs/netfs/read_pgpriv2.c
+++ b/fs/netfs/read_pgpriv2.c
@@ -110,13 +110,15 @@ static struct netfs_io_request *netfs_pgpriv2_begin_copy_to_cache(
if (!creq->io_streams[1].avail)
goto cancel_put;
+ __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &creq->flags);
+ trace_netfs_copy2cache(rreq, creq);
trace_netfs_write(creq, netfs_write_trace_copy_to_cache);
netfs_stat(&netfs_n_wh_copy_to_cache);
rreq->copy_to_cache = creq;
return creq;
cancel_put:
- netfs_put_request(creq, false, netfs_rreq_trace_put_return);
+ netfs_put_request(creq, netfs_rreq_trace_put_return);
cancel:
rreq->copy_to_cache = ERR_PTR(-ENOBUFS);
clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags);
@@ -154,8 +156,11 @@ void netfs_pgpriv2_end_copy_to_cache(struct netfs_io_request *rreq)
netfs_issue_write(creq, &creq->io_streams[1]);
smp_wmb(); /* Write lists before ALL_QUEUED. */
set_bit(NETFS_RREQ_ALL_QUEUED, &creq->flags);
+ trace_netfs_rreq(rreq, netfs_rreq_trace_end_copy_to_cache);
+ if (list_empty_careful(&creq->io_streams[1].subrequests))
+ netfs_wake_collector(creq);
- netfs_put_request(creq, false, netfs_rreq_trace_put_return);
+ netfs_put_request(creq, netfs_rreq_trace_put_return);
creq->copy_to_cache = NULL;
}
diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c
index 0f294b26e08c..b99e84a8170a 100644
--- a/fs/netfs/read_retry.c
+++ b/fs/netfs/read_retry.c
@@ -173,7 +173,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
&stream->subrequests, rreq_link) {
trace_netfs_sreq(subreq, netfs_sreq_trace_superfluous);
list_del(&subreq->rreq_link);
- netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done);
+ netfs_put_subrequest(subreq, netfs_sreq_trace_put_done);
if (subreq == to)
break;
}
@@ -257,35 +257,15 @@ abandon:
*/
void netfs_retry_reads(struct netfs_io_request *rreq)
{
- struct netfs_io_subrequest *subreq;
struct netfs_io_stream *stream = &rreq->io_streams[0];
- DEFINE_WAIT(myself);
netfs_stat(&netfs_n_rh_retry_read_req);
- set_bit(NETFS_RREQ_RETRYING, &rreq->flags);
-
/* Wait for all outstanding I/O to quiesce before performing retries as
* we may need to renegotiate the I/O sizes.
*/
- list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
- if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags))
- continue;
-
- trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue);
- for (;;) {
- prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);
-
- if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags))
- break;
-
- trace_netfs_sreq(subreq, netfs_sreq_trace_wait_for);
- schedule();
- trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue);
- }
-
- finish_wait(&rreq->waitq, &myself);
- }
+ set_bit(NETFS_RREQ_RETRYING, &rreq->flags);
+ netfs_wait_for_in_progress_stream(rreq, stream);
clear_bit(NETFS_RREQ_RETRYING, &rreq->flags);
trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit);
diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c
index fea0ecdecc53..fa622a6cd56d 100644
--- a/fs/netfs/read_single.c
+++ b/fs/netfs/read_single.c
@@ -142,7 +142,7 @@ static int netfs_single_dispatch_read(struct netfs_io_request *rreq)
set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
return ret;
cancel:
- netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
+ netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel);
return ret;
}
@@ -185,11 +185,11 @@ ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_ite
netfs_single_dispatch_read(rreq);
ret = netfs_wait_for_read(rreq);
- netfs_put_request(rreq, true, netfs_rreq_trace_put_return);
+ netfs_put_request(rreq, netfs_rreq_trace_put_return);
return ret;
cleanup_free:
- netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
+ netfs_put_request(rreq, netfs_rreq_trace_put_failed);
return ret;
}
EXPORT_SYMBOL(netfs_read_single);
diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
index 3fca59e6475d..0f3a36852a4d 100644
--- a/fs/netfs/write_collect.c
+++ b/fs/netfs/write_collect.c
@@ -240,7 +240,7 @@ reassess_streams:
}
/* Stall if the front is still undergoing I/O. */
- if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags)) {
+ if (netfs_check_subreq_in_progress(front)) {
notes |= HIT_PENDING;
break;
}
@@ -280,7 +280,7 @@ reassess_streams:
struct netfs_io_subrequest, rreq_link);
stream->front = front;
spin_unlock(&wreq->lock);
- netfs_put_subrequest(remove, false,
+ netfs_put_subrequest(remove,
notes & SAW_FAILURE ?
netfs_sreq_trace_put_cancel :
netfs_sreq_trace_put_done);
@@ -321,18 +321,14 @@ reassess_streams:
if (notes & NEED_RETRY)
goto need_retry;
- if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
- trace_netfs_rreq(wreq, netfs_rreq_trace_unpause);
- clear_bit_unlock(NETFS_RREQ_PAUSE, &wreq->flags);
- smp_mb__after_atomic(); /* Set PAUSE before task state */
- wake_up(&wreq->waitq);
- }
- if (notes & NEED_REASSESS) {
+ if (notes & MADE_PROGRESS) {
+ netfs_wake_rreq_flag(wreq, NETFS_RREQ_PAUSE, netfs_rreq_trace_unpause);
//cond_resched();
goto reassess_streams;
}
- if (notes & MADE_PROGRESS) {
+
+ if (notes & NEED_REASSESS) {
//cond_resched();
goto reassess_streams;
}
@@ -356,30 +352,21 @@ need_retry:
/*
* Perform the collection of subrequests, folios and encryption buffers.
*/
-void netfs_write_collection_worker(struct work_struct *work)
+bool netfs_write_collection(struct netfs_io_request *wreq)
{
- struct netfs_io_request *wreq = container_of(work, struct netfs_io_request, work);
struct netfs_inode *ictx = netfs_inode(wreq->inode);
size_t transferred;
int s;
_enter("R=%x", wreq->debug_id);
- netfs_see_request(wreq, netfs_rreq_trace_see_work);
- if (!test_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags)) {
- netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
- return;
- }
-
netfs_collect_write_results(wreq);
/* We're done when the app thread has finished posting subreqs and all
* the queues in all the streams are empty.
*/
- if (!test_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags)) {
- netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
- return;
- }
+ if (!test_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags))
+ return false;
smp_rmb(); /* Read ALL_QUEUED before lists. */
transferred = LONG_MAX;
@@ -387,10 +374,8 @@ void netfs_write_collection_worker(struct work_struct *work)
struct netfs_io_stream *stream = &wreq->io_streams[s];
if (!stream->active)
continue;
- if (!list_empty(&stream->subrequests)) {
- netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
- return;
- }
+ if (!list_empty(&stream->subrequests))
+ return false;
if (stream->transferred < transferred)
transferred = stream->transferred;
}
@@ -408,8 +393,10 @@ void netfs_write_collection_worker(struct work_struct *work)
ictx->ops->invalidate_cache(wreq);
}
- if (wreq->cleanup)
- wreq->cleanup(wreq);
+ if ((wreq->origin == NETFS_UNBUFFERED_WRITE ||
+ wreq->origin == NETFS_DIO_WRITE) &&
+ !wreq->error)
+ netfs_update_i_size(ictx, &ictx->inode, wreq->start, wreq->transferred);
if (wreq->origin == NETFS_DIO_WRITE &&
wreq->mapping->nrpages) {
@@ -428,31 +415,35 @@ void netfs_write_collection_worker(struct work_struct *work)
inode_dio_end(wreq->inode);
_debug("finished");
- trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
- clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
+ netfs_wake_rreq_flag(wreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip);
+ /* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */
if (wreq->iocb) {
size_t written = min(wreq->transferred, wreq->len);
wreq->iocb->ki_pos += written;
- if (wreq->iocb->ki_complete)
+ if (wreq->iocb->ki_complete) {
+ trace_netfs_rreq(wreq, netfs_rreq_trace_ki_complete);
wreq->iocb->ki_complete(
wreq->iocb, wreq->error ? wreq->error : written);
+ }
wreq->iocb = VFS_PTR_POISON;
}
- netfs_clear_subrequests(wreq, false);
- netfs_put_request(wreq, false, netfs_rreq_trace_put_work_complete);
+ netfs_clear_subrequests(wreq);
+ return true;
}
-/*
- * Wake the collection work item.
- */
-void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async)
+void netfs_write_collection_worker(struct work_struct *work)
{
- if (!work_pending(&wreq->work)) {
- netfs_get_request(wreq, netfs_rreq_trace_get_work);
- if (!queue_work(system_unbound_wq, &wreq->work))
- netfs_put_request(wreq, was_async, netfs_rreq_trace_put_work_nq);
+ struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work);
+
+ netfs_see_request(rreq, netfs_rreq_trace_see_work);
+ if (netfs_check_rreq_in_progress(rreq)) {
+ if (netfs_write_collection(rreq))
+ /* Drop the ref from the IN_PROGRESS flag. */
+ netfs_put_request(rreq, netfs_rreq_trace_put_work_ip);
+ else
+ netfs_see_request(rreq, netfs_rreq_trace_see_work_complete);
}
}
@@ -460,7 +451,6 @@ void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async)
* netfs_write_subrequest_terminated - Note the termination of a write operation.
* @_op: The I/O request that has terminated.
* @transferred_or_error: The amount of data transferred or an error code.
- * @was_async: The termination was asynchronous
*
* This tells the library that a contributory write I/O operation has
* terminated, one way or another, and that it should collect the results.
@@ -470,21 +460,16 @@ void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async)
* negative error code. The library will look after reissuing I/O operations
* as appropriate and writing downloaded data to the cache.
*
- * If @was_async is true, the caller might be running in softirq or interrupt
- * context and we can't sleep.
- *
* When this is called, ownership of the subrequest is transferred back to the
* library, along with a ref.
*
* Note that %_op is a void* so that the function can be passed to
* kiocb::term_func without the need for a casting wrapper.
*/
-void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
- bool was_async)
+void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error)
{
struct netfs_io_subrequest *subreq = _op;
struct netfs_io_request *wreq = subreq->rreq;
- struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr];
_enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);
@@ -495,8 +480,6 @@ void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
case NETFS_WRITE_TO_CACHE:
netfs_stat(&netfs_n_wh_write_done);
break;
- case NETFS_INVALID_WRITE:
- break;
default:
BUG();
}
@@ -536,15 +519,7 @@ void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
}
trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
-
- clear_and_wake_up_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
-
- /* If we are at the head of the queue, wake up the collector,
- * transferring a ref to it if we were the ones to do so.
- */
- if (list_is_first(&subreq->rreq_link, &stream->subrequests))
- netfs_wake_write_collector(wreq, was_async);
-
- netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
+ netfs_subreq_clear_in_progress(subreq);
+ netfs_put_subrequest(subreq, netfs_sreq_trace_put_terminated);
}
EXPORT_SYMBOL(netfs_write_subrequest_terminated);
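
With the was_async parameter gone from the termination path, a filesystem's write completion callback passes only the subrequest pointer and the byte count or error. A minimal caller sketch (the myfs_* name is hypothetical; the signature matches the declaration above):

/* Hypothetical netfs client: completion of one write subrequest. */
static void myfs_write_done(void *priv, ssize_t transferred_or_error)
{
	struct netfs_io_subrequest *subreq = priv;

	/* Ownership of subreq (and a ref) returns to netfslib here. */
	netfs_write_subrequest_terminated(subreq, transferred_or_error);
}
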
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
index 77279fc5b5a7..50bee2c4130d 100644
--- a/fs/netfs/write_issue.c
+++ b/fs/netfs/write_issue.c
@@ -134,7 +134,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
return wreq;
nomem:
wreq->error = -ENOMEM;
- netfs_put_request(wreq, false, netfs_rreq_trace_put_failed);
+ netfs_put_request(wreq, netfs_rreq_trace_put_failed);
return ERR_PTR(-ENOMEM);
}
@@ -233,7 +233,7 @@ static void netfs_do_issue_write(struct netfs_io_stream *stream,
_enter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len);
if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
- return netfs_write_subrequest_terminated(subreq, subreq->error, false);
+ return netfs_write_subrequest_terminated(subreq, subreq->error);
trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
stream->issue_write(subreq);
@@ -542,7 +542,7 @@ static void netfs_end_issue_write(struct netfs_io_request *wreq)
}
if (needs_poke)
- netfs_wake_write_collector(wreq, false);
+ netfs_wake_collector(wreq);
}
/*
@@ -576,6 +576,7 @@ int netfs_writepages(struct address_space *mapping,
goto couldnt_start;
}
+ __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags);
trace_netfs_write(wreq, netfs_write_trace_writeback);
netfs_stat(&netfs_n_wh_writepages);
@@ -599,8 +600,9 @@ int netfs_writepages(struct address_space *mapping,
netfs_end_issue_write(wreq);
mutex_unlock(&ictx->wb_lock);
+ netfs_wake_collector(wreq);
- netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ netfs_put_request(wreq, netfs_rreq_trace_put_return);
_leave(" = %d", error);
return error;
@@ -673,11 +675,11 @@ int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_c
/*
* End a write operation used when writing through the pagecache.
*/
-int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
- struct folio *writethrough_cache)
+ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+ struct folio *writethrough_cache)
{
struct netfs_inode *ictx = netfs_inode(wreq->inode);
- int ret;
+ ssize_t ret;
_enter("R=%x", wreq->debug_id);
@@ -688,13 +690,11 @@ int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_contr
mutex_unlock(&ictx->wb_lock);
- if (wreq->iocb) {
+ if (wreq->iocb)
ret = -EIOCBQUEUED;
- } else {
- wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE);
- ret = wreq->error;
- }
- netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ else
+ ret = netfs_wait_for_write(wreq);
+ netfs_put_request(wreq, netfs_rreq_trace_put_return);
return ret;
}
@@ -722,10 +722,8 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t
start += part;
len -= part;
rolling_buffer_advance(&wreq->buffer, part);
- if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
- trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause);
- wait_event(wreq->waitq, !test_bit(NETFS_RREQ_PAUSE, &wreq->flags));
- }
+ if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags))
+ netfs_wait_for_paused_write(wreq);
if (test_bit(NETFS_RREQ_FAILED, &wreq->flags))
break;
}
@@ -885,7 +883,8 @@ int netfs_writeback_single(struct address_space *mapping,
goto couldnt_start;
}
- trace_netfs_write(wreq, netfs_write_trace_writeback);
+ __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags);
+ trace_netfs_write(wreq, netfs_write_trace_writeback_single);
netfs_stat(&netfs_n_wh_writepages);
if (__test_and_set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
@@ -914,8 +913,9 @@ stop:
set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
mutex_unlock(&ictx->wb_lock);
+ netfs_wake_collector(wreq);
- netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ netfs_put_request(wreq, netfs_rreq_trace_put_return);
_leave(" = %d", ret);
return ret;
diff --git a/fs/netfs/write_retry.c b/fs/netfs/write_retry.c
index 545d33079a77..fc9c3e0d34d8 100644
--- a/fs/netfs/write_retry.c
+++ b/fs/netfs/write_retry.c
@@ -39,9 +39,10 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
break;
if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
- struct iov_iter source = subreq->io_iter;
+ struct iov_iter source;
- iov_iter_revert(&source, subreq->len - source.count);
+ netfs_reset_iter(subreq);
+ source = subreq->io_iter;
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
netfs_reissue_write(stream, subreq, &source);
}
@@ -131,7 +132,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
&stream->subrequests, rreq_link) {
trace_netfs_sreq(subreq, netfs_sreq_trace_discard);
list_del(&subreq->rreq_link);
- netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done);
+ netfs_put_subrequest(subreq, netfs_sreq_trace_put_done);
if (subreq == to)
break;
}
@@ -145,14 +146,13 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
subreq = netfs_alloc_subrequest(wreq);
subreq->source = to->source;
subreq->start = start;
- subreq->debug_index = atomic_inc_return(&wreq->subreq_counter);
subreq->stream_nr = to->stream_nr;
subreq->retry_count = 1;
trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
refcount_read(&subreq->ref),
netfs_sreq_trace_new);
- netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_split);
list_add(&subreq->rreq_link, &to->rreq_link);
to = list_next_entry(to, rreq_link);
@@ -199,7 +199,6 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
*/
void netfs_retry_writes(struct netfs_io_request *wreq)
{
- struct netfs_io_subrequest *subreq;
struct netfs_io_stream *stream;
int s;
@@ -208,16 +207,13 @@ void netfs_retry_writes(struct netfs_io_request *wreq)
/* Wait for all outstanding I/O to quiesce before performing retries as
* we may need to renegotiate the I/O sizes.
*/
+ set_bit(NETFS_RREQ_RETRYING, &wreq->flags);
for (s = 0; s < NR_IO_STREAMS; s++) {
stream = &wreq->io_streams[s];
- if (!stream->active)
- continue;
-
- list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
- wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS,
- TASK_UNINTERRUPTIBLE);
- }
+ if (stream->active)
+ netfs_wait_for_in_progress_stream(wreq, stream);
}
+ clear_bit(NETFS_RREQ_RETRYING, &wreq->flags);
// TODO: Enc: Fetch changed partial pages
// TODO: Enc: Reencrypt content if needed.
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index d8d50a88de04..d526f5ba7887 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -141,24 +141,18 @@ static const struct rpc_pipe_ops bl_upcall_ops = {
.destroy_msg = bl_pipe_destroy_msg,
};
-static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
+static int nfs4blocklayout_register_sb(struct super_block *sb,
struct rpc_pipe *pipe)
{
- struct dentry *dir, *dentry;
+ struct dentry *dir;
+ int err;
dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
if (dir == NULL)
- return ERR_PTR(-ENOENT);
- dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
+ return -ENOENT;
+ err = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
dput(dir);
- return dentry;
-}
-
-static void nfs4blocklayout_unregister_sb(struct super_block *sb,
- struct rpc_pipe *pipe)
-{
- if (pipe->dentry)
- rpc_unlink(pipe->dentry);
+ return err;
}
static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
@@ -167,7 +161,6 @@ static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
struct super_block *sb = ptr;
struct net *net = sb->s_fs_info;
struct nfs_net *nn = net_generic(net, nfs_net_id);
- struct dentry *dentry;
int ret = 0;
if (!try_module_get(THIS_MODULE))
@@ -180,16 +173,10 @@ static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
switch (event) {
case RPC_PIPEFS_MOUNT:
- dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
- if (IS_ERR(dentry)) {
- ret = PTR_ERR(dentry);
- break;
- }
- nn->bl_device_pipe->dentry = dentry;
+ ret = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
break;
case RPC_PIPEFS_UMOUNT:
- if (nn->bl_device_pipe->dentry)
- nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
+ rpc_unlink(nn->bl_device_pipe);
break;
default:
ret = -ENOTSUPP;
@@ -203,18 +190,17 @@ static struct notifier_block nfs4blocklayout_block = {
.notifier_call = rpc_pipefs_event,
};
-static struct dentry *nfs4blocklayout_register_net(struct net *net,
- struct rpc_pipe *pipe)
+static int nfs4blocklayout_register_net(struct net *net, struct rpc_pipe *pipe)
{
struct super_block *pipefs_sb;
- struct dentry *dentry;
+ int ret;
pipefs_sb = rpc_get_sb_net(net);
if (!pipefs_sb)
- return NULL;
- dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
+ return 0;
+ ret = nfs4blocklayout_register_sb(pipefs_sb, pipe);
rpc_put_sb_net(net);
- return dentry;
+ return ret;
}
static void nfs4blocklayout_unregister_net(struct net *net,
@@ -224,7 +210,7 @@ static void nfs4blocklayout_unregister_net(struct net *net,
pipefs_sb = rpc_get_sb_net(net);
if (pipefs_sb) {
- nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
+ rpc_unlink(pipe);
rpc_put_sb_net(net);
}
}
@@ -232,20 +218,17 @@ static void nfs4blocklayout_unregister_net(struct net *net,
static int nfs4blocklayout_net_init(struct net *net)
{
struct nfs_net *nn = net_generic(net, nfs_net_id);
- struct dentry *dentry;
+ int err;
mutex_init(&nn->bl_mutex);
init_waitqueue_head(&nn->bl_wq);
nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
if (IS_ERR(nn->bl_device_pipe))
return PTR_ERR(nn->bl_device_pipe);
- dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
- if (IS_ERR(dentry)) {
+ err = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
+ if (unlikely(err))
rpc_destroy_pipe_data(nn->bl_device_pipe);
- return PTR_ERR(dentry);
- }
- nn->bl_device_pipe->dentry = dentry;
- return 0;
+ return err;
}
static void nfs4blocklayout_net_exit(struct net *net)
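
In the reworked rpc_pipefs API used above, rpc_mkpipe_dentry() stores the resulting dentry in the pipe itself and returns an int, and rpc_unlink() takes the struct rpc_pipe rather than a dentry (the callers above no longer test pipe->dentry first). A hedged sketch of the pattern, mirroring nfs4blocklayout_register_sb(); the myfs_* names and "mypipe" are hypothetical:

static int myfs_register_pipe(struct super_block *sb, struct rpc_pipe *pipe)
{
	struct dentry *dir;
	int err;

	dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
	if (!dir)
		return -ENOENT;
	err = rpc_mkpipe_dentry(dir, "mypipe", NULL, pipe);
	dput(dir);
	return err;
}

static void myfs_unregister_pipe(struct rpc_pipe *pipe)
{
	rpc_unlink(pipe);
}
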
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 6d63b958c4bb..cf35ad3f818a 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -180,7 +180,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
clp->cl_proto = cl_init->proto;
clp->cl_nconnect = cl_init->nconnect;
clp->cl_max_connect = cl_init->max_connect ? cl_init->max_connect : 1;
- clp->cl_net = get_net(cl_init->net);
+ clp->cl_net = get_net_track(cl_init->net, &clp->cl_ns_tracker, GFP_KERNEL);
#if IS_ENABLED(CONFIG_NFS_LOCALIO)
seqlock_init(&clp->cl_boot_lock);
@@ -250,7 +250,7 @@ void nfs_free_client(struct nfs_client *clp)
if (!IS_ERR(clp->cl_rpcclient))
rpc_shutdown_client(clp->cl_rpcclient);
- put_net(clp->cl_net);
+ put_net_track(clp->cl_net, &clp->cl_ns_tracker);
put_nfs_version(clp->cl_nfs_mod);
kfree(clp->cl_hostname);
kfree(clp->cl_acceptor);
@@ -439,7 +439,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
spin_unlock(&nn->nfs_client_lock);
new = rpc_ops->init_client(new, cl_init);
if (!IS_ERR(new))
- nfs_local_probe(new);
+ nfs_local_probe_async(new);
return new;
}
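
clp->cl_net is now held via the tracked netns refcount helpers, which tie each reference to a netns_tracker embedded in the owner so leaks can be attributed to their holder. The pattern in miniature (my_obj is hypothetical):

/* Tracked netns refcounting, as used for clp->cl_net above. */
struct my_obj {
	struct net	*net;
	netns_tracker	ns_tracker;
};

static void my_obj_init(struct my_obj *p, struct net *net)
{
	p->net = get_net_track(net, &p->ns_tracker, GFP_KERNEL);
}

static void my_obj_release(struct my_obj *p)
{
	put_net_track(p->net, &p->ns_tracker);
}
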
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 8bdbc4dca89c..10ef46e29b25 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -1021,13 +1021,6 @@ out:
nfs_inode_find_state_and_recover(inode, stateid);
}
-void nfs_remove_bad_delegation(struct inode *inode,
- const nfs4_stateid *stateid)
-{
- nfs_revoke_delegation(inode, stateid);
-}
-EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation);
-
void nfs_delegation_mark_returned(struct inode *inode,
const nfs4_stateid *stateid)
{
@@ -1070,6 +1063,24 @@ out_rcu_unlock:
}
/**
+ * nfs_remove_bad_delegation - handle delegations that are unusable
+ * @inode: inode to process
+ * @stateid: the delegation's stateid
+ *
+ * If the server ACK-ed our FREE_STATEID then clean
+ * up the delegation, else mark and keep the revoked state.
+ */
+void nfs_remove_bad_delegation(struct inode *inode,
+ const nfs4_stateid *stateid)
+{
+ if (stateid && stateid->type == NFS4_FREED_STATEID_TYPE)
+ nfs_delegation_mark_returned(inode, stateid);
+ else
+ nfs_revoke_delegation(inode, stateid);
+}
+EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation);
+
+/**
* nfs_expire_unused_delegation_types
* @clp: client to process
* @flags: delegation types to expire
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 033feeab8c34..86e36c630f09 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -207,24 +207,25 @@ nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe
EXPORT_SYMBOL_GPL(nfs_file_splice_read);
int
-nfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+nfs_file_mmap_prepare(struct vm_area_desc *desc)
{
+ struct file *file = desc->file;
struct inode *inode = file_inode(file);
int status;
dprintk("NFS: mmap(%pD2)\n", file);
- /* Note: generic_file_mmap() returns ENOSYS on nommu systems
+ /* Note: generic_file_mmap_prepare() returns ENOSYS on nommu systems
* so we call that before revalidating the mapping
*/
- status = generic_file_mmap(file, vma);
+ status = generic_file_mmap_prepare(desc);
if (!status) {
- vma->vm_ops = &nfs_file_vm_ops;
+ desc->vm_ops = &nfs_file_vm_ops;
status = nfs_revalidate_mapping(inode, file->f_mapping);
}
return status;
}
-EXPORT_SYMBOL_GPL(nfs_file_mmap);
+EXPORT_SYMBOL_GPL(nfs_file_mmap_prepare);
/*
* Flush any dirty pages for this process, and check for write errors.
@@ -342,12 +343,14 @@ static bool nfs_want_read_modify_write(struct file *file, struct folio *folio,
* If the writer ends up delaying the write, the writer needs to
* increment the page use counts until he is done with the page.
*/
-static int nfs_write_begin(struct file *file, struct address_space *mapping,
+static int nfs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len, struct folio **foliop,
void **fsdata)
{
fgf_t fgp = FGP_WRITEBEGIN;
struct folio *folio;
+ struct file *file = iocb->ki_filp;
int once_thru = 0;
int ret;
@@ -377,10 +380,12 @@ start:
return ret;
}
-static int nfs_write_end(struct file *file, struct address_space *mapping,
+static int nfs_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
{
+ struct file *file = iocb->ki_filp;
struct nfs_open_context *ctx = nfs_file_open_context(file);
unsigned offset = offset_in_folio(folio, pos);
int status;
@@ -899,7 +904,7 @@ const struct file_operations nfs_file_operations = {
.llseek = nfs_file_llseek,
.read_iter = nfs_file_read,
.write_iter = nfs_file_write,
- .mmap = nfs_file_mmap,
+ .mmap_prepare = nfs_file_mmap_prepare,
.open = nfs_file_open,
.flush = nfs_file_flush,
.release = nfs_file_release,
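
The ->mmap_prepare() hook operates on a struct vm_area_desc before the VMA exists: the handler validates the mapping and fills in desc->vm_ops instead of touching a live vma. A sketch of the conversion pattern used by nfs_file_mmap_prepare() above (the myfs_* names are hypothetical):

static int myfs_file_mmap_prepare(struct vm_area_desc *desc)
{
	int status;

	status = generic_file_mmap_prepare(desc);
	if (!status)
		desc->vm_ops = &myfs_file_vm_ops; /* set before the VMA is created */
	return status;
}
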
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index e6909cafab68..4bea008dbebd 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1105,6 +1105,7 @@ static void ff_layout_reset_read(struct nfs_pgio_header *hdr)
}
static int ff_layout_async_handle_error_v4(struct rpc_task *task,
+ u32 op_status,
struct nfs4_state *state,
struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
@@ -1115,32 +1116,42 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
- switch (task->tk_status) {
- case -NFS4ERR_BADSESSION:
- case -NFS4ERR_BADSLOT:
- case -NFS4ERR_BAD_HIGH_SLOT:
- case -NFS4ERR_DEADSESSION:
- case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
- case -NFS4ERR_SEQ_FALSE_RETRY:
- case -NFS4ERR_SEQ_MISORDERED:
+ switch (op_status) {
+ case NFS4_OK:
+ case NFS4ERR_NXIO:
+ break;
+ case NFSERR_PERM:
+ if (!task->tk_xprt)
+ break;
+ xprt_force_disconnect(task->tk_xprt);
+ goto out_retry;
+ case NFS4ERR_BADSESSION:
+ case NFS4ERR_BADSLOT:
+ case NFS4ERR_BAD_HIGH_SLOT:
+ case NFS4ERR_DEADSESSION:
+ case NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ case NFS4ERR_SEQ_FALSE_RETRY:
+ case NFS4ERR_SEQ_MISORDERED:
dprintk("%s ERROR %d, Reset session. Exchangeid "
"flags 0x%x\n", __func__, task->tk_status,
clp->cl_exchange_flags);
nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
- break;
- case -NFS4ERR_DELAY:
- case -NFS4ERR_GRACE:
+ goto out_retry;
+ case NFS4ERR_DELAY:
+ nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+ fallthrough;
+ case NFS4ERR_GRACE:
rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX);
- break;
- case -NFS4ERR_RETRY_UNCACHED_REP:
- break;
+ goto out_retry;
+ case NFS4ERR_RETRY_UNCACHED_REP:
+ goto out_retry;
/* Invalidate Layout errors */
- case -NFS4ERR_PNFS_NO_LAYOUT:
- case -ESTALE: /* mapped NFS4ERR_STALE */
- case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */
- case -EISDIR: /* mapped NFS4ERR_ISDIR */
- case -NFS4ERR_FHEXPIRED:
- case -NFS4ERR_WRONG_TYPE:
+ case NFS4ERR_PNFS_NO_LAYOUT:
+ case NFS4ERR_STALE:
+ case NFS4ERR_BADHANDLE:
+ case NFS4ERR_ISDIR:
+ case NFS4ERR_FHEXPIRED:
+ case NFS4ERR_WRONG_TYPE:
dprintk("%s Invalid layout error %d\n", __func__,
task->tk_status);
/*
@@ -1153,6 +1164,11 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
pnfs_destroy_layout(NFS_I(inode));
rpc_wake_up(&tbl->slot_tbl_waitq);
goto reset;
+ default:
+ break;
+ }
+
+ switch (task->tk_status) {
/* RPC connection errors */
case -ENETDOWN:
case -ENETUNREACH:
@@ -1172,27 +1188,56 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
nfs4_delete_deviceid(devid->ld, devid->nfs_client,
&devid->deviceid);
rpc_wake_up(&tbl->slot_tbl_waitq);
- fallthrough;
+ break;
default:
- if (ff_layout_avoid_mds_available_ds(lseg))
- return -NFS4ERR_RESET_TO_PNFS;
-reset:
- dprintk("%s Retry through MDS. Error %d\n", __func__,
- task->tk_status);
- return -NFS4ERR_RESET_TO_MDS;
+ break;
}
+
+ if (ff_layout_avoid_mds_available_ds(lseg))
+ return -NFS4ERR_RESET_TO_PNFS;
+reset:
+ dprintk("%s Retry through MDS. Error %d\n", __func__,
+ task->tk_status);
+ return -NFS4ERR_RESET_TO_MDS;
+
+out_retry:
task->tk_status = 0;
return -EAGAIN;
}
/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
static int ff_layout_async_handle_error_v3(struct rpc_task *task,
+ u32 op_status,
struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
u32 idx)
{
struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+ switch (op_status) {
+ case NFS_OK:
+ case NFSERR_NXIO:
+ break;
+ case NFSERR_PERM:
+ if (!task->tk_xprt)
+ break;
+ xprt_force_disconnect(task->tk_xprt);
+ goto out_retry;
+ case NFSERR_ACCES:
+ case NFSERR_BADHANDLE:
+ case NFSERR_FBIG:
+ case NFSERR_IO:
+ case NFSERR_NOSPC:
+ case NFSERR_ROFS:
+ case NFSERR_STALE:
+ goto out_reset_to_pnfs;
+ case NFSERR_JUKEBOX:
+ nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+ goto out_retry;
+ default:
+ break;
+ }
+
switch (task->tk_status) {
/* File access problems. Don't mark the device as unavailable */
case -EACCES:
@@ -1216,6 +1261,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
nfs4_delete_deviceid(devid->ld, devid->nfs_client,
&devid->deviceid);
}
+out_reset_to_pnfs:
/* FIXME: Need to prevent infinite looping here. */
return -NFS4ERR_RESET_TO_PNFS;
out_retry:
@@ -1226,6 +1272,7 @@ out_retry:
}
static int ff_layout_async_handle_error(struct rpc_task *task,
+ u32 op_status,
struct nfs4_state *state,
struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
@@ -1244,10 +1291,11 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
switch (vers) {
case 3:
- return ff_layout_async_handle_error_v3(task, clp, lseg, idx);
- case 4:
- return ff_layout_async_handle_error_v4(task, state, clp,
+ return ff_layout_async_handle_error_v3(task, op_status, clp,
lseg, idx);
+ case 4:
+ return ff_layout_async_handle_error_v4(task, op_status, state,
+ clp, lseg, idx);
default:
/* should never happen */
WARN_ON_ONCE(1);
@@ -1300,6 +1348,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
switch (status) {
case NFS4ERR_DELAY:
case NFS4ERR_GRACE:
+ case NFS4ERR_PERM:
break;
case NFS4ERR_NXIO:
ff_layout_mark_ds_unreachable(lseg, idx);
@@ -1332,7 +1381,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
trace_ff_layout_read_error(hdr, task->tk_status);
}
- err = ff_layout_async_handle_error(task, hdr->args.context->state,
+ err = ff_layout_async_handle_error(task, hdr->res.op_status,
+ hdr->args.context->state,
hdr->ds_clp, hdr->lseg,
hdr->pgio_mirror_idx);
@@ -1505,7 +1555,8 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
trace_ff_layout_write_error(hdr, task->tk_status);
}
- err = ff_layout_async_handle_error(task, hdr->args.context->state,
+ err = ff_layout_async_handle_error(task, hdr->res.op_status,
+ hdr->args.context->state,
hdr->ds_clp, hdr->lseg,
hdr->pgio_mirror_idx);
@@ -1554,8 +1605,9 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
trace_ff_layout_commit_error(data, task->tk_status);
}
- err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
- data->lseg, data->ds_commit_index);
+ err = ff_layout_async_handle_error(task, data->res.op_status,
+ NULL, data->ds_clp, data->lseg,
+ data->ds_commit_index);
trace_nfs4_pnfs_commit_ds(data, err);
switch (err) {
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 4a304cf17c4b..656d5c50bbce 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -400,7 +400,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
* keep ds_clp even if DS is local, so that if local IO cannot
* proceed somehow, we can fall back to NFS whenever we want.
*/
- nfs_local_probe(ds->ds_clp);
+ nfs_local_probe_async(ds->ds_clp);
max_payload =
nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient),
NULL);
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index e278a1ad1ca3..8b0785178731 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -367,6 +367,7 @@ void nfs_netfs_read_completion(struct nfs_pgio_header *hdr)
sreq = netfs->sreq;
if (test_bit(NFS_IOHDR_EOF, &hdr->flags) &&
+ sreq->rreq->origin != NETFS_UNBUFFERED_READ &&
sreq->rreq->origin != NETFS_DIO_READ)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &sreq->flags);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 119e447758b9..a2fa6bc4d74e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -557,6 +557,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
set_nlink(inode, fattr->nlink);
else if (fattr_supported & NFS_ATTR_FATTR_NLINK)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK);
+ else
+ set_nlink(inode, 1);
if (fattr->valid & NFS_ATTR_FATTR_OWNER)
inode->i_uid = fattr->uid;
else if (fattr_supported & NFS_ATTR_FATTR_OWNER)
@@ -633,6 +635,34 @@ nfs_fattr_fixup_delegated(struct inode *inode, struct nfs_fattr *fattr)
}
}
+static void nfs_set_timestamps_to_ts(struct inode *inode, struct iattr *attr)
+{
+ unsigned int cache_flags = 0;
+
+ if (attr->ia_valid & ATTR_MTIME_SET) {
+ struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 mtime = inode_get_mtime(inode);
+ struct timespec64 now;
+ int updated = 0;
+
+ now = inode_set_ctime_current(inode);
+ if (!timespec64_equal(&now, &ctime))
+ updated |= S_CTIME;
+
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
+ if (!timespec64_equal(&now, &mtime))
+ updated |= S_MTIME;
+
+ inode_maybe_inc_iversion(inode, updated);
+ cache_flags |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME;
+ }
+ if (attr->ia_valid & ATTR_ATIME_SET) {
+ inode_set_atime_to_ts(inode, attr->ia_atime);
+ cache_flags |= NFS_INO_INVALID_ATIME;
+ }
+ NFS_I(inode)->cache_validity &= ~cache_flags;
+}
+
static void nfs_update_timestamps(struct inode *inode, unsigned int ia_valid)
{
enum file_time_flags time_flags = 0;
@@ -701,14 +731,27 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (nfs_have_delegated_mtime(inode) && attr->ia_valid & ATTR_MTIME) {
spin_lock(&inode->i_lock);
- nfs_update_timestamps(inode, attr->ia_valid);
+ if (attr->ia_valid & ATTR_MTIME_SET) {
+ nfs_set_timestamps_to_ts(inode, attr);
+ attr->ia_valid &= ~(ATTR_MTIME|ATTR_MTIME_SET|
+ ATTR_ATIME|ATTR_ATIME_SET);
+ } else {
+ nfs_update_timestamps(inode, attr->ia_valid);
+ attr->ia_valid &= ~(ATTR_MTIME|ATTR_ATIME);
+ }
spin_unlock(&inode->i_lock);
- attr->ia_valid &= ~(ATTR_MTIME | ATTR_ATIME);
} else if (nfs_have_delegated_atime(inode) &&
attr->ia_valid & ATTR_ATIME &&
!(attr->ia_valid & ATTR_MTIME)) {
- nfs_update_delegated_atime(inode);
- attr->ia_valid &= ~ATTR_ATIME;
+ if (attr->ia_valid & ATTR_ATIME_SET) {
+ spin_lock(&inode->i_lock);
+ nfs_set_timestamps_to_ts(inode, attr);
+ spin_unlock(&inode->i_lock);
+ attr->ia_valid &= ~(ATTR_ATIME|ATTR_ATIME_SET);
+ } else {
+ nfs_update_delegated_atime(inode);
+ attr->ia_valid &= ~ATTR_ATIME;
+ }
}
/* Optimization: if the end result is no change, don't RPC */
@@ -2546,15 +2589,26 @@ EXPORT_SYMBOL_GPL(nfs_net_id);
static int nfs_net_init(struct net *net)
{
struct nfs_net *nn = net_generic(net, nfs_net_id);
+ int err;
nfs_clients_init(net);
if (!rpc_proc_register(net, &nn->rpcstats)) {
- nfs_clients_exit(net);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto err_proc_rpc;
}
- return nfs_fs_proc_net_init(net);
+ err = nfs_fs_proc_net_init(net);
+ if (err)
+ goto err_proc_nfs;
+
+ return 0;
+
+err_proc_nfs:
+ rpc_proc_unregister(net, "nfs");
+err_proc_rpc:
+ nfs_clients_exit(net);
+ return err;
}
static void nfs_net_exit(struct net *net)
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 6655e5f32ec6..26551ff09a52 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -432,7 +432,7 @@ loff_t nfs_file_llseek(struct file *, loff_t, int);
ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
ssize_t nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe,
size_t len, unsigned int flags);
-int nfs_file_mmap(struct file *, struct vm_area_struct *);
+int nfs_file_mmap_prepare(struct vm_area_desc *);
ssize_t nfs_file_write(struct kiocb *, struct iov_iter *);
int nfs_file_release(struct inode *, struct file *);
int nfs_lock(struct file *, int, struct file_lock *);
@@ -455,7 +455,6 @@ extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
#if IS_ENABLED(CONFIG_NFS_LOCALIO)
/* localio.c */
-extern void nfs_local_probe(struct nfs_client *);
extern void nfs_local_probe_async(struct nfs_client *);
extern void nfs_local_probe_async_work(struct work_struct *);
extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *,
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index 4ec952f9f47d..510d0a16cfe9 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -171,7 +171,7 @@ static bool nfs_server_uuid_is_local(struct nfs_client *clp)
* - called after alloc_client and init_client (so cl_rpcclient exists)
* - this function is idempotent, it can be called for old or new clients
*/
-void nfs_local_probe(struct nfs_client *clp)
+static void nfs_local_probe(struct nfs_client *clp)
{
/* Disallow localio if disabled via sysfs or AUTH_SYS isn't used */
if (!localio_enabled ||
@@ -191,14 +191,16 @@ void nfs_local_probe(struct nfs_client *clp)
nfs_localio_enable_client(clp);
nfs_uuid_end(&clp->cl_uuid);
}
-EXPORT_SYMBOL_GPL(nfs_local_probe);
void nfs_local_probe_async_work(struct work_struct *work)
{
struct nfs_client *clp =
container_of(work, struct nfs_client, cl_local_probe_work);
+ if (!refcount_inc_not_zero(&clp->cl_count))
+ return;
nfs_local_probe(clp);
+ nfs_put_client(clp);
}
void nfs_local_probe_async(struct nfs_client *clp)
@@ -207,14 +209,16 @@ void nfs_local_probe_async(struct nfs_client *clp)
}
EXPORT_SYMBOL_GPL(nfs_local_probe_async);
-static inline struct nfsd_file *nfs_local_file_get(struct nfsd_file *nf)
+static inline void nfs_local_file_put(struct nfsd_file *localio)
{
- return nfs_to->nfsd_file_get(nf);
-}
+ /* nfs_to_nfsd_file_put_local() expects an __rcu pointer
+ * but we have a __kernel pointer. It is always safe
+ * to cast a __kernel pointer to an __rcu pointer
+ * because the cast only weakens what is known about the pointer.
+ */
+ struct nfsd_file __rcu *nf = (struct nfsd_file __rcu *)localio;
-static inline void nfs_local_file_put(struct nfsd_file *nf)
-{
- nfs_to->nfsd_file_put(nf);
+ nfs_to_nfsd_file_put_local(&nf);
}
/*
@@ -226,12 +230,13 @@ static inline void nfs_local_file_put(struct nfsd_file *nf)
static struct nfsd_file *
__nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
struct nfs_fh *fh, struct nfs_file_localio *nfl,
+ struct nfsd_file __rcu **pnf,
const fmode_t mode)
{
struct nfsd_file *localio;
localio = nfs_open_local_fh(&clp->cl_uuid, clp->cl_rpcclient,
- cred, fh, nfl, mode);
+ cred, fh, nfl, pnf, mode);
if (IS_ERR(localio)) {
int status = PTR_ERR(localio);
trace_nfs_local_open_fh(fh, mode, status);
@@ -258,7 +263,7 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
struct nfs_fh *fh, struct nfs_file_localio *nfl,
const fmode_t mode)
{
- struct nfsd_file *nf, *new, __rcu **pnf;
+ struct nfsd_file *nf, __rcu **pnf;
if (!nfs_server_is_local(clp))
return NULL;
@@ -270,29 +275,9 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
else
pnf = &nfl->ro_file;
- new = NULL;
- rcu_read_lock();
- nf = rcu_dereference(*pnf);
- if (!nf) {
- rcu_read_unlock();
- new = __nfs_local_open_fh(clp, cred, fh, nfl, mode);
- if (IS_ERR(new))
- return NULL;
- rcu_read_lock();
- /* try to swap in the pointer */
- spin_lock(&clp->cl_uuid.lock);
- nf = rcu_dereference_protected(*pnf, 1);
- if (!nf) {
- nf = new;
- new = NULL;
- rcu_assign_pointer(*pnf, nf);
- }
- spin_unlock(&clp->cl_uuid.lock);
- }
- nf = nfs_local_file_get(nf);
- rcu_read_unlock();
- if (new)
- nfs_to_nfsd_file_put_local(new);
+ nf = __nfs_local_open_fh(clp, cred, fh, nfl, pnf, mode);
+ if (IS_ERR(nf))
+ return NULL;
return nf;
}
EXPORT_SYMBOL_GPL(nfs_local_open_fh);
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index 0282d93c8bcc..aafd15a4afce 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -21,6 +21,7 @@ int nfs42_proc_allocate(struct file *, loff_t, loff_t);
ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t,
struct nl4_server *, nfs4_stateid *, bool);
int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
+int nfs42_proc_zero_range(struct file *, loff_t, loff_t);
loff_t nfs42_proc_llseek(struct file *, loff_t, int);
int nfs42_proc_layoutstats_generic(struct nfs_server *,
struct nfs42_layoutstat_data *);
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 5cf52ece96ac..01c01f45358b 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -146,7 +146,8 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
err = nfs42_proc_fallocate(&msg, filep, offset, len);
if (err == -EOPNOTSUPP)
- NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE;
+ NFS_SERVER(inode)->caps &= ~(NFS_CAP_ALLOCATE |
+ NFS_CAP_ZERO_RANGE);
inode_unlock(inode);
return err;
@@ -169,7 +170,31 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
if (err == 0)
truncate_pagecache_range(inode, offset, (offset + len) -1);
if (err == -EOPNOTSUPP)
- NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;
+ NFS_SERVER(inode)->caps &= ~(NFS_CAP_DEALLOCATE |
+ NFS_CAP_ZERO_RANGE);
+
+ inode_unlock(inode);
+ return err;
+}
+
+int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ZERO_RANGE],
+ };
+ struct inode *inode = file_inode(filep);
+ int err;
+
+ if (!nfs_server_capable(inode, NFS_CAP_ZERO_RANGE))
+ return -EOPNOTSUPP;
+
+ inode_lock(inode);
+
+ err = nfs42_proc_fallocate(&msg, filep, offset, len);
+ if (err == 0)
+ truncate_pagecache_range(inode, offset, (offset + len) -1);
+ if (err == -EOPNOTSUPP)
+ NFS_SERVER(inode)->caps &= ~NFS_CAP_ZERO_RANGE;
inode_unlock(inode);
return err;
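
Userspace reaches this new operation through fallocate(2) with FALLOC_FL_ZERO_RANGE, which nfs42_fallocate() (later in this diff) routes to nfs42_proc_zero_range(). A minimal caller, assuming an NFSv4.2 mount:

/* Zero a byte range in place; fails with EOPNOTSUPP (and the client
 * clears NFS_CAP_ZERO_RANGE) if the server lacks support. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

static int zero_range(int fd, off_t offset, off_t len)
{
	return fallocate(fd, FALLOC_FL_ZERO_RANGE, offset, len);
}
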
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index b1b663468249..4cc915d5741d 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -174,6 +174,18 @@
decode_putfh_maxsz + \
decode_deallocate_maxsz + \
decode_getattr_maxsz)
+#define NFS4_enc_zero_range_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_deallocate_maxsz + \
+ encode_allocate_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_zero_range_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_deallocate_maxsz + \
+ decode_allocate_maxsz + \
+ decode_getattr_maxsz)
#define NFS4_enc_read_plus_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
@@ -649,6 +661,27 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req,
}
/*
+ * Encode ZERO_RANGE request
+ */
+static void nfs4_xdr_enc_zero_range(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_falloc_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->falloc_fh, &hdr);
+ encode_deallocate(xdr, args, &hdr);
+ encode_allocate(xdr, args, &hdr);
+ encode_getfattr(xdr, args->falloc_bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
* Encode READ_PLUS request
*/
static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req,
@@ -1511,6 +1544,37 @@ out:
}
/*
+ * Decode ZERO_RANGE request
+ */
+static int nfs4_xdr_dec_zero_range(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs42_falloc_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_deallocate(xdr, res);
+ if (status)
+ goto out;
+ status = decode_allocate(xdr, res);
+ if (status)
+ goto out;
+ decode_getfattr(xdr, res->falloc_fattr, res->falloc_server);
+out:
+ return status;
+}
+
+/*
* Decode READ_PLUS request
*/
static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 7d383d29a995..d3ca91f60fc1 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -67,8 +67,7 @@ struct nfs4_minor_version_ops {
void (*free_lock_state)(struct nfs_server *,
struct nfs4_lock_state *);
int (*test_and_free_expired)(struct nfs_server *,
- const nfs4_stateid *,
- const struct cred *);
+ nfs4_stateid *, const struct cred *);
struct nfs_seqid *
(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
void (*session_trunk)(struct rpc_clnt *clnt,
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 1cd9652f3c28..5c749b6117bb 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -225,8 +225,14 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t
if (!S_ISREG(inode->i_mode))
return -EOPNOTSUPP;
- if ((mode != 0) && (mode != (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)))
+ switch (mode) {
+ case 0:
+ case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
+ case FALLOC_FL_ZERO_RANGE:
+ break;
+ default:
return -EOPNOTSUPP;
+ }
ret = inode_newsize_ok(inode, offset + len);
if (ret < 0)
@@ -234,6 +240,8 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t
if (mode & FALLOC_FL_PUNCH_HOLE)
return nfs42_proc_deallocate(filep, offset, len);
+ else if (mode & FALLOC_FL_ZERO_RANGE)
+ return nfs42_proc_zero_range(filep, offset, len);
return nfs42_proc_allocate(filep, offset, len);
}
@@ -448,7 +456,7 @@ static int nfs4_setlease(struct file *file, int arg, struct file_lease **lease,
const struct file_operations nfs4_file_operations = {
.read_iter = nfs_file_read,
.write_iter = nfs_file_write,
- .mmap = nfs_file_mmap,
+ .mmap_prepare = nfs_file_mmap_prepare,
.open = nfs4_file_open,
.flush = nfs4_file_flush,
.release = nfs_file_release,
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 25a7c771cfd8..00932500fce4 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -424,26 +424,16 @@ static void nfs_idmap_pipe_destroy(struct dentry *dir,
struct rpc_pipe_dir_object *pdo)
{
struct idmap *idmap = pdo->pdo_data;
- struct rpc_pipe *pipe = idmap->idmap_pipe;
- if (pipe->dentry) {
- rpc_unlink(pipe->dentry);
- pipe->dentry = NULL;
- }
+ rpc_unlink(idmap->idmap_pipe);
}
static int nfs_idmap_pipe_create(struct dentry *dir,
struct rpc_pipe_dir_object *pdo)
{
struct idmap *idmap = pdo->pdo_data;
- struct rpc_pipe *pipe = idmap->idmap_pipe;
- struct dentry *dentry;
- dentry = rpc_mkpipe_dentry(dir, "idmap", idmap, pipe);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
- pipe->dentry = dentry;
- return 0;
+ return rpc_mkpipe_dentry(dir, "idmap", idmap, idmap->idmap_pipe);
}
static const struct rpc_pipe_dir_object_ops nfs_idmap_pipe_dir_object_ops = {
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b1d2122bd5a7..341740fa293d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -105,7 +105,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
bool is_privileged);
static int nfs41_test_stateid(struct nfs_server *, const nfs4_stateid *,
const struct cred *);
-static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *,
+static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *,
const struct cred *, bool);
#endif
@@ -325,14 +325,14 @@ static void nfs4_bitmap_copy_adjust(__u32 *dst, const __u32 *src,
if (nfs_have_delegated_mtime(inode)) {
if (!(cache_validity & NFS_INO_INVALID_ATIME))
- dst[1] &= ~FATTR4_WORD1_TIME_ACCESS;
+ dst[1] &= ~(FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET);
if (!(cache_validity & NFS_INO_INVALID_MTIME))
- dst[1] &= ~FATTR4_WORD1_TIME_MODIFY;
+ dst[1] &= ~(FATTR4_WORD1_TIME_MODIFY|FATTR4_WORD1_TIME_MODIFY_SET);
if (!(cache_validity & NFS_INO_INVALID_CTIME))
- dst[1] &= ~FATTR4_WORD1_TIME_METADATA;
+ dst[1] &= ~(FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY_SET);
} else if (nfs_have_delegated_atime(inode)) {
if (!(cache_validity & NFS_INO_INVALID_ATIME))
- dst[1] &= ~FATTR4_WORD1_TIME_ACCESS;
+ dst[1] &= ~(FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET);
}
}
@@ -2903,16 +2903,14 @@ static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
}
static int nfs40_test_and_free_expired_stateid(struct nfs_server *server,
- const nfs4_stateid *stateid,
- const struct cred *cred)
+ nfs4_stateid *stateid, const struct cred *cred)
{
return -NFS4ERR_BAD_STATEID;
}
#if defined(CONFIG_NFS_V4_1)
static int nfs41_test_and_free_expired_stateid(struct nfs_server *server,
- const nfs4_stateid *stateid,
- const struct cred *cred)
+ nfs4_stateid *stateid, const struct cred *cred)
{
int status;
@@ -2921,6 +2919,7 @@ static int nfs41_test_and_free_expired_stateid(struct nfs_server *server,
break;
case NFS4_INVALID_STATEID_TYPE:
case NFS4_SPECIAL_STATEID_TYPE:
+ case NFS4_FREED_STATEID_TYPE:
return -NFS4ERR_BAD_STATEID;
case NFS4_REVOKED_STATEID_TYPE:
goto out_free;
@@ -3976,8 +3975,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
FATTR4_WORD0_CASE_INSENSITIVE |
FATTR4_WORD0_CASE_PRESERVING;
if (minorversion)
- bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT |
- FATTR4_WORD2_OPEN_ARGUMENTS;
+ bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT;
+ if (minorversion > 1)
+ bitmask[2] |= FATTR4_WORD2_OPEN_ARGUMENTS;
status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
if (status == 0) {
@@ -5164,13 +5164,15 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
}
static struct dentry *nfs4_do_mkdir(struct inode *dir, struct dentry *dentry,
- struct nfs4_createdata *data)
+ struct nfs4_createdata *data, int *statusp)
{
- int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg,
+ struct dentry *ret;
+
+ *statusp = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg,
&data->arg.seq_args, &data->res.seq_res, 1);
- if (status)
- return ERR_PTR(status);
+ if (*statusp)
+ return NULL;
spin_lock(&dir->i_lock);
/* Creating a directory bumps nlink in the parent */
@@ -5179,7 +5181,11 @@ static struct dentry *nfs4_do_mkdir(struct inode *dir, struct dentry *dentry,
data->res.fattr->time_start,
NFS_INO_INVALID_DATA);
spin_unlock(&dir->i_lock);
- return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr);
+ ret = nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr);
+ if (!IS_ERR(ret))
+ return ret;
+ *statusp = PTR_ERR(ret);
+ return NULL;
}
static void nfs4_free_createdata(struct nfs4_createdata *data)
@@ -5240,17 +5246,18 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
static struct dentry *_nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
struct iattr *sattr,
- struct nfs4_label *label)
+ struct nfs4_label *label, int *statusp)
{
struct nfs4_createdata *data;
- struct dentry *ret = ERR_PTR(-ENOMEM);
+ struct dentry *ret = NULL;
+ *statusp = -ENOMEM;
data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR);
if (data == NULL)
goto out;
data->arg.label = label;
- ret = nfs4_do_mkdir(dir, dentry, data);
+ ret = nfs4_do_mkdir(dir, dentry, data, statusp);
nfs4_free_createdata(data);
out:
@@ -5273,11 +5280,12 @@ static struct dentry *nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK))
sattr->ia_mode &= ~current_umask();
do {
- alias = _nfs4_proc_mkdir(dir, dentry, sattr, label);
- err = PTR_ERR_OR_ZERO(alias);
+ alias = _nfs4_proc_mkdir(dir, dentry, sattr, label, &err);
trace_nfs4_mkdir(dir, &dentry->d_name, err);
- err = nfs4_handle_exception(NFS_SERVER(dir), err,
- &exception);
+ if (err)
+ alias = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir),
+ err,
+ &exception));
} while (exception.retry);
nfs4_label_release_security(label);
@@ -6211,6 +6219,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen,
struct nfs_server *server = NFS_SERVER(inode);
int ret;
+ if (unlikely(NFS_FH(inode)->size == 0))
+ return -ENODATA;
if (!nfs4_server_supports_acls(server, type))
return -EOPNOTSUPP;
ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
@@ -6285,6 +6295,9 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf,
{
struct nfs4_exception exception = { };
int err;
+
+ if (unlikely(NFS_FH(inode)->size == 0))
+ return -ENODATA;
do {
err = __nfs4_proc_set_acl(inode, buf, buflen, type);
trace_nfs4_set_acl(inode, err);
@@ -10611,7 +10624,7 @@ static const struct rpc_call_ops nfs41_free_stateid_ops = {
* Note: this function is always asynchronous.
*/
static int nfs41_free_stateid(struct nfs_server *server,
- const nfs4_stateid *stateid,
+ nfs4_stateid *stateid,
const struct cred *cred,
bool privileged)
{
@@ -10651,6 +10664,7 @@ static int nfs41_free_stateid(struct nfs_server *server,
if (IS_ERR(task))
return PTR_ERR(task);
rpc_put_task(task);
+ stateid->type = NFS4_FREED_STATEID_TYPE;
return 0;
}
@@ -10817,6 +10831,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
| NFS_CAP_OFFLOAD_CANCEL
| NFS_CAP_COPY_NOTIFY
| NFS_CAP_DEALLOCATE
+ | NFS_CAP_ZERO_RANGE
| NFS_CAP_SEEK
| NFS_CAP_LAYOUTSTATS
| NFS_CAP_CLONE
@@ -10852,7 +10867,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
{
- ssize_t error, error2, error3;
+ ssize_t error, error2, error3, error4;
size_t left = size;
error = generic_listxattr(dentry, list, left);
@@ -10875,8 +10890,16 @@ static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
error3 = nfs4_listxattr_nfs4_user(d_inode(dentry), list, left);
if (error3 < 0)
return error3;
+ if (list) {
+ list += error3;
+ left -= error3;
+ }
+
+ error4 = security_inode_listsecurity(d_inode(dentry), list, left);
+ if (error4 < 0)
+ return error4;
- error += error2 + error3;
+ error += error2 + error3 + error4;
if (size && error > size)
return -ERANGE;
return error;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 55bef5fbfa47..318afde38057 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -7711,6 +7711,7 @@ const struct rpc_procinfo nfs4_procedures[] = {
PROC42(LISTXATTRS, enc_listxattrs, dec_listxattrs),
PROC42(REMOVEXATTR, enc_removexattr, dec_removexattr),
PROC42(READ_PLUS, enc_read_plus, dec_read_plus),
+ PROC42(ZERO_RANGE, enc_zero_range, dec_zero_range),
};
static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)];
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 3adb7d0dbec7..1a7ec68bde15 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2059,8 +2059,10 @@ static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo)
static void nfs_layoutget_end(struct pnfs_layout_hdr *lo)
{
if (atomic_dec_and_test(&lo->plh_outstanding) &&
- test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags))
+ test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) {
+ smp_mb__after_atomic();
wake_up_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN);
+ }
}
static bool pnfs_is_first_layoutget(struct pnfs_layout_hdr *lo)
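
The smp_mb__after_atomic() added in nfs_layoutget_end() above orders the clear of NFS_LAYOUT_DRAIN before the waitqueue check inside wake_up_bit(), so a sleeper cannot miss the wakeup. The two sides of the pattern, sketched (only the waker side appears in this hunk; the waiter's task state is an assumption):

/* Waker, as in nfs_layoutget_end(): */
if (test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) {
	smp_mb__after_atomic(); /* order the clear before the waiter check */
	wake_up_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN);
}

/* Waiter, elsewhere in pnfs (sketch): */
wait_on_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN, TASK_KILLABLE);
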
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 91ef486f40b9..b4ccdf78d4dd 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -830,10 +830,16 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
.servername = clp->cl_hostname,
.connect_timeout = connect_timeout,
.reconnect_timeout = connect_timeout,
+ .xprtsec = clp->cl_xprtsec,
};
- if (da->da_transport != clp->cl_proto)
+ if (da->da_transport != clp->cl_proto &&
+ clp->cl_proto != XPRT_TRANSPORT_TCP_TLS)
continue;
+ if (da->da_transport == XPRT_TRANSPORT_TCP &&
+ mds_srv->nfs_client->cl_proto == XPRT_TRANSPORT_TCP_TLS)
+ xprt_args.ident = XPRT_TRANSPORT_TCP_TLS;
+
if (da->da_addr.ss_family != clp->cl_addr.ss_family)
continue;
/* Add this address as an alias */
@@ -841,6 +847,9 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
rpc_clnt_test_and_add_xprt, NULL);
continue;
}
+ if (da->da_transport == XPRT_TRANSPORT_TCP &&
+ mds_srv->nfs_client->cl_proto == XPRT_TRANSPORT_TCP_TLS)
+ da->da_transport = XPRT_TRANSPORT_TCP_TLS;
clp = get_v3_ds_connect(mds_srv,
&da->da_addr,
da->da_addrlen, da->da_transport,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 81bd1b9aba17..3c1fa320b3f1 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -56,7 +56,8 @@ static int nfs_return_empty_folio(struct folio *folio)
{
folio_zero_segment(folio, 0, folio_size(folio));
folio_mark_uptodate(folio);
- folio_unlock(folio);
+ if (nfs_netfs_folio_unlock(folio))
+ folio_unlock(folio);
return 0;
}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 9eea9e62afc9..72dee6f3050e 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1052,6 +1052,16 @@ int nfs_reconfigure(struct fs_context *fc)
sync_filesystem(sb);
/*
+ * The SB_RDONLY flag has been removed from the superblock during
+ * mounts to prevent interference between different filesystems.
+ * Similarly, the SB_RDONLY flag must be ignored during
+ * reconfiguration; otherwise, mounting a directory with different
+ * rw and ro flags multiple times may result in the creation of
+ * redundant superblocks.
+ */
+ fc->sb_flags_mask &= ~SB_RDONLY;
+
+ /*
* Userspace mount programs that send binary options generally send
* them populated with default values. We have no way to know which
* ones were explicitly specified. Fall back to legacy behavior and
@@ -1173,7 +1183,7 @@ static int nfs_set_super(struct super_block *s, struct fs_context *fc)
struct nfs_server *server = fc->s_fs_info;
int ret;
- s->s_d_op = server->nfs_client->rpc_ops->dentry_ops;
+ set_default_d_op(s, server->nfs_client->rpc_ops->dentry_ops);
ret = set_anon_super(s, server);
if (ret == 0)
server->s_dev = s->s_dev;
@@ -1308,8 +1318,17 @@ int nfs_get_tree_common(struct fs_context *fc)
if (IS_ERR(server))
return PTR_ERR(server);
+ /*
+ * When NFS_MOUNT_UNSHARED is not set, NFS forces the sharing of a
+ * superblock among all mounts of sub-directories belonging to a
+ * single exported root path.
+ * To prevent interference between different filesystems, the
+ * SB_RDONLY flag should be removed from the superblock.
+ */
if (server->flags & NFS_MOUNT_UNSHARED)
compare_super = NULL;
+ else
+ fc->sb_flags &= ~SB_RDONLY;
/* -o noac implies -o sync */
if (server->flags & NFS_MOUNT_NOAC)
diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c
index 37cb2b776435..545148d42dcc 100644
--- a/fs/nfs/sysfs.c
+++ b/fs/nfs/sysfs.c
@@ -387,6 +387,33 @@ static inline void nfs_sysfs_add_nfsv41_server(struct nfs_server *server)
}
#endif /* CONFIG_NFS_V4_1 */
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+
+static ssize_t
+localio_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct nfs_server *server = container_of(kobj, struct nfs_server, kobj);
+ bool localio = nfs_server_is_local(server->nfs_client);
+ return sysfs_emit(buf, "%d\n", localio);
+}
+
+static struct kobj_attribute nfs_sysfs_attr_localio = __ATTR_RO(localio);
+
+static void nfs_sysfs_add_nfs_localio_server(struct nfs_server *server)
+{
+ int ret = sysfs_create_file_ns(&server->kobj, &nfs_sysfs_attr_localio.attr,
+ nfs_netns_server_namespace(&server->kobj));
+ if (ret < 0)
+ pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n",
+ server->s_sysfs_id, ret);
+}
+#else
+static inline void nfs_sysfs_add_nfs_localio_server(struct nfs_server *server)
+{
+}
+#endif /* IS_ENABLED(CONFIG_NFS_LOCALIO) */
+
void nfs_sysfs_add_server(struct nfs_server *server)
{
int ret;
@@ -405,6 +432,7 @@ void nfs_sysfs_add_server(struct nfs_server *server)
server->s_sysfs_id, ret);
nfs_sysfs_add_nfsv41_server(server);
+ nfs_sysfs_add_nfs_localio_server(server);
}
EXPORT_SYMBOL_GPL(nfs_sysfs_add_server);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 23df8b214474..cf1d720b8251 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -632,19 +632,19 @@ static void nfs_write_error(struct nfs_page *req, int error)
* Find an associated nfs write request, and prepare to flush it out
* May return an error if the user signalled nfs_wait_on_request().
*/
-static int nfs_page_async_flush(struct folio *folio,
- struct writeback_control *wbc,
- struct nfs_pageio_descriptor *pgio)
+static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc,
+ struct nfs_pageio_descriptor *pgio)
{
struct nfs_page *req;
- int ret = 0;
+ int ret;
+
+ nfs_pageio_cond_complete(pgio, folio->index);
req = nfs_lock_and_join_requests(folio);
if (!req)
- goto out;
- ret = PTR_ERR(req);
+ return 0;
if (IS_ERR(req))
- goto out;
+ return PTR_ERR(req);
nfs_folio_set_writeback(folio);
WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags));
@@ -654,7 +654,6 @@ static int nfs_page_async_flush(struct folio *folio,
if (nfs_error_is_fatal_on_server(ret))
goto out_launder;
- ret = 0;
if (!nfs_pageio_add_request(pgio, req)) {
ret = pgio->pg_error;
/*
@@ -662,28 +661,20 @@ static int nfs_page_async_flush(struct folio *folio,
*/
if (nfs_error_is_fatal_on_server(ret))
goto out_launder;
- if (wbc->sync_mode == WB_SYNC_NONE)
- ret = AOP_WRITEPAGE_ACTIVATE;
folio_redirty_for_writepage(wbc, folio);
nfs_redirty_request(req);
pgio->pg_error = 0;
- } else
- nfs_add_stats(folio->mapping->host,
- NFSIOS_WRITEPAGES, 1);
-out:
- return ret;
+ return ret;
+ }
+
+ nfs_add_stats(folio->mapping->host, NFSIOS_WRITEPAGES, 1);
+ return 0;
+
out_launder:
nfs_write_error(req, ret);
return 0;
}
-static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc,
- struct nfs_pageio_descriptor *pgio)
-{
- nfs_pageio_cond_complete(pgio, folio->index);
- return nfs_page_async_flush(folio, wbc, pgio);
-}
-
/*
* Write an mmapped page to the server.
*/
@@ -703,17 +694,6 @@ static int nfs_writepage_locked(struct folio *folio,
return err;
}
-static int nfs_writepages_callback(struct folio *folio,
- struct writeback_control *wbc, void *data)
-{
- int ret;
-
- ret = nfs_do_writepage(folio, wbc, data);
- if (ret != AOP_WRITEPAGE_ACTIVATE)
- folio_unlock(folio);
- return ret;
-}
-
static void nfs_io_completion_commit(void *inode)
{
nfs_commit_inode(inode, 0);
@@ -740,7 +720,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
if (!(mntflags & NFS_MOUNT_WRITE_EAGER) || wbc->for_kupdate ||
- wbc->for_background || wbc->for_sync || wbc->for_reclaim) {
+ wbc->for_background || wbc->for_sync) {
ioc = nfs_io_completion_alloc(GFP_KERNEL);
if (ioc)
nfs_io_completion_init(ioc, nfs_io_completion_commit,
@@ -749,11 +729,15 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
}
do {
+ struct folio *folio = NULL;
+
nfs_pageio_init_write(&pgio, inode, priority, false,
&nfs_async_write_completion_ops);
pgio.pg_io_completion = ioc;
- err = write_cache_pages(mapping, wbc, nfs_writepages_callback,
- &pgio);
+ while ((folio = writeback_iter(mapping, wbc, folio, &err))) {
+ err = nfs_do_writepage(folio, wbc, &pgio);
+ folio_unlock(folio);
+ }
pgio.pg_error = 0;
nfs_pageio_complete(&pgio);
if (err == -EAGAIN && mntflags & NFS_MOUNT_SOFTERR)
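The loop above adopts the writeback_iter() idiom that replaces write_cache_pages() and its callback: the iterator hands back one locked folio per pass, err carries per-folio status in and the terminal status out, and the caller is responsible for unlocking. A minimal sketch, with my_write_one_folio() as a hypothetical per-folio writer:

#include <linux/pagemap.h>
#include <linux/writeback.h>

static int my_writepages(struct address_space *mapping,
			 struct writeback_control *wbc)
{
	struct folio *folio = NULL;
	int err = 0;

	while ((folio = writeback_iter(mapping, wbc, folio, &err))) {
		err = my_write_one_folio(folio, wbc);	/* hypothetical */
		folio_unlock(folio);
	}
	return err;
}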
diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c
index 6a0bdea6d644..05c7c16e37ab 100644
--- a/fs/nfs_common/nfslocalio.c
+++ b/fs/nfs_common/nfslocalio.c
@@ -151,8 +151,7 @@ EXPORT_SYMBOL_GPL(nfs_localio_enable_client);
*/
static bool nfs_uuid_put(nfs_uuid_t *nfs_uuid)
{
- LIST_HEAD(local_files);
- struct nfs_file_localio *nfl, *tmp;
+ struct nfs_file_localio *nfl;
spin_lock(&nfs_uuid->lock);
if (unlikely(!rcu_access_pointer(nfs_uuid->net))) {
@@ -166,17 +165,42 @@ static bool nfs_uuid_put(nfs_uuid_t *nfs_uuid)
nfs_uuid->dom = NULL;
}
- list_splice_init(&nfs_uuid->files, &local_files);
- spin_unlock(&nfs_uuid->lock);
-
/* Walk list of files and ensure their last references dropped */
- list_for_each_entry_safe(nfl, tmp, &local_files, list) {
- nfs_close_local_fh(nfl);
+
+ while ((nfl = list_first_entry_or_null(&nfs_uuid->files,
+ struct nfs_file_localio,
+ list)) != NULL) {
+ /* If nfl->nfs_uuid is already NULL, nfs_close_local_fh() is
+ * closing and we must wait; else we unlink and close.
+ */
+ if (rcu_access_pointer(nfl->nfs_uuid) == NULL) {
+ /* nfs_close_local_fh() is doing the
+ * close and we must wait until it unlinks
+ */
+ wait_var_event_spinlock(nfl,
+ list_first_entry_or_null(
+ &nfs_uuid->files,
+ struct nfs_file_localio,
+ list) != nfl,
+ &nfs_uuid->lock);
+ continue;
+ }
+
+ /* Remove nfl from nfs_uuid->files list */
+ list_del_init(&nfl->list);
+ spin_unlock(&nfs_uuid->lock);
+
+ nfs_to_nfsd_file_put_local(&nfl->ro_file);
+ nfs_to_nfsd_file_put_local(&nfl->rw_file);
cond_resched();
- }
- spin_lock(&nfs_uuid->lock);
- BUG_ON(!list_empty(&nfs_uuid->files));
+ spin_lock(&nfs_uuid->lock);
+ /* Now we can allow racing nfs_close_local_fh() to
+ * skip the locking.
+ */
+ RCU_INIT_POINTER(nfl->nfs_uuid, NULL);
+ wake_up_var_locked(&nfl->nfs_uuid, &nfs_uuid->lock);
+ }
/* Remove client from nn->local_clients */
if (nfs_uuid->list_lock) {
@@ -237,6 +261,7 @@ static void nfs_uuid_add_file(nfs_uuid_t *nfs_uuid, struct nfs_file_localio *nfl
struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid,
struct rpc_clnt *rpc_clnt, const struct cred *cred,
const struct nfs_fh *nfs_fh, struct nfs_file_localio *nfl,
+ struct nfsd_file __rcu **pnf,
const fmode_t fmode)
{
struct net *net;
@@ -261,10 +286,9 @@ struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid,
rcu_read_unlock();
/* We have an implied reference to net thanks to nfsd_net_try_get */
localio = nfs_to->nfsd_open_local_fh(net, uuid->dom, rpc_clnt,
- cred, nfs_fh, fmode);
- if (IS_ERR(localio))
- nfs_to_nfsd_net_put(net);
- else
+ cred, nfs_fh, pnf, fmode);
+ nfs_to_nfsd_net_put(net);
+ if (!IS_ERR(localio))
nfs_uuid_add_file(uuid, nfl);
return localio;
@@ -273,8 +297,6 @@ EXPORT_SYMBOL_GPL(nfs_open_local_fh);
void nfs_close_local_fh(struct nfs_file_localio *nfl)
{
- struct nfsd_file *ro_nf = NULL;
- struct nfsd_file *rw_nf = NULL;
nfs_uuid_t *nfs_uuid;
rcu_read_lock();
@@ -285,28 +307,39 @@ void nfs_close_local_fh(struct nfs_file_localio *nfl)
return;
}
- ro_nf = rcu_access_pointer(nfl->ro_file);
- rw_nf = rcu_access_pointer(nfl->rw_file);
- if (ro_nf || rw_nf) {
- spin_lock(&nfs_uuid->lock);
- if (ro_nf)
- ro_nf = rcu_dereference_protected(xchg(&nfl->ro_file, NULL), 1);
- if (rw_nf)
- rw_nf = rcu_dereference_protected(xchg(&nfl->rw_file, NULL), 1);
-
- /* Remove nfl from nfs_uuid->files list */
- RCU_INIT_POINTER(nfl->nfs_uuid, NULL);
- list_del_init(&nfl->list);
+ spin_lock(&nfs_uuid->lock);
+ if (!rcu_access_pointer(nfl->nfs_uuid)) {
+ /* nfs_uuid_put has finished here */
spin_unlock(&nfs_uuid->lock);
rcu_read_unlock();
-
- if (ro_nf)
- nfs_to_nfsd_file_put_local(ro_nf);
- if (rw_nf)
- nfs_to_nfsd_file_put_local(rw_nf);
return;
}
+ if (list_empty(&nfs_uuid->files)) {
+ /* nfs_uuid_put() has started closing files, wait for it
+ * to finish
+ */
+ spin_unlock(&nfs_uuid->lock);
+ rcu_read_unlock();
+ wait_var_event(&nfl->nfs_uuid,
+ rcu_access_pointer(nfl->nfs_uuid) == NULL);
+ return;
+ }
+ /* tell nfs_uuid_put() to wait for us */
+ RCU_INIT_POINTER(nfl->nfs_uuid, NULL);
+ spin_unlock(&nfs_uuid->lock);
rcu_read_unlock();
+
+ nfs_to_nfsd_file_put_local(&nfl->ro_file);
+ nfs_to_nfsd_file_put_local(&nfl->rw_file);
+
+ /* Remove nfl from nfs_uuid->files list and signal nfs_uuid_put()
+ * that we are done. The moment we drop the spinlock the
+ * nfs_uuid could be freed.
+ */
+ spin_lock(&nfs_uuid->lock);
+ list_del_init(&nfl->list);
+ wake_up_var_locked(&nfl->nfs_uuid, &nfs_uuid->lock);
+ spin_unlock(&nfs_uuid->lock);
}
EXPORT_SYMBOL_GPL(nfs_close_local_fh);
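The rework above builds a two-sided handshake from wait_var_event_spinlock() and wake_up_var_locked(): nfs_uuid_put() drops the lock to put file references while a racing nfs_close_local_fh() either waits or signals on the same variable address under the same lock. A reduced sketch of that pattern, with a hypothetical struct my_obj:

#include <linux/spinlock.h>
#include <linux/wait_bit.h>

struct my_obj {
	bool done;
};

static void waiter_side(struct my_obj *obj, spinlock_t *lock)
{
	spin_lock(lock);
	/* drops *lock while sleeping, retakes it to recheck obj->done */
	wait_var_event_spinlock(&obj->done, obj->done, lock);
	spin_unlock(lock);
}

static void signaller_side(struct my_obj *obj, spinlock_t *lock)
{
	spin_lock(lock);
	obj->done = true;
	wake_up_var_locked(&obj->done, lock);
	spin_unlock(lock);
}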
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 08a20e5bcf7f..19078a043e85 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -178,11 +178,13 @@ nfsd4_block_proc_layoutcommit(struct inode *inode,
{
struct iomap *iomaps;
int nr_iomaps;
+ __be32 nfserr;
- nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
- lcp->lc_up_len, &iomaps, i_blocksize(inode));
- if (nr_iomaps < 0)
- return nfserrno(nr_iomaps);
+ nfserr = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
+ lcp->lc_up_len, &iomaps, &nr_iomaps,
+ i_blocksize(inode));
+ if (nfserr != nfs_ok)
+ return nfserr;
return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
}
@@ -316,11 +318,13 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode,
{
struct iomap *iomaps;
int nr_iomaps;
+ __be32 nfserr;
- nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout,
- lcp->lc_up_len, &iomaps, i_blocksize(inode));
- if (nr_iomaps < 0)
- return nfserrno(nr_iomaps);
+ nfserr = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout,
+ lcp->lc_up_len, &iomaps, &nr_iomaps,
+ i_blocksize(inode));
+ if (nfserr != nfs_ok)
+ return nfserr;
return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
}
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index ce78f74715ee..bcf21fde9120 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -112,35 +112,46 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
return 0;
}
-int
+/**
+ * nfsd4_block_decode_layoutupdate - decode the block layout extent array
+ * @p: pointer to the xdr data
+ * @len: number of bytes to decode
+ * @iomapp: pointer to store the decoded extent array
+ * @nr_iomapsp: pointer to store the number of extents
+ * @block_size: alignment of extent offset and length
+ *
+ * This function decodes the opaque field of the layoutupdate4 structure
+ * in a layoutcommit request for the block layout driver. The field is
+ * actually an array of extents sent by the client. It also checks that
+ * the file offset, storage offset and length of each extent are
+ * aligned to @block_size.
+ *
+ * Return values:
+ * %nfs_ok: Successful decoding, @iomapp and @nr_iomapsp are valid
+ * %nfserr_bad_xdr: The encoded array in @p is invalid
+ * %nfserr_inval: An unaligned extent found
+ * %nfserr_delay: Failed to allocate memory for @iomapp
+ */
+__be32
nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
- u32 block_size)
+ int *nr_iomapsp, u32 block_size)
{
struct iomap *iomaps;
u32 nr_iomaps, i;
- if (len < sizeof(u32)) {
- dprintk("%s: extent array too small: %u\n", __func__, len);
- return -EINVAL;
- }
+ if (len < sizeof(u32))
+ return nfserr_bad_xdr;
len -= sizeof(u32);
- if (len % PNFS_BLOCK_EXTENT_SIZE) {
- dprintk("%s: extent array invalid: %u\n", __func__, len);
- return -EINVAL;
- }
+ if (len % PNFS_BLOCK_EXTENT_SIZE)
+ return nfserr_bad_xdr;
nr_iomaps = be32_to_cpup(p++);
- if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) {
- dprintk("%s: extent array size mismatch: %u/%u\n",
- __func__, len, nr_iomaps);
- return -EINVAL;
- }
+ if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE)
+ return nfserr_bad_xdr;
iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
- if (!iomaps) {
- dprintk("%s: failed to allocate extent array\n", __func__);
- return -ENOMEM;
- }
+ if (!iomaps)
+ return nfserr_delay;
for (i = 0; i < nr_iomaps; i++) {
struct pnfs_block_extent bex;
@@ -150,26 +161,18 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
p = xdr_decode_hyper(p, &bex.foff);
if (bex.foff & (block_size - 1)) {
- dprintk("%s: unaligned offset 0x%llx\n",
- __func__, bex.foff);
goto fail;
}
p = xdr_decode_hyper(p, &bex.len);
if (bex.len & (block_size - 1)) {
- dprintk("%s: unaligned length 0x%llx\n",
- __func__, bex.foff);
goto fail;
}
p = xdr_decode_hyper(p, &bex.soff);
if (bex.soff & (block_size - 1)) {
- dprintk("%s: unaligned disk offset 0x%llx\n",
- __func__, bex.soff);
goto fail;
}
bex.es = be32_to_cpup(p++);
if (bex.es != PNFS_BLOCK_READWRITE_DATA) {
- dprintk("%s: incorrect extent state %d\n",
- __func__, bex.es);
goto fail;
}
@@ -178,59 +181,71 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
}
*iomapp = iomaps;
- return nr_iomaps;
+ *nr_iomapsp = nr_iomaps;
+ return nfs_ok;
fail:
kfree(iomaps);
- return -EINVAL;
+ return nfserr_inval;
}
-int
+/**
+ * nfsd4_scsi_decode_layoutupdate - decode the scsi layout extent array
+ * @p: pointer to the xdr data
+ * @len: number of bytes to decode
+ * @iomapp: pointer to store the decoded extent array
+ * @nr_iomapsp: pointer to store the number of extents
+ * @block_size: alignment of extent offset and length
+ *
+ * This function decodes the opaque field of the layoutupdate4 structure
+ * in a layoutcommit request for the scsi layout driver. The field is
+ * actually an array of extents sent by the client. It also checks that
+ * the offset and length of each extent are aligned to @block_size.
+ *
+ * Return values:
+ * %nfs_ok: Successful decoding, @iomapp and @nr_iomapsp are valid
+ * %nfserr_bad_xdr: The encoded array in @p is invalid
+ * %nfserr_inval: An unaligned extent found
+ * %nfserr_delay: Failed to allocate memory for @iomapp
+ */
+__be32
nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
- u32 block_size)
+ int *nr_iomapsp, u32 block_size)
{
struct iomap *iomaps;
u32 nr_iomaps, expected, i;
- if (len < sizeof(u32)) {
- dprintk("%s: extent array too small: %u\n", __func__, len);
- return -EINVAL;
- }
+ if (len < sizeof(u32))
+ return nfserr_bad_xdr;
nr_iomaps = be32_to_cpup(p++);
expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE;
- if (len != expected) {
- dprintk("%s: extent array size mismatch: %u/%u\n",
- __func__, len, expected);
- return -EINVAL;
- }
+ if (len != expected)
+ return nfserr_bad_xdr;
iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
- if (!iomaps) {
- dprintk("%s: failed to allocate extent array\n", __func__);
- return -ENOMEM;
- }
+ if (!iomaps)
+ return nfserr_delay;
for (i = 0; i < nr_iomaps; i++) {
u64 val;
p = xdr_decode_hyper(p, &val);
if (val & (block_size - 1)) {
- dprintk("%s: unaligned offset 0x%llx\n", __func__, val);
goto fail;
}
iomaps[i].offset = val;
p = xdr_decode_hyper(p, &val);
if (val & (block_size - 1)) {
- dprintk("%s: unaligned length 0x%llx\n", __func__, val);
goto fail;
}
iomaps[i].length = val;
}
*iomapp = iomaps;
- return nr_iomaps;
+ *nr_iomapsp = nr_iomaps;
+ return nfs_ok;
fail:
kfree(iomaps);
- return -EINVAL;
+ return nfserr_inval;
}
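Both decoders above test alignment with val & (block_size - 1), which is valid because block_size comes from i_blocksize() and is therefore a power of two; the masked form is equivalent to val % block_size == 0 without the division. As a self-contained, hypothetical helper:

static inline bool is_block_aligned(u64 val, u32 block_size)
{
	/* assumes block_size is a power of two */
	return (val & (block_size - 1)) == 0;
}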
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index 4e28ac8f1127..15b3569f3d9a 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -54,9 +54,9 @@ __be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
const struct nfsd4_getdeviceinfo *gdp);
__be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
const struct nfsd4_layoutget *lgp);
-int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
- u32 block_size);
-int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
- u32 block_size);
+__be32 nfsd4_block_decode_layoutupdate(__be32 *p, u32 len,
+ struct iomap **iomapp, int *nr_iomapsp, u32 block_size);
+__be32 nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len,
+ struct iomap **iomapp, int *nr_iomapsp, u32 block_size);
#endif /* _NFSD_BLOCKLAYOUTXDR_H */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 88ae410b4113..cadfc2bae60e 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -82,8 +82,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
int len;
struct auth_domain *dom = NULL;
int err;
- int fsidtype;
- char *ep;
+ u8 fsidtype;
struct svc_expkey key;
struct svc_expkey *ek = NULL;
@@ -109,10 +108,9 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
err = -EINVAL;
if (qword_get(&mesg, buf, PAGE_SIZE) <= 0)
goto out;
- fsidtype = simple_strtoul(buf, &ep, 10);
- if (*ep)
+ if (kstrtou8(buf, 10, &fsidtype))
goto out;
- dprintk("found fsidtype %d\n", fsidtype);
+ dprintk("found fsidtype %u\n", fsidtype);
if (key_len(fsidtype)==0) /* invalid type */
goto out;
if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
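The switch from simple_strtoul() to kstrtou8() above tightens parsing: kstrtou8() fails with -EINVAL on trailing garbage and -ERANGE on values that do not fit in a u8, which is why the old end-pointer check could be dropped. A minimal sketch of the calling convention, with a hypothetical wrapper:

#include <linux/kstrtox.h>

static int parse_fsid_type(const char *buf, u8 *out)
{
	/* "3" -> 0 with *out == 3; "3x" -> -EINVAL; "300" -> -ERANGE */
	return kstrtou8(buf, 10, out);
}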
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index 4d92b99c1ffd..b9c0adb3ce09 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -88,7 +88,7 @@ struct svc_expkey {
struct cache_head h;
struct auth_domain * ek_client;
- int ek_fsidtype;
+ u8 ek_fsidtype;
u32 ek_fsid[6];
struct path ek_path;
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index ab85e6a2454f..732abf6b92a5 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -372,21 +372,47 @@ nfsd_file_put(struct nfsd_file *nf)
/**
* nfsd_file_put_local - put nfsd_file reference and arm nfsd_net_put in caller
- * @nf: nfsd_file of which to put the reference
+ * @pnf: pointer to the nfsd_file of which to put the reference
*
* First save the associated net to return to caller, then put
* the reference of the nfsd_file.
*/
struct net *
-nfsd_file_put_local(struct nfsd_file *nf)
+nfsd_file_put_local(struct nfsd_file __rcu **pnf)
{
- struct net *net = nf->nf_net;
+ struct nfsd_file *nf;
+ struct net *net = NULL;
- nfsd_file_put(nf);
+ nf = unrcu_pointer(xchg(pnf, NULL));
+ if (nf) {
+ net = nf->nf_net;
+ nfsd_file_put(nf);
+ }
return net;
}
/**
+ * nfsd_file_get_local - get nfsd_file reference and reference to net
+ * @nf: nfsd_file of which to get a reference
+ *
+ * Get a reference to both the nfsd_file and nf->nf_net.
+ */
+struct nfsd_file *
+nfsd_file_get_local(struct nfsd_file *nf)
+{
+ struct net *net = nf->nf_net;
+
+ if (nfsd_net_try_get(net)) {
+ nf = nfsd_file_get(nf);
+ if (!nf)
+ nfsd_net_put(net);
+ } else {
+ nf = NULL;
+ }
+ return nf;
+}
+
+/**
* nfsd_file_file - get the backing file of an nfsd_file
* @nf: nfsd_file of which to access the backing file.
*
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index 5865f9c72712..722b26c71e45 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -62,7 +62,8 @@ void nfsd_file_cache_shutdown(void);
int nfsd_file_cache_start_net(struct net *net);
void nfsd_file_cache_shutdown_net(struct net *net);
void nfsd_file_put(struct nfsd_file *nf);
-struct net *nfsd_file_put_local(struct nfsd_file *nf);
+struct net *nfsd_file_put_local(struct nfsd_file __rcu **nf);
+struct nfsd_file *nfsd_file_get_local(struct nfsd_file *nf);
struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
struct file *nfsd_file_file(struct nfsd_file *nf);
void nfsd_file_close_inode_sync(struct inode *inode);
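nfsd_file_put_local(), implemented above in filecache.c, now takes a pointer to the cached slot and steals its contents with xchg(), so two racing release paths cannot both perform the final put: exactly one observes the non-NULL pointer. The general shape of that idiom, with a hypothetical struct widget and widget_put() release:

#include <linux/atomic.h>

static void steal_and_release(struct widget **slot)
{
	/* atomically empty the slot; only one caller gets the object */
	struct widget *w = xchg(slot, NULL);

	if (w)
		widget_put(w);	/* hypothetical release */
}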
diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
index 238647fa379e..4f6468eb2adf 100644
--- a/fs/nfsd/localio.c
+++ b/fs/nfsd/localio.c
@@ -24,21 +24,6 @@
#include "filecache.h"
#include "cache.h"
-static const struct nfsd_localio_operations nfsd_localio_ops = {
- .nfsd_net_try_get = nfsd_net_try_get,
- .nfsd_net_put = nfsd_net_put,
- .nfsd_open_local_fh = nfsd_open_local_fh,
- .nfsd_file_put_local = nfsd_file_put_local,
- .nfsd_file_get = nfsd_file_get,
- .nfsd_file_put = nfsd_file_put,
- .nfsd_file_file = nfsd_file_file,
-};
-
-void nfsd_localio_ops_init(void)
-{
- nfs_to = &nfsd_localio_ops;
-}
-
/**
* nfsd_open_local_fh - lookup a local filehandle @nfs_fh and map to nfsd_file
*
@@ -47,6 +32,7 @@ void nfsd_localio_ops_init(void)
* @rpc_clnt: rpc_clnt that the client established
* @cred: cred that the client established
* @nfs_fh: filehandle to lookup
+ * @pnf: place to find the nfsd_file, or store it if it was NULL
* @fmode: fmode_t to use for open
*
* This function maps a local fh to a path on a local filesystem.
@@ -57,10 +43,11 @@ void nfsd_localio_ops_init(void)
* set. Caller (NFS client) is responsible for calling nfsd_net_put and
* nfsd_file_put (via nfs_to_nfsd_file_put_local).
*/
-struct nfsd_file *
+static struct nfsd_file *
nfsd_open_local_fh(struct net *net, struct auth_domain *dom,
struct rpc_clnt *rpc_clnt, const struct cred *cred,
- const struct nfs_fh *nfs_fh, const fmode_t fmode)
+ const struct nfs_fh *nfs_fh, struct nfsd_file __rcu **pnf,
+ const fmode_t fmode)
{
int mayflags = NFSD_MAY_LOCALIO;
struct svc_cred rq_cred;
@@ -71,6 +58,15 @@ nfsd_open_local_fh(struct net *net, struct auth_domain *dom,
if (nfs_fh->size > NFS4_FHSIZE)
return ERR_PTR(-EINVAL);
+ if (!nfsd_net_try_get(net))
+ return ERR_PTR(-ENXIO);
+
+ rcu_read_lock();
+ localio = nfsd_file_get(rcu_dereference(*pnf));
+ rcu_read_unlock();
+ if (localio)
+ return localio;
+
/* nfs_fh -> svc_fh */
fh_init(&fh, NFS4_FHSIZE);
fh.fh_handle.fh_size = nfs_fh->size;
@@ -92,9 +88,47 @@ nfsd_open_local_fh(struct net *net, struct auth_domain *dom,
if (rq_cred.cr_group_info)
put_group_info(rq_cred.cr_group_info);
+ if (!IS_ERR(localio)) {
+ struct nfsd_file *new;
+ if (!nfsd_net_try_get(net)) {
+ nfsd_file_put(localio);
+ nfsd_net_put(net);
+ return ERR_PTR(-ENXIO);
+ }
+ nfsd_file_get(localio);
+ again:
+ new = unrcu_pointer(cmpxchg(pnf, NULL, RCU_INITIALIZER(localio)));
+ if (new) {
+ /* Some other thread installed an nfsd_file */
+ if (nfsd_file_get(new) == NULL)
+ goto again;
+ /*
+ * Drop the ref we were going to install and the
+ * one we were going to return.
+ */
+ nfsd_file_put(localio);
+ nfsd_file_put(localio);
+ localio = new;
+ }
+ } else
+ nfsd_net_put(net);
+
return localio;
}
-EXPORT_SYMBOL_GPL(nfsd_open_local_fh);
+
+static const struct nfsd_localio_operations nfsd_localio_ops = {
+ .nfsd_net_try_get = nfsd_net_try_get,
+ .nfsd_net_put = nfsd_net_put,
+ .nfsd_open_local_fh = nfsd_open_local_fh,
+ .nfsd_file_put_local = nfsd_file_put_local,
+ .nfsd_file_get_local = nfsd_file_get_local,
+ .nfsd_file_file = nfsd_file_file,
+};
+
+void nfsd_localio_ops_init(void)
+{
+ nfs_to = &nfsd_localio_ops;
+}
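The cmpxchg() sequence in nfsd_open_local_fh() above implements install-or-reuse caching: the opener takes an extra reference, tries to publish its nfsd_file into the empty slot, and on losing the race takes a reference to the winner's file instead, retrying if that file has already died (a concurrent nfsd_file_put_local() clears the slot). A reduced sketch with hypothetical obj_tryget()/obj_put() helpers:

#include <linux/rcupdate.h>

static struct obj *install_or_reuse(struct obj __rcu **slot, struct obj *mine)
{
	struct obj *cur;

again:
	cur = unrcu_pointer(cmpxchg(slot, NULL, RCU_INITIALIZER(mine)));
	if (!cur)
		return mine;	/* installed our object */
	if (!obj_tryget(cur))
		goto again;	/* winner is dying; a release path clears the slot */
	obj_put(mine);		/* reuse the winner's object instead */
	return cur;
}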
/*
* UUID_IS_LOCAL XDR functions
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index a817d8485d21..b6d03e1ef5f7 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -561,7 +561,7 @@ static void nfsd3_init_dirlist_pages(struct svc_rqst *rqstp,
buf->pages = rqstp->rq_next_page;
rqstp->rq_next_page += (buf->buflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
- xdr_init_encode_pages(xdr, buf, buf->pages, NULL);
+ xdr_init_encode_pages(xdr, buf);
}
/*
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index ccb00aa93be0..e00b2aea8da2 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1409,6 +1409,7 @@ void nfsd41_cb_referring_call(struct nfsd4_callback *cb,
out:
if (!rcl->__nr_referring_calls) {
cb->cb_nr_referring_call_list--;
+ list_del(&rcl->__list);
kfree(rcl);
}
}
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 290271ac4245..aea905fcaf87 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -65,7 +65,7 @@ nfsd4_alloc_devid_map(const struct svc_fh *fhp)
return;
map->fsid_type = fh->fh_fsid_type;
- memcpy(&map->fsid, fh->fh_fsid, fsid_len);
+ memcpy(&map->fsid, fh_fsid(fh), fsid_len);
spin_lock(&nfsd_devid_lock);
if (fhp->fh_export->ex_devid_map)
@@ -75,7 +75,7 @@ nfsd4_alloc_devid_map(const struct svc_fh *fhp)
list_for_each_entry(old, &nfsd_devid_hash[i], hash) {
if (old->fsid_type != fh->fh_fsid_type)
continue;
- if (memcmp(old->fsid, fh->fh_fsid,
+ if (memcmp(old->fsid, fh_fsid(fh),
key_len(old->fsid_type)))
continue;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index f13abbb13b38..71b428efcbb5 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1917,13 +1917,6 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd42_write_res *result;
__be32 status;
- /*
- * Currently, async COPY is not reliable. Force all COPY
- * requests to be synchronous to avoid client application
- * hangs waiting for COPY completion.
- */
- nfsd4_copy_set_sync(copy, true);
-
result = &copy->cp_res;
nfsd_copy_write_verifier((__be32 *)&result->wr_verifier.data, nn);
@@ -2842,20 +2835,10 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
rqstp->rq_lease_breaker = (void **)&cstate->clp;
- trace_nfsd_compound(rqstp, args->tag, args->taglen, args->client_opcnt);
+ trace_nfsd_compound(rqstp, args->tag, args->taglen, args->opcnt);
while (!status && resp->opcnt < args->opcnt) {
op = &args->ops[resp->opcnt++];
- if (unlikely(resp->opcnt == NFSD_MAX_OPS_PER_COMPOUND)) {
- /* If there are still more operations to process,
- * stop here and report NFS4ERR_RESOURCE. */
- if (cstate->minorversion == 0 &&
- args->client_opcnt > resp->opcnt) {
- op->status = nfserr_resource;
- goto encode_op;
- }
- }
-
/*
* The XDR decode routines may have pre-set op->status;
* for example, if there is a miscellaneous XDR error
@@ -2932,7 +2915,7 @@ encode_op:
status = op->status;
}
- trace_nfsd_compound_status(args->client_opcnt, resp->opcnt,
+ trace_nfsd_compound_status(args->opcnt, resp->opcnt,
status, nfsd4_op_name(op->opnum));
nfsd4_cstate_clear_replay(cstate);
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 82785db730d9..2231192ec33f 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -950,38 +950,32 @@ static const struct rpc_pipe_ops cld_upcall_ops = {
.destroy_msg = cld_pipe_destroy_msg,
};
-static struct dentry *
+static int
nfsd4_cld_register_sb(struct super_block *sb, struct rpc_pipe *pipe)
{
- struct dentry *dir, *dentry;
+ struct dentry *dir;
+ int err;
dir = rpc_d_lookup_sb(sb, NFSD_PIPE_DIR);
if (dir == NULL)
- return ERR_PTR(-ENOENT);
- dentry = rpc_mkpipe_dentry(dir, NFSD_CLD_PIPE, NULL, pipe);
+ return -ENOENT;
+ err = rpc_mkpipe_dentry(dir, NFSD_CLD_PIPE, NULL, pipe);
dput(dir);
- return dentry;
+ return err;
}
-static void
-nfsd4_cld_unregister_sb(struct rpc_pipe *pipe)
-{
- if (pipe->dentry)
- rpc_unlink(pipe->dentry);
-}
-
-static struct dentry *
+static int
nfsd4_cld_register_net(struct net *net, struct rpc_pipe *pipe)
{
struct super_block *sb;
- struct dentry *dentry;
+ int err;
sb = rpc_get_sb_net(net);
if (!sb)
- return NULL;
- dentry = nfsd4_cld_register_sb(sb, pipe);
+ return 0;
+ err = nfsd4_cld_register_sb(sb, pipe);
rpc_put_sb_net(net);
- return dentry;
+ return err;
}
static void
@@ -991,7 +985,7 @@ nfsd4_cld_unregister_net(struct net *net, struct rpc_pipe *pipe)
sb = rpc_get_sb_net(net);
if (sb) {
- nfsd4_cld_unregister_sb(pipe);
+ rpc_unlink(pipe);
rpc_put_sb_net(net);
}
}
@@ -1001,7 +995,6 @@ static int
__nfsd4_init_cld_pipe(struct net *net)
{
int ret;
- struct dentry *dentry;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct cld_net *cn;
@@ -1022,13 +1015,10 @@ __nfsd4_init_cld_pipe(struct net *net)
spin_lock_init(&cn->cn_lock);
INIT_LIST_HEAD(&cn->cn_list);
- dentry = nfsd4_cld_register_net(net, cn->cn_pipe);
- if (IS_ERR(dentry)) {
- ret = PTR_ERR(dentry);
+ ret = nfsd4_cld_register_net(net, cn->cn_pipe);
+ if (unlikely(ret))
goto err_destroy_data;
- }
- cn->cn_pipe->dentry = dentry;
#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING
cn->cn_has_legacy = false;
#endif
@@ -2121,7 +2111,6 @@ rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr)
struct net *net = sb->s_fs_info;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct cld_net *cn = nn->cld_net;
- struct dentry *dentry;
int ret = 0;
if (!try_module_get(THIS_MODULE))
@@ -2134,16 +2123,10 @@ rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr)
switch (event) {
case RPC_PIPEFS_MOUNT:
- dentry = nfsd4_cld_register_sb(sb, cn->cn_pipe);
- if (IS_ERR(dentry)) {
- ret = PTR_ERR(dentry);
- break;
- }
- cn->cn_pipe->dentry = dentry;
+ ret = nfsd4_cld_register_sb(sb, cn->cn_pipe);
break;
case RPC_PIPEFS_UMOUNT:
- if (cn->cn_pipe->dentry)
- nfsd4_cld_unregister_sb(cn->cn_pipe);
+ rpc_unlink(cn->cn_pipe);
break;
default:
ret = -ENOTSUPP;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d5694987f86f..88c347957da5 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -633,18 +633,6 @@ find_readable_file(struct nfs4_file *f)
return ret;
}
-static struct nfsd_file *
-find_rw_file(struct nfs4_file *f)
-{
- struct nfsd_file *ret;
-
- spin_lock(&f->fi_lock);
- ret = nfsd_file_get(f->fi_fds[O_RDWR]);
- spin_unlock(&f->fi_lock);
-
- return ret;
-}
-
struct nfsd_file *
find_any_file(struct nfs4_file *f)
{
@@ -1218,15 +1206,20 @@ nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid)
static void put_deleg_file(struct nfs4_file *fp)
{
+ struct nfsd_file *rnf = NULL;
struct nfsd_file *nf = NULL;
spin_lock(&fp->fi_lock);
- if (--fp->fi_delegees == 0)
+ if (--fp->fi_delegees == 0) {
swap(nf, fp->fi_deleg_file);
+ swap(rnf, fp->fi_rdeleg_file);
+ }
spin_unlock(&fp->fi_lock);
if (nf)
nfsd_file_put(nf);
+ if (rnf)
+ nfs4_file_put_access(fp, NFS4_SHARE_ACCESS_READ);
}
static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp)
@@ -3872,7 +3865,6 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs
ca->headerpadsz = 0;
ca->maxreq_sz = min_t(u32, ca->maxreq_sz, maxrpc);
ca->maxresp_sz = min_t(u32, ca->maxresp_sz, maxrpc);
- ca->maxops = min_t(u32, ca->maxops, NFSD_MAX_OPS_PER_COMPOUND);
ca->maxresp_cached = min_t(u32, ca->maxresp_cached,
NFSD_SLOT_CACHE_SIZE + NFSD_MIN_HDR_SEQ_SZ);
ca->maxreqs = min_t(u32, ca->maxreqs, NFSD_MAX_SLOTS_PER_SESSION);
@@ -4697,10 +4689,16 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
}
status = nfs_ok;
if (conf) {
- old = unconf;
- unhash_client_locked(old);
- nfsd4_change_callback(conf, &unconf->cl_cb_conn);
- } else {
+ if (get_client_locked(conf) == nfs_ok) {
+ old = unconf;
+ unhash_client_locked(old);
+ nfsd4_change_callback(conf, &unconf->cl_cb_conn);
+ } else {
+ conf = NULL;
+ }
+ }
+
+ if (!conf) {
old = find_confirmed_client_by_name(&unconf->cl_name, nn);
if (old) {
status = nfserr_clid_inuse;
@@ -4717,10 +4715,14 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
}
trace_nfsd_clid_replaced(&old->cl_clientid);
}
+ status = get_client_locked(unconf);
+ if (status != nfs_ok) {
+ old = NULL;
+ goto out;
+ }
move_to_confirmed(unconf);
conf = unconf;
}
- get_client_locked(conf);
spin_unlock(&nn->client_lock);
if (conf == unconf)
fsnotify_dentry(conf->cl_nfsd_info_dentry, FS_MODIFY);
@@ -4750,6 +4752,7 @@ static void nfsd4_file_init(const struct svc_fh *fh, struct nfs4_file *fp)
INIT_LIST_HEAD(&fp->fi_clnt_odstate);
fh_copy_shallow(&fp->fi_fhandle, &fh->fh_handle);
fp->fi_deleg_file = NULL;
+ fp->fi_rdeleg_file = NULL;
fp->fi_had_conflict = false;
fp->fi_share_deny = 0;
memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
@@ -6000,14 +6003,19 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
* "An OPEN_DELEGATE_WRITE delegation allows the client to handle,
* on its own, all opens."
*
- * Furthermore the client can use a write delegation for most READ
- * operations as well, so we require a O_RDWR file here.
+ * Furthermore, section 9.1.2 says:
+ *
+ * "In the case of READ, the server may perform the corresponding
+ * check on the access mode, or it may choose to allow READ for
+ * OPEN4_SHARE_ACCESS_WRITE, to accommodate clients whose WRITE
+ * implementation may unavoidably do reads (e.g., due to buffer
+ * cache constraints)."
*
- * Offer a write delegation in the case of a BOTH open, and ensure
- * we get the O_RDWR descriptor.
+ * We choose to offer a write delegation for OPEN with the
+ * OPEN4_SHARE_ACCESS_WRITE access mode to accommodate such clients.
*/
- if ((open->op_share_access & NFS4_SHARE_ACCESS_BOTH) == NFS4_SHARE_ACCESS_BOTH) {
- nf = find_rw_file(fp);
+ if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) {
+ nf = find_writeable_file(fp);
dl_type = deleg_ts ? OPEN_DELEGATE_WRITE_ATTRS_DELEG : OPEN_DELEGATE_WRITE;
}
@@ -6138,7 +6146,7 @@ static bool
nfs4_delegation_stat(struct nfs4_delegation *dp, struct svc_fh *currentfh,
struct kstat *stat)
{
- struct nfsd_file *nf = find_rw_file(dp->dl_stid.sc_file);
+ struct nfsd_file *nf = find_writeable_file(dp->dl_stid.sc_file);
struct path path;
int rc;
@@ -6157,6 +6165,34 @@ nfs4_delegation_stat(struct nfs4_delegation *dp, struct svc_fh *currentfh,
}
/*
+ * Add NFS4_SHARE_ACCESS_READ to the write delegation granted on OPEN
+ * with NFS4_SHARE_ACCESS_WRITE by allocating a separate nfsd_file and
+ * struct file to be used for reads with the delegation stateid.
+ */
+static bool
+nfsd4_add_rdaccess_to_wrdeleg(struct svc_rqst *rqstp, struct nfsd4_open *open,
+ struct svc_fh *fh, struct nfs4_ol_stateid *stp)
+{
+ struct nfs4_file *fp;
+ struct nfsd_file *nf = NULL;
+
+ if ((open->op_share_access & NFS4_SHARE_ACCESS_BOTH) ==
+ NFS4_SHARE_ACCESS_WRITE) {
+ if (nfsd_file_acquire_opened(rqstp, fh, NFSD_MAY_READ, NULL, &nf))
+ return false;
+ fp = stp->st_stid.sc_file;
+ spin_lock(&fp->fi_lock);
+ __nfs4_file_get_access(fp, NFS4_SHARE_ACCESS_READ);
+ fp->fi_fds[O_RDONLY] = nf;
+ fp->fi_rdeleg_file = nf;
+ spin_unlock(&fp->fi_lock);
+ }
+ return true;
+}
+
+/*
* The Linux NFS server does not offer write delegations to NFSv4.0
* clients in order to avoid conflicts between write delegations and
* GETATTRs requesting CHANGE or SIZE attributes.
@@ -6181,8 +6217,9 @@ nfs4_delegation_stat(struct nfs4_delegation *dp, struct svc_fh *currentfh,
* open or lock state.
*/
static void
-nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
- struct svc_fh *currentfh)
+nfs4_open_delegation(struct svc_rqst *rqstp, struct nfsd4_open *open,
+ struct nfs4_ol_stateid *stp, struct svc_fh *currentfh,
+ struct svc_fh *fh)
{
struct nfs4_openowner *oo = openowner(stp->st_stateowner);
bool deleg_ts = nfsd4_want_deleg_timestamps(open);
@@ -6227,7 +6264,8 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid));
if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) {
- if (!nfs4_delegation_stat(dp, currentfh, &stat)) {
+ if (!nfsd4_add_rdaccess_to_wrdeleg(rqstp, open, fh, stp) ||
+ !nfs4_delegation_stat(dp, currentfh, &stat)) {
nfs4_put_stid(&dp->dl_stid);
destroy_delegation(dp);
goto out_no_deleg;
@@ -6322,6 +6360,20 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
status = nfs4_check_deleg(cl, open, &dp);
if (status)
goto out;
+ if (dp && nfsd4_is_deleg_cur(open) &&
+ (dp->dl_stid.sc_file != fp)) {
+ /*
+ * RFC8881 section 8.2.4 mandates the server to return
+ * NFS4ERR_BAD_STATEID if the selected table entry does
+ * not match the current filehandle. However, returning
+ * NFS4ERR_BAD_STATEID in the OPEN can cause the client
+ * to repeatedly retry the operation with the same
+ * stateid, since the stateid itself is valid. To avoid
+ * this situation, NFSD returns NFS4ERR_INVAL instead.
+ */
+ status = nfserr_inval;
+ goto out;
+ }
stp = nfsd4_find_and_lock_existing_open(fp, open);
} else {
open->op_file = NULL;
@@ -6383,7 +6435,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
* Attempt to hand out a delegation. No error return, because the
* OPEN succeeds even if we fail.
*/
- nfs4_open_delegation(open, stp, &resp->cstate.current_fh);
+ nfs4_open_delegation(rqstp, open, stp,
+ &resp->cstate.current_fh, current_fh);
/*
* If there is an existing open stateid, it must be updated and
@@ -7076,7 +7129,7 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
return_revoked = true;
if (typemask & SC_TYPE_DELEG)
/* Always allow REVOKED for DELEG so we can
- * retturn the appropriate error.
+ * return the appropriate error.
*/
statusmask |= SC_STATUS_REVOKED;
@@ -7119,10 +7172,6 @@ nfs4_find_file(struct nfs4_stid *s, int flags)
switch (s->sc_type) {
case SC_TYPE_DELEG:
- spin_lock(&s->sc_file->fi_lock);
- ret = nfsd_file_get(s->sc_file->fi_deleg_file);
- spin_unlock(&s->sc_file->fi_lock);
- break;
case SC_TYPE_OPEN:
case SC_TYPE_LOCK:
if (flags & RD_STATE)
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 3afcdbed6e14..ea91bad4eee2 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2500,10 +2500,8 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
if (xdr_stream_decode_u32(argp->xdr, &argp->minorversion) < 0)
return false;
- if (xdr_stream_decode_u32(argp->xdr, &argp->client_opcnt) < 0)
+ if (xdr_stream_decode_u32(argp->xdr, &argp->opcnt) < 0)
return false;
- argp->opcnt = min_t(u32, argp->client_opcnt,
- NFSD_MAX_OPS_PER_COMPOUND);
if (argp->opcnt > ARRAY_SIZE(argp->iops)) {
argp->ops = vcalloc(argp->opcnt, sizeof(*argp->ops));
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3f3e9f6c4250..bc6b776fc657 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1436,7 +1436,7 @@ unsigned int nfsd_net_id;
static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb,
struct netlink_callback *cb,
- struct nfsd_genl_rqstp *rqstp)
+ struct nfsd_genl_rqstp *genl_rqstp)
{
void *hdr;
u32 i;
@@ -1446,22 +1446,22 @@ static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb,
if (!hdr)
return -ENOBUFS;
- if (nla_put_be32(skb, NFSD_A_RPC_STATUS_XID, rqstp->rq_xid) ||
- nla_put_u32(skb, NFSD_A_RPC_STATUS_FLAGS, rqstp->rq_flags) ||
- nla_put_u32(skb, NFSD_A_RPC_STATUS_PROG, rqstp->rq_prog) ||
- nla_put_u32(skb, NFSD_A_RPC_STATUS_PROC, rqstp->rq_proc) ||
- nla_put_u8(skb, NFSD_A_RPC_STATUS_VERSION, rqstp->rq_vers) ||
+ if (nla_put_be32(skb, NFSD_A_RPC_STATUS_XID, genl_rqstp->rq_xid) ||
+ nla_put_u32(skb, NFSD_A_RPC_STATUS_FLAGS, genl_rqstp->rq_flags) ||
+ nla_put_u32(skb, NFSD_A_RPC_STATUS_PROG, genl_rqstp->rq_prog) ||
+ nla_put_u32(skb, NFSD_A_RPC_STATUS_PROC, genl_rqstp->rq_proc) ||
+ nla_put_u8(skb, NFSD_A_RPC_STATUS_VERSION, genl_rqstp->rq_vers) ||
nla_put_s64(skb, NFSD_A_RPC_STATUS_SERVICE_TIME,
- ktime_to_us(rqstp->rq_stime),
+ ktime_to_us(genl_rqstp->rq_stime),
NFSD_A_RPC_STATUS_PAD))
return -ENOBUFS;
- switch (rqstp->rq_saddr.sa_family) {
+ switch (genl_rqstp->rq_saddr.sa_family) {
case AF_INET: {
const struct sockaddr_in *s_in, *d_in;
- s_in = (const struct sockaddr_in *)&rqstp->rq_saddr;
- d_in = (const struct sockaddr_in *)&rqstp->rq_daddr;
+ s_in = (const struct sockaddr_in *)&genl_rqstp->rq_saddr;
+ d_in = (const struct sockaddr_in *)&genl_rqstp->rq_daddr;
if (nla_put_in_addr(skb, NFSD_A_RPC_STATUS_SADDR4,
s_in->sin_addr.s_addr) ||
nla_put_in_addr(skb, NFSD_A_RPC_STATUS_DADDR4,
@@ -1476,8 +1476,8 @@ static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb,
case AF_INET6: {
const struct sockaddr_in6 *s_in, *d_in;
- s_in = (const struct sockaddr_in6 *)&rqstp->rq_saddr;
- d_in = (const struct sockaddr_in6 *)&rqstp->rq_daddr;
+ s_in = (const struct sockaddr_in6 *)&genl_rqstp->rq_saddr;
+ d_in = (const struct sockaddr_in6 *)&genl_rqstp->rq_daddr;
if (nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_SADDR6,
&s_in->sin6_addr) ||
nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_DADDR6,
@@ -1491,9 +1491,9 @@ static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb,
}
}
- for (i = 0; i < rqstp->rq_opcnt; i++)
+ for (i = 0; i < genl_rqstp->rq_opcnt; i++)
if (nla_put_u32(skb, NFSD_A_RPC_STATUS_COMPOUND_OPS,
- rqstp->rq_opnum[i]))
+ genl_rqstp->rq_opnum[i]))
return -ENOBUFS;
genlmsg_end(skb, hdr);
@@ -1569,7 +1569,8 @@ int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb,
int j;
args = rqstp->rq_argp;
- genl_rqstp.rq_opcnt = args->opcnt;
+ genl_rqstp.rq_opcnt = min_t(u32, args->opcnt,
+ ARRAY_SIZE(genl_rqstp.rq_opnum));
for (j = 0; j < genl_rqstp.rq_opcnt; j++)
genl_rqstp.rq_opnum[j] =
args->ops[j].opnum;
@@ -1611,7 +1612,7 @@ out_unlock:
*/
int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info)
{
- int *nthreads, count = 0, nrpools, i, ret = -EOPNOTSUPP, rem;
+ int *nthreads, nrpools = 0, i, ret = -EOPNOTSUPP, rem;
struct net *net = genl_info_net(info);
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
const struct nlattr *attr;
@@ -1621,14 +1622,12 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
/* count number of SERVER_THREADS values */
- nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
- if (nla_type(attr) == NFSD_A_SERVER_THREADS)
- count++;
- }
+ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_THREADS, info->nlhdr,
+ GENL_HDRLEN, rem)
+ nrpools++;
mutex_lock(&nfsd_mutex);
- nrpools = max(count, nfsd_nrpools(net));
nthreads = kcalloc(nrpools, sizeof(int), GFP_KERNEL);
if (!nthreads) {
ret = -ENOMEM;
@@ -1636,12 +1635,11 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info)
}
i = 0;
- nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
- if (nla_type(attr) == NFSD_A_SERVER_THREADS) {
- nthreads[i++] = nla_get_u32(attr);
- if (i >= nrpools)
- break;
- }
+ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_THREADS, info->nlhdr,
+ GENL_HDRLEN, rem) {
+ nthreads[i++] = nla_get_u32(attr);
+ if (i >= nrpools)
+ break;
}
if (info->attrs[NFSD_A_SERVER_GRACETIME] ||
@@ -1782,14 +1780,12 @@ int nfsd_nl_version_set_doit(struct sk_buff *skb, struct genl_info *info)
for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++)
nfsd_minorversion(nn, i, NFSD_CLEAR);
- nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
+ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_PROTO_VERSION, info->nlhdr,
+ GENL_HDRLEN, rem) {
struct nlattr *tb[NFSD_A_VERSION_MAX + 1];
u32 major, minor = 0;
bool enabled;
- if (nla_type(attr) != NFSD_A_SERVER_PROTO_VERSION)
- continue;
-
if (nla_parse_nested(tb, NFSD_A_VERSION_MAX, attr,
nfsd_version_nl_policy, info->extack) < 0)
continue;
@@ -1940,14 +1936,12 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info)
* Walk the list of server_socks from userland and move any that match
* back to sv_permsocks
*/
- nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
+ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr,
+ GENL_HDRLEN, rem) {
struct nlattr *tb[NFSD_A_SOCK_MAX + 1];
const char *xcl_name;
struct sockaddr *sa;
- if (nla_type(attr) != NFSD_A_SERVER_SOCK_ADDR)
- continue;
-
if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr,
nfsd_sock_nl_policy, info->extack) < 0)
continue;
@@ -2002,15 +1996,13 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info)
svc_xprt_destroy_all(serv, net);
/* walk list of addrs again, open any that still don't exist */
- nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
+ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr,
+ GENL_HDRLEN, rem) {
struct nlattr *tb[NFSD_A_SOCK_MAX + 1];
const char *xcl_name;
struct sockaddr *sa;
int ret;
- if (nla_type(attr) != NFSD_A_SERVER_SOCK_ADDR)
- continue;
-
if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr,
nfsd_sock_nl_policy, info->extack) < 0)
continue;
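The three loops above move from nlmsg_for_each_attr() plus an open-coded nla_type() filter to nlmsg_for_each_attr_type(), which yields only attributes of the requested type. A minimal sketch of the counting use (includes abbreviated; NFSD_A_SERVER_THREADS comes from the nfsd netlink uapi header):

#include <net/netlink.h>

static int count_thread_attrs(const struct nlmsghdr *nlh)
{
	const struct nlattr *attr;
	int rem, n = 0;

	nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_THREADS, nlh,
				 GENL_HDRLEN, rem)
		n++;
	return n;
}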
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 1bfd0b4e9af7..1cd0bed57bc2 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -57,9 +57,6 @@ struct readdir_cd {
__be32 err; /* 0, nfserr, or nfserr_eof */
};
-/* Maximum number of operations per session compound */
-#define NFSD_MAX_OPS_PER_COMPOUND 50
-
struct nfsd_genl_rqstp {
struct sockaddr rq_daddr;
struct sockaddr rq_saddr;
@@ -72,7 +69,7 @@ struct nfsd_genl_rqstp {
/* NFSv4 compound */
u32 rq_opcnt;
- u32 rq_opnum[NFSD_MAX_OPS_PER_COMPOUND];
+ u32 rq_opnum[16];
};
extern struct svc_program nfsd_programs[];
@@ -283,6 +280,7 @@ void nfsd_lockd_shutdown(void);
#define nfserr_cb_path_down cpu_to_be32(NFSERR_CB_PATH_DOWN)
#define nfserr_locked cpu_to_be32(NFSERR_LOCKED)
#define nfserr_wrongsec cpu_to_be32(NFSERR_WRONGSEC)
+#define nfserr_delay cpu_to_be32(NFS4ERR_DELAY)
#define nfserr_badiomode cpu_to_be32(NFS4ERR_BADIOMODE)
#define nfserr_badlayout cpu_to_be32(NFS4ERR_BADLAYOUT)
#define nfserr_bad_session_digest cpu_to_be32(NFS4ERR_BAD_SESSION_DIGEST)
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index aef474f1b84b..74cf1f4de174 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -172,6 +172,8 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net,
if (len == 0)
return error;
if (fh->fh_fsid_type == FSID_MAJOR_MINOR) {
+ u32 *fsid = fh_fsid(fh);
+
/* deprecated, convert to type 3 */
len = key_len(FSID_ENCODE_DEV)/4;
fh->fh_fsid_type = FSID_ENCODE_DEV;
@@ -181,17 +183,17 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net,
* confuses sparse, so we must use __force here to
* keep it from complaining.
*/
- fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]),
- ntohl((__force __be32)fh->fh_fsid[1])));
- fh->fh_fsid[1] = fh->fh_fsid[2];
+ fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fsid[0]),
+ ntohl((__force __be32)fsid[1])));
+ fsid[1] = fsid[2];
}
data_left -= len;
if (data_left < 0)
return error;
exp = rqst_exp_find(rqstp ? &rqstp->rq_chandle : NULL,
net, client, gssclient,
- fh->fh_fsid_type, fh->fh_fsid);
- fid = (struct fid *)(fh->fh_fsid + len);
+ fh->fh_fsid_type, fh_fsid(fh));
+ fid = (struct fid *)(fh_fsid(fh) + len);
error = nfserr_stale;
if (IS_ERR(exp)) {
@@ -463,7 +465,7 @@ static void _fh_update(struct svc_fh *fhp, struct svc_export *exp,
{
if (dentry != exp->ex_path.dentry) {
struct fid *fid = (struct fid *)
- (fhp->fh_handle.fh_fsid + fhp->fh_handle.fh_size/4 - 1);
+ (fh_fsid(&fhp->fh_handle) + fhp->fh_handle.fh_size/4 - 1);
int maxsize = (fhp->fh_maxsize - fhp->fh_handle.fh_size)/4;
int fh_flags = (exp->ex_flags & NFSEXP_NOSUBTREECHECK) ? 0 :
EXPORT_FH_CONNECTABLE;
@@ -614,7 +616,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
fhp->fh_handle.fh_auth_type = 0;
mk_fsid(fhp->fh_handle.fh_fsid_type,
- fhp->fh_handle.fh_fsid,
+ fh_fsid(&fhp->fh_handle),
ex_dev,
d_inode(exp->ex_path.dentry)->i_ino,
exp->ex_fsid, exp->ex_uuid);
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 5103c2f4d225..1cf979722521 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -49,18 +49,19 @@ struct knfsd_fh {
* Points to the current size while
* building a new file handle.
*/
- union {
- char fh_raw[NFS4_FHSIZE];
- struct {
- u8 fh_version; /* == 1 */
- u8 fh_auth_type; /* deprecated */
- u8 fh_fsid_type;
- u8 fh_fileid_type;
- u32 fh_fsid[]; /* flexible-array member */
- };
- };
+ u8 fh_raw[NFS4_FHSIZE];
};
+#define fh_version fh_raw[0]
+#define fh_auth_type fh_raw[1]
+#define fh_fsid_type fh_raw[2]
+#define fh_fileid_type fh_raw[3]
+
+static inline u32 *fh_fsid(const struct knfsd_fh *fh)
+{
+ return (u32 *)&fh->fh_raw[4];
+}
+
static inline __u32 ino_t_to_u32(ino_t ino)
{
return (__u32) ino;
@@ -260,9 +261,12 @@ static inline bool fh_match(const struct knfsd_fh *fh1,
static inline bool fh_fsid_match(const struct knfsd_fh *fh1,
const struct knfsd_fh *fh2)
{
+ u32 *fsid1 = fh_fsid(fh1);
+ u32 *fsid2 = fh_fsid(fh2);
+
if (fh1->fh_fsid_type != fh2->fh_fsid_type)
return false;
- if (memcmp(fh1->fh_fsid, fh2->fh_fsid, key_len(fh1->fh_fsid_type)) != 0)
+ if (memcmp(fsid1, fsid2, key_len(fh1->fh_fsid_type)) != 0)
return false;
return true;
}
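Replacing the union-with-flexible-array by a plain fh_raw byte array keeps the on-wire layout while avoiding the flexible-array-in-union construct: the first four octets remain version, auth type, fsid type and fileid type, and fh_fsid() returns the u32 words starting at byte offset 4. A small usage sketch with a hypothetical helper:

static bool fh_is_version1(const struct knfsd_fh *fh)
{
	/* fh_version expands to fh_raw[0] */
	return fh->fh_version == 1;
}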
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index c10fa8128a8a..8f71f5748c75 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -575,7 +575,7 @@ static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp,
buf->pages = rqstp->rq_next_page;
rqstp->rq_next_page++;
- xdr_init_encode_pages(xdr, buf, buf->pages, NULL);
+ xdr_init_encode_pages(xdr, buf);
}
/*
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 1995bca158b8..8adc2550129e 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -665,6 +665,7 @@ struct nfs4_file {
atomic_t fi_access[2];
u32 fi_share_deny;
struct nfsd_file *fi_deleg_file;
+ struct nfsd_file *fi_rdeleg_file;
int fi_delegees;
struct knfsd_fh fi_fhandle;
bool fi_had_conflict;
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 3c5505ef5e3a..a664fdf1161e 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -344,7 +344,7 @@ TRACE_EVENT(nfsd_exp_find_key,
int status),
TP_ARGS(key, status),
TP_STRUCT__entry(
- __field(int, fsidtype)
+ __field(u8, fsidtype)
__array(u32, fsid, 6)
__string(auth_domain, key->ek_client->name)
__field(int, status)
@@ -367,7 +367,7 @@ TRACE_EVENT(nfsd_expkey_update,
TP_PROTO(const struct svc_expkey *key, const char *exp_path),
TP_ARGS(key, exp_path),
TP_STRUCT__entry(
- __field(int, fsidtype)
+ __field(u8, fsidtype)
__array(u32, fsid, 6)
__string(auth_domain, key->ek_client->name)
__string(path, exp_path)
@@ -1108,7 +1108,6 @@ DEFINE_NFSD_FILE_EVENT(nfsd_file_free);
DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash);
DEFINE_NFSD_FILE_EVENT(nfsd_file_put);
DEFINE_NFSD_FILE_EVENT(nfsd_file_closing);
-DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_queue);
TRACE_EVENT(nfsd_file_alloc,
TP_PROTO(
@@ -1344,9 +1343,7 @@ DEFINE_EVENT(nfsd_file_gc_class, name, \
TP_ARGS(nf))
DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_add);
-DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_add_disposed);
DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del);
-DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del_disposed);
DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_in_use);
DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_writeback);
DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_referenced);
@@ -1380,7 +1377,6 @@ DEFINE_EVENT(nfsd_file_lruwalk_class, name, \
TP_ARGS(removed, remaining))
DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_gc_removed);
-DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_gc_recent);
DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_shrinker_removed);
TRACE_EVENT(nfsd_file_close,
@@ -2103,25 +2099,6 @@ TRACE_EVENT(nfsd_ctl_maxblksize,
)
);
-TRACE_EVENT(nfsd_ctl_maxconn,
- TP_PROTO(
- const struct net *net,
- int maxconn
- ),
- TP_ARGS(net, maxconn),
- TP_STRUCT__entry(
- __field(unsigned int, netns_ino)
- __field(int, maxconn)
- ),
- TP_fast_assign(
- __entry->netns_ino = net->ns.inum;
- __entry->maxconn = maxconn;
- ),
- TP_printk("maxconn=%d",
- __entry->maxconn
- )
-);
-
TRACE_EVENT(nfsd_ctl_time,
TP_PROTO(
const struct net *net,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index cd689df2ca5d..98ab55ba3ced 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1086,10 +1086,13 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
{
unsigned long v, total;
struct iov_iter iter;
- loff_t ppos = offset;
+ struct kiocb kiocb;
ssize_t host_err;
size_t len;
+ init_sync_kiocb(&kiocb, file);
+ kiocb.ki_pos = offset;
+
v = 0;
total = *count;
while (total) {
@@ -1104,7 +1107,7 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
trace_nfsd_read_vector(rqstp, fhp, offset, *count);
iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, *count);
- host_err = vfs_iter_read(file, &iter, &ppos, 0);
+ host_err = vfs_iocb_iter_read(file, &kiocb, &iter);
return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
}
@@ -1170,15 +1173,14 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
struct file *file = nf->nf_file;
struct super_block *sb = file_inode(file)->i_sb;
+ struct kiocb kiocb;
struct svc_export *exp;
struct iov_iter iter;
errseq_t since;
__be32 nfserr;
int host_err;
- loff_t pos = offset;
unsigned long exp_op_flags = 0;
unsigned int pflags = current->flags;
- rwf_t flags = 0;
bool restore_flags = false;
unsigned int nvecs;
@@ -1204,16 +1206,17 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (!EX_ISSYNC(exp))
stable = NFS_UNSTABLE;
-
+ init_sync_kiocb(&kiocb, file);
+ kiocb.ki_pos = offset;
if (stable && !fhp->fh_use_wgather)
- flags |= RWF_SYNC;
+ kiocb.ki_flags |= IOCB_DSYNC;
nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload);
iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
since = READ_ONCE(file->f_wb_err);
if (verf)
nfsd_copy_write_verifier(verf, nn);
- host_err = vfs_iter_write(file, &iter, &pos, flags);
+ host_err = vfs_iocb_iter_write(file, &kiocb, &iter);
if (host_err < 0) {
commit_reset_write_verifier(nn, rqstp, host_err);
goto out_nfserr;
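Both nfsd_iter_read() and nfsd_vfs_write() above switch from the positional vfs_iter_read()/vfs_iter_write() helpers to a stack kiocb, with IOCB_DSYNC on the kiocb standing in for the old rwf_t sync flag. A minimal sketch of the read side, assuming an already-opened struct file:

#include <linux/fs.h>
#include <linux/uio.h>

static ssize_t read_at(struct file *file, struct iov_iter *iter, loff_t pos)
{
	struct kiocb kiocb;

	init_sync_kiocb(&kiocb, file);
	kiocb.ki_pos = pos;
	return vfs_iocb_iter_read(file, &kiocb, iter);
}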
@@ -1864,7 +1867,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
struct svc_fh *tfhp, char *tname, int tlen)
{
struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap;
- struct inode *fdir, *tdir;
int type = S_IFDIR;
__be32 err;
int host_err;
@@ -1880,10 +1882,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
goto out;
fdentry = ffhp->fh_dentry;
- fdir = d_inode(fdentry);
tdentry = tfhp->fh_dentry;
- tdir = d_inode(tdentry);
err = nfserr_perm;
if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))
@@ -1944,10 +1944,10 @@ retry:
} else {
struct renamedata rd = {
.old_mnt_idmap = &nop_mnt_idmap,
- .old_dir = fdir,
+ .old_parent = fdentry,
.old_dentry = odentry,
.new_mnt_idmap = &nop_mnt_idmap,
- .new_dir = tdir,
+ .new_parent = tdentry,
.new_dentry = ndentry,
};
int retries;
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index aa2a356da784..a23bc56051ca 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -870,7 +870,6 @@ struct nfsd4_compoundargs {
char * tag;
u32 taglen;
u32 minorversion;
- u32 client_opcnt;
u32 opcnt;
bool splice_ok;
struct nfsd4_op *ops;
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 0d8f7fb15c2e..dd0c8e560ef6 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -2102,11 +2102,13 @@ static int nilfs_btree_propagate(struct nilfs_bmap *btree,
ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
if (ret < 0) {
- if (unlikely(ret == -ENOENT))
+ if (unlikely(ret == -ENOENT)) {
nilfs_crit(btree->b_inode->i_sb,
"writing node/leaf block does not appear in b-tree (ino=%lu) at key=%llu, level=%d",
btree->b_inode->i_ino,
(unsigned long long)key, level);
+ ret = -EINVAL;
+ }
goto out;
}
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 9b7f8e9655a2..6ca3d74be1e1 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -96,7 +96,7 @@ static void nilfs_commit_chunk(struct folio *folio,
int err;
nr_dirty = nilfs_page_count_clean_buffers(folio, from, to);
- copied = block_write_end(NULL, mapping, pos, len, len, folio, NULL);
+ copied = block_write_end(pos, len, len, folio);
if (pos + copied > dir->i_size)
i_size_write(dir, pos + copied);
if (IS_DIRSYNC(dir))
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 893ab36824cc..2d8dc6b35b54 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -273,6 +273,9 @@ static int nilfs_direct_propagate(struct nilfs_bmap *bmap,
dat = nilfs_bmap_get_dat(bmap);
key = nilfs_bmap_data_get_key(bmap, bh);
ptr = nilfs_direct_get_ptr(bmap, key);
+ if (ptr == NILFS_BMAP_INVALID_PTR)
+ return -EINVAL;
+
if (!buffer_nilfs_volatile(bh)) {
oldreq.pr_entry_nr = ptr;
newreq.pr_entry_nr = ptr;
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 0e3fc5ba33c7..1b8d754db44d 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -125,10 +125,10 @@ static const struct vm_operations_struct nilfs_file_vm_ops = {
.page_mkwrite = nilfs_page_mkwrite,
};
-static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int nilfs_file_mmap_prepare(struct vm_area_desc *desc)
{
- file_accessed(file);
- vma->vm_ops = &nilfs_file_vm_ops;
+ file_accessed(desc->file);
+ desc->vm_ops = &nilfs_file_vm_ops;
return 0;
}
@@ -144,7 +144,7 @@ const struct file_operations nilfs_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = nilfs_compat_ioctl,
#endif /* CONFIG_COMPAT */
- .mmap = nilfs_file_mmap,
+ .mmap_prepare = nilfs_file_mmap_prepare,
.open = generic_file_open,
/* .release = nilfs_release_file, */
.fsync = nilfs_sync_file,
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 6613b8fcceb0..87ddde159f0c 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -218,7 +218,8 @@ void nilfs_write_failed(struct address_space *mapping, loff_t to)
}
}
-static int nilfs_write_begin(struct file *file, struct address_space *mapping,
+static int nilfs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
@@ -237,7 +238,8 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping,
return err;
}
-static int nilfs_write_end(struct file *file, struct address_space *mapping,
+static int nilfs_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
{
@@ -248,7 +250,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
nr_dirty = nilfs_page_count_clean_buffers(folio, start,
start + copied);
- copied = generic_write_end(file, mapping, pos, len, copied, folio,
+ copied = generic_write_end(iocb, mapping, pos, len, copied, folio,
fsdata);
nilfs_set_file_dirty(inode, nr_dirty);
err = nilfs_transaction_commit(inode->i_sb);
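
->write_begin()/->write_end() now receive a const struct kiocb * in place of the struct file *. Implementations that need the file can take it from iocb->ki_filp (the orangefs hunk later in this series does exactly that), and internal callers that used to pass a NULL file now pass a NULL iocb, as the ntfs3 hunks below show. A sketch of the new prototype pair, with hypothetical names:

static int example_write_begin(const struct kiocb *iocb,
			       struct address_space *mapping,
			       loff_t pos, unsigned len,
			       struct folio **foliop, void **fsdata);
static int example_write_end(const struct kiocb *iocb,
			     struct address_space *mapping,
			     loff_t pos, unsigned len, unsigned copied,
			     struct folio *folio, void *fsdata);
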
@@ -472,11 +474,18 @@ static int __nilfs_read_inode(struct super_block *sb,
inode->i_op = &nilfs_symlink_inode_operations;
inode_nohighmem(inode);
inode->i_mapping->a_ops = &nilfs_aops;
- } else {
+ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
inode->i_op = &nilfs_special_inode_operations;
init_special_inode(
inode, inode->i_mode,
huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
+ } else {
+ nilfs_error(sb,
+ "invalid file type bits in mode 0%o for inode %lu",
+ inode->i_mode, ino);
+ err = -EIO;
+ goto failed_unmap;
}
nilfs_ifile_unmap_inode(raw_inode);
brelse(bh);
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index a66d62a51f77..3288c3b4be9e 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -118,7 +118,7 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
*
* Return: always 0 as success.
*/
-int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int nilfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
@@ -136,7 +136,7 @@ int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
* Return: 0 on success, or a negative error code on failure.
*/
int nilfs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct nilfs_transaction_info ti;
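
struct fileattr is renamed to struct file_kattr across this series (nilfs2, ocfs2, orangefs, overlayfs); the field layout and the get/set contract are unchanged, so the conversions are purely mechanical. Roughly, a converted getter keeps its old body (a sketch; assumes the fileattr_fill_flags() helper keeps its current behavior under the new type name):

int example_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
	struct inode *inode = d_inode(dentry);

	/* only the parameter type changed */
	fileattr_fill_flags(fa, NILFS_I(inode)->i_flags & FS_FL_USER_VISIBLE);
	return 0;
}
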
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 2f850a18d6e7..946b0d3534a5 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -422,8 +422,6 @@ static int nilfs_mdt_write_folio(struct folio *folio,
if (wbc->sync_mode == WB_SYNC_ALL)
err = nilfs_construct_segment(sb);
- else if (wbc->for_reclaim)
- nilfs_flush_segment(sb, inode->i_ino);
return err;
}
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index cb6ed54accd7..f466daa39440 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -268,9 +268,9 @@ int nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
extern int nilfs_sync_file(struct file *, loff_t, loff_t, int);
/* ioctl.c */
-int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *m);
+int nilfs_fileattr_get(struct dentry *dentry, struct file_kattr *m);
int nilfs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
long nilfs_ioctl(struct file *, unsigned int, unsigned long);
long nilfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *,
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 22aecf6e2344..a9c61d0492cb 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -560,8 +560,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
if (unlikely(err))
goto failed_folio;
- block_write_end(NULL, inode->i_mapping, pos, blocksize,
- blocksize, folio, NULL);
+ block_write_end(pos, blocksize, blocksize, folio);
folio_unlock(folio);
folio_put(folio);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 83970d97840b..f15ca6fc400d 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2221,22 +2221,6 @@ static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn)
spin_unlock(&sci->sc_state_lock);
}
-/**
- * nilfs_flush_segment - trigger a segment construction for resource control
- * @sb: super block
- * @ino: inode number of the file to be flushed out.
- */
-void nilfs_flush_segment(struct super_block *sb, ino_t ino)
-{
- struct the_nilfs *nilfs = sb->s_fs_info;
- struct nilfs_sc_info *sci = nilfs->ns_writer;
-
- if (!sci || nilfs_doing_construction())
- return;
- nilfs_segctor_do_flush(sci, NILFS_MDT_INODE(sb, ino) ? ino : 0);
- /* assign bit 0 to data files */
-}
-
struct nilfs_segctor_wait_request {
wait_queue_entry_t wq;
__u32 seq;
@@ -2501,7 +2485,7 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
static void nilfs_construction_timeout(struct timer_list *t)
{
- struct nilfs_sc_info *sci = from_timer(sci, t, sc_timer);
+ struct nilfs_sc_info *sci = timer_container_of(sci, t, sc_timer);
wake_up_process(sci->sc_task);
}
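
from_timer() is renamed timer_container_of() tree-wide; it is still a container_of() that resolves the object embedding the timer_list, so the three arguments are unchanged:

static void example_timeout(struct timer_list *t)
{
	/* same resolution as the old from_timer(sci, t, sc_timer) */
	struct nilfs_sc_info *sci = timer_container_of(sci, t, sc_timer);

	wake_up_process(sci->sc_task);
}
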
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index f723f47ddc4e..4b39ed43ae72 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -226,7 +226,6 @@ extern void nilfs_relax_pressure_in_lock(struct super_block *);
extern int nilfs_construct_segment(struct super_block *);
extern int nilfs_construct_dsync_segment(struct super_block *, struct inode *,
loff_t, loff_t);
-extern void nilfs_flush_segment(struct super_block *, ino_t);
extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *,
void **);
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index c4cdaf5fa7ed..9fb73bafd41d 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -308,6 +308,10 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
goto out_err;
}
+ error = file_f_owner_allocate(filp);
+ if (error)
+ goto out_err;
+
/* new fsnotify mark, we expect most fcntl calls to add a new mark */
new_dn_mark = kmem_cache_alloc(dnotify_mark_cache, GFP_KERNEL);
if (!new_dn_mark) {
@@ -315,10 +319,6 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
goto out_err;
}
- error = file_f_owner_allocate(filp);
- if (error)
- goto out_err;
-
/* set up the new_fsn_mark and new_dn_mark */
new_fsn_mark = &new_dn_mark->fsn_mark;
fsnotify_init_mark(new_fsn_mark, dnotify_group);
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 3083643b864b..bfe884d624e7 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -454,7 +454,13 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode,
dwords = fh_len >> 2;
type = exportfs_encode_fid(inode, buf, &dwords);
err = -EINVAL;
- if (type <= 0 || type == FILEID_INVALID || fh_len != dwords << 2)
+ /*
+ * Unlike file_handle, type and len of struct fanotify_fh are u8.
+ * Traditionally, filesystems return handle_type < 0xff, but there
+ * is no enforcement for that in the vfs.
+ */
+ BUILD_BUG_ON(MAX_HANDLE_SZ > 0xff || FILEID_INVALID > 0xff);
+ if (type <= 0 || type >= FILEID_INVALID || fh_len != dwords << 2)
goto out_err;
fh->type = type;
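
The tightened bound exists because fanotify_fh stores the handle type in a u8, while exportfs handle types are ints: any type at or above FILEID_INVALID (0xff) would alias or truncate on assignment. A compact illustration of the failure mode the new check rules out (hypothetical values, not from the patch):

void example(int handle_type)
{
	u8 stored;

	/* e.g. handle_type == 0x1ff would silently become 0xff
	 * (FILEID_INVALID) without the type >= FILEID_INVALID check
	 */
	stored = handle_type;
	(void)stored;
}
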
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index e2b4f17a48bb..079b868552c2 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -199,8 +199,8 @@ static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask,
}
/* Are there any inode/mount/sb objects that watch for these events? */
-static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask,
- __u32 mask)
+static inline __u32 fsnotify_object_watched(struct inode *inode, __u32 mnt_mask,
+ __u32 mask)
{
__u32 marks_mask = READ_ONCE(inode->i_fsnotify_mask) | mnt_mask |
READ_ONCE(inode->i_sb->s_fsnotify_mask);
@@ -656,20 +656,20 @@ EXPORT_SYMBOL_GPL(fsnotify);
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
/*
- * At open time we check fsnotify_sb_has_priority_watchers() and set the
- * FMODE_NONOTIFY_ mode bits accordingly.
+ * At open time we check fsnotify_sb_has_priority_watchers(), call the open perm
+ * hook and set the FMODE_NONOTIFY_ mode bits accordingly.
* Later, fsnotify permission hooks do not check if there are permission event
* watches, but that there were permission event watches at open time.
*/
-void file_set_fsnotify_mode_from_watchers(struct file *file)
+int fsnotify_open_perm_and_set_mode(struct file *file)
{
struct dentry *dentry = file->f_path.dentry, *parent;
struct super_block *sb = dentry->d_sb;
- __u32 mnt_mask, p_mask;
+ __u32 mnt_mask, p_mask = 0;
/* Is it a file opened by fanotify? */
if (FMODE_FSNOTIFY_NONE(file->f_mode))
- return;
+ return 0;
/*
* Permission events is a super set of pre-content events, so if there
@@ -679,45 +679,64 @@ void file_set_fsnotify_mode_from_watchers(struct file *file)
if (likely(!fsnotify_sb_has_priority_watchers(sb,
FSNOTIFY_PRIO_CONTENT))) {
file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM);
- return;
+ return 0;
}
/*
- * If there are permission event watchers but no pre-content event
- * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that.
+ * OK, there are some permission event watchers. Check if anybody is
+ * watching for permission events on *this* file.
*/
- if ((!d_is_dir(dentry) && !d_is_reg(dentry)) ||
- likely(!fsnotify_sb_has_priority_watchers(sb,
- FSNOTIFY_PRIO_PRE_CONTENT))) {
- file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM);
- return;
+ mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask);
+ p_mask = fsnotify_object_watched(d_inode(dentry), mnt_mask,
+ ALL_FSNOTIFY_PERM_EVENTS);
+ if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) {
+ parent = dget_parent(dentry);
+ p_mask |= fsnotify_inode_watches_children(d_inode(parent));
+ dput(parent);
}
/*
- * OK, there are some pre-content watchers. Check if anybody is
- * watching for pre-content events on *this* file.
+ * Legacy FAN_ACCESS_PERM events have very high performance overhead,
+ * so unlikely to be used in the wild. If they are used there will be
+ * no optimizations at all.
*/
- mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask);
- if (unlikely(fsnotify_object_watched(d_inode(dentry), mnt_mask,
- FSNOTIFY_PRE_CONTENT_EVENTS))) {
- /* Enable pre-content events */
+ if (unlikely(p_mask & FS_ACCESS_PERM)) {
+ /* Enable all permission and pre-content events */
file_set_fsnotify_mode(file, 0);
- return;
+ goto open_perm;
}
- /* Is parent watching for pre-content events on this file? */
- if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) {
- parent = dget_parent(dentry);
- p_mask = fsnotify_inode_watches_children(d_inode(parent));
- dput(parent);
- if (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS) {
- /* Enable pre-content events */
- file_set_fsnotify_mode(file, 0);
- return;
- }
+ /*
+ * Pre-content events are only supported on regular files.
+ * If there are pre-content event watchers and no permission access
+ * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that.
+ * That is the common case with HSM service.
+ */
+ if (d_is_reg(dentry) && (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS)) {
+ file_set_fsnotify_mode(file, FMODE_NONOTIFY |
+ FMODE_NONOTIFY_PERM);
+ goto open_perm;
}
- /* Nobody watching for pre-content events from this file */
- file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM);
+
+ /* Nobody watching permission and pre-content events on this file */
+ file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM);
+
+open_perm:
+ /*
+ * Send open perm events depending on object masks and regardless of
+ * FMODE_NONOTIFY_PERM.
+ */
+ if (file->f_flags & __FMODE_EXEC && p_mask & FS_OPEN_EXEC_PERM) {
+ int ret = fsnotify_path(&file->f_path, FS_OPEN_EXEC_PERM);
+
+ if (ret)
+ return ret;
+ }
+
+ if (p_mask & FS_OPEN_PERM)
+ return fsnotify_path(&file->f_path, FS_OPEN_PERM);
+
+ return 0;
}
#endif
diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c
index b6da80c69ca6..1b5c865a0339 100644
--- a/fs/ntfs3/dir.c
+++ b/fs/ntfs3/dir.c
@@ -304,6 +304,9 @@ static inline bool ntfs_dir_emit(struct ntfs_sb_info *sbi,
if (sbi->options->nohidden && (fname->dup.fa & FILE_ATTRIBUTE_HIDDEN))
return true;
+ if (fname->name_len + sizeof(struct NTFS_DE) > le16_to_cpu(e->size))
+ return true;
+
name_len = ntfs_utf16_to_nls(sbi, fname->name, fname->name_len, name,
PATH_MAX);
if (name_len <= 0) {
@@ -329,8 +332,7 @@ static inline bool ntfs_dir_emit(struct ntfs_sb_info *sbi,
* It does additional locks/reads just to get the type of name.
* Should we use additional mount option to enable branch below?
*/
- if (((fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT) ||
- fname->dup.ea_size) &&
+ if (fname->dup.extend_data &&
ino != ni->mi.rno) {
struct inode *inode = ntfs_iget5(sbi->sb, &e->ref, NULL);
if (!IS_ERR_OR_NULL(inode)) {
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 34ed242e1063..c1ece707b195 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -57,6 +57,10 @@ long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg)
struct inode *inode = file_inode(filp);
struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ntfs_i(inode))))
+ return -EINVAL;
+
switch (cmd) {
case FITRIM:
return ntfs_ioctl_fitrim(sbi, arg);
@@ -81,6 +85,10 @@ int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct inode *inode = d_inode(path->dentry);
struct ntfs_inode *ni = ntfs_i(inode);
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
stat->result_mask |= STATX_BTIME;
stat->btime = ni->i_crtime;
stat->blksize = ni->mi.sbi->cluster_size; /* 512, 1K, ..., 2M */
@@ -154,13 +162,13 @@ static int ntfs_extend_initialized_size(struct file *file,
if (pos + len > new_valid)
len = new_valid - pos;
- err = ntfs_write_begin(file, mapping, pos, len, &folio, NULL);
+ err = ntfs_write_begin(NULL, mapping, pos, len, &folio, NULL);
if (err)
goto out;
folio_zero_range(folio, zerofrom, folio_size(folio) - zerofrom);
- err = ntfs_write_end(file, mapping, pos, len, len, folio, NULL);
+ err = ntfs_write_end(NULL, mapping, pos, len, len, folio, NULL);
if (err < 0)
goto out;
pos += len;
@@ -261,16 +269,21 @@ out:
}
/*
- * ntfs_file_mmap - file_operations::mmap
+ * ntfs_file_mmap_prepare - file_operations::mmap_prepare
*/
-static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int ntfs_file_mmap_prepare(struct vm_area_desc *desc)
{
+ struct file *file = desc->file;
struct inode *inode = file_inode(file);
struct ntfs_inode *ni = ntfs_i(inode);
- u64 from = ((u64)vma->vm_pgoff << PAGE_SHIFT);
- bool rw = vma->vm_flags & VM_WRITE;
+ u64 from = ((u64)desc->pgoff << PAGE_SHIFT);
+ bool rw = desc->vm_flags & VM_WRITE;
int err;
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
return -EIO;
@@ -291,7 +304,7 @@ static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma)
if (rw) {
u64 to = min_t(loff_t, i_size_read(inode),
- from + vma->vm_end - vma->vm_start);
+ from + desc->end - desc->start);
if (is_sparsed(ni)) {
/* Allocate clusters for rw map. */
@@ -310,7 +323,10 @@ static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma)
}
if (ni->i_valid < to) {
- inode_lock(inode);
+ if (!inode_trylock(inode)) {
+ err = -EAGAIN;
+ goto out;
+ }
err = ntfs_extend_initialized_size(file, ni,
ni->i_valid, to);
inode_unlock(inode);
@@ -319,7 +335,7 @@ static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma)
}
}
- err = generic_file_mmap(file, vma);
+ err = generic_file_mmap_prepare(desc);
out:
return err;
}
@@ -735,6 +751,10 @@ int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
umode_t mode = inode->i_mode;
int err;
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
return -EIO;
@@ -795,6 +815,10 @@ static int check_read_restriction(struct inode *inode)
{
struct ntfs_inode *ni = ntfs_i(inode);
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
return -EIO;
@@ -913,7 +937,8 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
struct ntfs_inode *ni = ntfs_i(inode);
u64 valid = ni->i_valid;
struct ntfs_sb_info *sbi = ni->mi.sbi;
- struct page *page, **pages = NULL;
+ struct page **pages = NULL;
+ struct folio *folio;
size_t written = 0;
u8 frame_bits = NTFS_LZNT_CUNIT + sbi->cluster_bits;
u32 frame_size = 1u << frame_bits;
@@ -923,7 +948,6 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
u64 frame_vbo;
pgoff_t index;
bool frame_uptodate;
- struct folio *folio;
if (frame_size < PAGE_SIZE) {
/*
@@ -977,8 +1001,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
pages_per_frame);
if (err) {
for (ip = 0; ip < pages_per_frame; ip++) {
- page = pages[ip];
- folio = page_folio(page);
+ folio = page_folio(pages[ip]);
folio_unlock(folio);
folio_put(folio);
}
@@ -989,10 +1012,9 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
ip = off >> PAGE_SHIFT;
off = offset_in_page(valid);
for (; ip < pages_per_frame; ip++, off = 0) {
- page = pages[ip];
- folio = page_folio(page);
- zero_user_segment(page, off, PAGE_SIZE);
- flush_dcache_page(page);
+ folio = page_folio(pages[ip]);
+ folio_zero_segment(folio, off, PAGE_SIZE);
+ flush_dcache_folio(folio);
folio_mark_uptodate(folio);
}
@@ -1001,8 +1023,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
ni_unlock(ni);
for (ip = 0; ip < pages_per_frame; ip++) {
- page = pages[ip];
- folio = page_folio(page);
+ folio = page_folio(pages[ip]);
folio_mark_uptodate(folio);
folio_unlock(folio);
folio_put(folio);
@@ -1046,8 +1067,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
if (err) {
for (ip = 0; ip < pages_per_frame;
ip++) {
- page = pages[ip];
- folio = page_folio(page);
+ folio = page_folio(pages[ip]);
folio_unlock(folio);
folio_put(folio);
}
@@ -1065,10 +1085,10 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
for (;;) {
size_t cp, tail = PAGE_SIZE - off;
- page = pages[ip];
- cp = copy_page_from_iter_atomic(page, off,
+ folio = page_folio(pages[ip]);
+ cp = copy_folio_from_iter_atomic(folio, off,
min(tail, bytes), from);
- flush_dcache_page(page);
+ flush_dcache_folio(folio);
copied += cp;
bytes -= cp;
@@ -1088,9 +1108,8 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
ni_unlock(ni);
for (ip = 0; ip < pages_per_frame; ip++) {
- page = pages[ip];
- ClearPageDirty(page);
- folio = page_folio(page);
+ folio = page_folio(pages[ip]);
+ folio_clear_dirty(folio);
folio_mark_uptodate(folio);
folio_unlock(folio);
folio_put(folio);
@@ -1135,6 +1154,10 @@ static int check_write_restriction(struct inode *inode)
{
struct ntfs_inode *ni = ntfs_i(inode);
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
return -EIO;
@@ -1217,6 +1240,10 @@ int ntfs_file_open(struct inode *inode, struct file *file)
{
struct ntfs_inode *ni = ntfs_i(inode);
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
return -EIO;
@@ -1286,6 +1313,10 @@ int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
int err;
struct ntfs_inode *ni = ntfs_i(inode);
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
err = fiemap_prep(inode, fieinfo, start, &len, ~FIEMAP_FLAG_XATTR);
if (err)
return err;
@@ -1336,7 +1367,7 @@ const struct file_operations ntfs_file_operations = {
#endif
.splice_read = ntfs_file_splice_read,
.splice_write = ntfs_file_splice_write,
- .mmap = ntfs_file_mmap,
+ .mmap_prepare = ntfs_file_mmap_prepare,
.open = ntfs_file_open,
.fsync = generic_file_fsync,
.fallocate = ntfs_fallocate,
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index 756e1306fe6c..8f9fe1d7a690 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -3003,8 +3003,7 @@ int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
* ni_rename - Remove one name and insert new name.
*/
int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni,
- struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *new_de,
- bool *is_bad)
+ struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *new_de)
{
int err;
struct NTFS_DE *de2 = NULL;
@@ -3027,8 +3026,8 @@ int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni,
err = ni_add_name(new_dir_ni, ni, new_de);
if (!err) {
err = ni_remove_name(dir_ni, ni, de, &de2, &undo);
- if (err && ni_remove_name(new_dir_ni, ni, new_de, &de2, &undo))
- *is_bad = true;
+ WARN_ON(err && ni_remove_name(new_dir_ni, ni, new_de, &de2,
+ &undo));
}
/*
@@ -3119,11 +3118,21 @@ static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup,
}
}
- /* TODO: Fill reparse info. */
- dup->reparse = 0;
- dup->ea_size = 0;
+ dup->extend_data = 0;
- if (ni->ni_flags & NI_FLAG_EA) {
+ if (dup->fa & FILE_ATTRIBUTE_REPARSE_POINT) {
+ attr = ni_find_attr(ni, NULL, NULL, ATTR_REPARSE, NULL, 0, NULL,
+ NULL);
+
+ if (attr) {
+ const struct REPARSE_POINT *rp;
+
+ rp = resident_data_ex(attr, sizeof(struct REPARSE_POINT));
+ /* If ATTR_REPARSE exists 'rp' can't be NULL. */
+ if (rp)
+ dup->extend_data = rp->ReparseTag;
+ }
+ } else if (ni->ni_flags & NI_FLAG_EA) {
attr = ni_find_attr(ni, attr, &le, ATTR_EA_INFO, NULL, 0, NULL,
NULL);
if (attr) {
@@ -3132,7 +3141,7 @@ static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup,
info = resident_data_ex(attr, sizeof(struct EA_INFO));
/* If ATTR_EA_INFO exists 'info' can't be NULL. */
if (info)
- dup->ea_size = info->size_pack;
+ dup->extend_data = info->size;
}
}
@@ -3199,6 +3208,10 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
if (is_bad_inode(inode) || sb_rdonly(sb))
return 0;
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(sb)))
return -EIO;
diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c
index df81f1f7330c..c7a2f191254d 100644
--- a/fs/ntfs3/fsntfs.c
+++ b/fs/ntfs3/fsntfs.c
@@ -905,9 +905,13 @@ void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait)
void ntfs_bad_inode(struct inode *inode, const char *hint)
{
struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
+ struct ntfs_inode *ni = ntfs_i(inode);
ntfs_inode_err(inode, "%s", hint);
- make_bad_inode(inode);
+
+ /* Do not call make_bad_inode()! */
+ ni->ni_bad = true;
+
/* Avoid recursion if bad inode is $Volume. */
if (inode->i_ino != MFT_REC_VOL &&
!(sbi->flags & NTFS_FLAGS_LOG_REPLAYING)) {
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 0f0d27d4644a..37cbbee7fa58 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -878,6 +878,10 @@ static int ntfs_resident_writepage(struct folio *folio,
struct ntfs_inode *ni = ntfs_i(inode);
int ret;
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
return -EIO;
@@ -896,6 +900,10 @@ static int ntfs_writepages(struct address_space *mapping,
{
struct inode *inode = mapping->host;
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ntfs_i(inode))))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
return -EIO;
@@ -912,13 +920,17 @@ static int ntfs_get_block_write_begin(struct inode *inode, sector_t vbn,
bh_result, create, GET_BLOCK_WRITE_BEGIN);
}
-int ntfs_write_begin(struct file *file, struct address_space *mapping,
+int ntfs_write_begin(const struct kiocb *iocb, struct address_space *mapping,
loff_t pos, u32 len, struct folio **foliop, void **fsdata)
{
int err;
struct inode *inode = mapping->host;
struct ntfs_inode *ni = ntfs_i(inode);
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
return -EIO;
@@ -957,7 +969,8 @@ out:
/*
* ntfs_write_end - Address_space_operations::write_end.
*/
-int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
+int ntfs_write_end(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos,
u32 len, u32 copied, struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
@@ -989,7 +1002,7 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
folio_unlock(folio);
folio_put(folio);
} else {
- err = generic_write_end(file, mapping, pos, len, copied, folio,
+ err = generic_write_end(iocb, mapping, pos, len, copied, folio,
fsdata);
}
@@ -1062,10 +1075,10 @@ int inode_read_data(struct inode *inode, void *data, size_t bytes)
* Number of bytes for REPARSE_DATA_BUFFER(IO_REPARSE_TAG_SYMLINK)
* for unicode string of @uni_len length.
*/
-static inline u32 ntfs_reparse_bytes(u32 uni_len)
+static inline u32 ntfs_reparse_bytes(u32 uni_len, bool is_absolute)
{
/* Header + unicode string + decorated unicode string. */
- return sizeof(short) * (2 * uni_len + 4) +
+ return sizeof(short) * (2 * uni_len + (is_absolute ? 4 : 0)) +
offsetof(struct REPARSE_DATA_BUFFER,
SymbolicLinkReparseBuffer.PathBuffer);
}
@@ -1078,8 +1091,11 @@ ntfs_create_reparse_buffer(struct ntfs_sb_info *sbi, const char *symname,
struct REPARSE_DATA_BUFFER *rp;
__le16 *rp_name;
typeof(rp->SymbolicLinkReparseBuffer) *rs;
+ bool is_absolute;
- rp = kzalloc(ntfs_reparse_bytes(2 * size + 2), GFP_NOFS);
+ is_absolute = (strlen(symname) > 1 && symname[1] == ':');
+
+ rp = kzalloc(ntfs_reparse_bytes(2 * size + 2, is_absolute), GFP_NOFS);
if (!rp)
return ERR_PTR(-ENOMEM);
@@ -1094,7 +1110,7 @@ ntfs_create_reparse_buffer(struct ntfs_sb_info *sbi, const char *symname,
goto out;
/* err = the length of unicode name of symlink. */
- *nsize = ntfs_reparse_bytes(err);
+ *nsize = ntfs_reparse_bytes(err, is_absolute);
if (*nsize > sbi->reparse.max_size) {
err = -EFBIG;
@@ -1114,7 +1130,7 @@ ntfs_create_reparse_buffer(struct ntfs_sb_info *sbi, const char *symname,
/* PrintName + SubstituteName. */
rs->SubstituteNameOffset = cpu_to_le16(sizeof(short) * err);
- rs->SubstituteNameLength = cpu_to_le16(sizeof(short) * err + 8);
+ rs->SubstituteNameLength = cpu_to_le16(sizeof(short) * err + (is_absolute ? 8 : 0));
rs->PrintNameLength = rs->SubstituteNameOffset;
/*
@@ -1122,16 +1138,18 @@ ntfs_create_reparse_buffer(struct ntfs_sb_info *sbi, const char *symname,
* parse this path.
* 0-absolute path 1- relative path (SYMLINK_FLAG_RELATIVE).
*/
- rs->Flags = 0;
+ rs->Flags = cpu_to_le32(is_absolute ? 0 : SYMLINK_FLAG_RELATIVE);
- memmove(rp_name + err + 4, rp_name, sizeof(short) * err);
+ memmove(rp_name + err + (is_absolute ? 4 : 0), rp_name, sizeof(short) * err);
- /* Decorate SubstituteName. */
- rp_name += err;
- rp_name[0] = cpu_to_le16('\\');
- rp_name[1] = cpu_to_le16('?');
- rp_name[2] = cpu_to_le16('?');
- rp_name[3] = cpu_to_le16('\\');
+ if (is_absolute) {
+ /* Decorate SubstituteName. */
+ rp_name += err;
+ rp_name[0] = cpu_to_le16('\\');
+ rp_name[1] = cpu_to_le16('?');
+ rp_name[2] = cpu_to_le16('?');
+ rp_name[3] = cpu_to_le16('\\');
+ }
return rp;
out:
@@ -1260,6 +1278,12 @@ int ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
goto out1;
}
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(dir_ni))) {
+ err = -EINVAL;
+ goto out2;
+ }
+
if (unlikely(ntfs3_forced_shutdown(sb))) {
err = -EIO;
goto out2;
@@ -1350,7 +1374,7 @@ int ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
fname->dup.a_time = std5->cr_time;
fname->dup.alloc_size = fname->dup.data_size = 0;
fname->dup.fa = std5->fa;
- fname->dup.ea_size = fname->dup.reparse = 0;
+ fname->dup.extend_data = S_ISLNK(mode) ? IO_REPARSE_TAG_SYMLINK : 0;
dsize = le16_to_cpu(new_de->key_size);
asize = ALIGN(SIZEOF_RESIDENT + dsize, 8);
@@ -1590,27 +1614,29 @@ int ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
inode->i_flags |= S_NOSEC;
}
- /*
- * ntfs_init_acl and ntfs_save_wsl_perm update extended attribute.
- * The packed size of extended attribute is stored in direntry too.
- * 'fname' here points to inside new_de.
- */
- err = ntfs_save_wsl_perm(inode, &fname->dup.ea_size);
- if (err)
- goto out6;
+ if (!S_ISLNK(mode)) {
+ /*
+ * ntfs_init_acl and ntfs_save_wsl_perm update extended attribute.
+ * The packed size of extended attribute is stored in direntry too.
+ * 'fname' here points to inside new_de.
+ */
+ err = ntfs_save_wsl_perm(inode, &fname->dup.extend_data);
+ if (err)
+ goto out6;
- /*
- * update ea_size in file_name attribute too.
- * Use ni_find_attr cause layout of MFT record may be changed
- * in ntfs_init_acl and ntfs_save_wsl_perm.
- */
- attr = ni_find_attr(ni, NULL, NULL, ATTR_NAME, NULL, 0, NULL, NULL);
- if (attr) {
- struct ATTR_FILE_NAME *fn;
+ /*
+ * update ea_size in file_name attribute too.
+ * Use ni_find_attr cause layout of MFT record may be changed
+ * in ntfs_init_acl and ntfs_save_wsl_perm.
+ */
+ attr = ni_find_attr(ni, NULL, NULL, ATTR_NAME, NULL, 0, NULL, NULL);
+ if (attr) {
+ struct ATTR_FILE_NAME *fn;
- fn = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME);
- if (fn)
- fn->dup.ea_size = fname->dup.ea_size;
+ fn = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME);
+ if (fn)
+ fn->dup.extend_data = fname->dup.extend_data;
+ }
}
/* We do not need to update parent directory later */
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index b807744fc6a9..82c8ae56beee 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -171,6 +171,10 @@ static int ntfs_unlink(struct inode *dir, struct dentry *dentry)
struct ntfs_inode *ni = ntfs_i(dir);
int err;
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(dir->i_sb)))
return -EIO;
@@ -191,6 +195,10 @@ static int ntfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
{
u32 size = strlen(symname);
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ntfs_i(dir))))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(dir->i_sb)))
return -EIO;
@@ -216,6 +224,10 @@ static int ntfs_rmdir(struct inode *dir, struct dentry *dentry)
struct ntfs_inode *ni = ntfs_i(dir);
int err;
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(dir->i_sb)))
return -EIO;
@@ -244,7 +256,7 @@ static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir,
struct ntfs_inode *ni = ntfs_i(inode);
struct inode *new_inode = d_inode(new_dentry);
struct NTFS_DE *de, *new_de;
- bool is_same, is_bad;
+ bool is_same;
/*
* de - memory of PATH_MAX bytes:
* [0-1024) - original name (dentry->d_name)
@@ -256,6 +268,10 @@ static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir,
1024);
static_assert(PATH_MAX >= 4 * 1024);
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(sb)))
return -EIO;
@@ -313,12 +329,8 @@ static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir,
if (dir_ni != new_dir_ni)
ni_lock_dir2(new_dir_ni);
- is_bad = false;
- err = ni_rename(dir_ni, new_dir_ni, ni, de, new_de, &is_bad);
- if (is_bad) {
- /* Restore after failed rename failed too. */
- _ntfs_bad_inode(inode);
- } else if (!err) {
+ err = ni_rename(dir_ni, new_dir_ni, ni, de, new_de);
+ if (!err) {
simple_rename_timestamp(dir, dentry, new_dir, new_dentry);
mark_inode_dirty(inode);
mark_inode_dirty(dir);
diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h
index 1ff13b6f9613..552b97905813 100644
--- a/fs/ntfs3/ntfs.h
+++ b/fs/ntfs3/ntfs.h
@@ -561,8 +561,7 @@ struct NTFS_DUP_INFO {
__le64 alloc_size; // 0x20: Data attribute allocated size, multiple of cluster size.
__le64 data_size; // 0x28: Data attribute size <= Dataalloc_size.
enum FILE_ATTRIBUTE fa; // 0x30: Standard DOS attributes & more.
- __le16 ea_size; // 0x34: Packed EAs.
- __le16 reparse; // 0x36: Used by Reparse.
+ __le32 extend_data; // 0x34: Extended data.
}; // 0x38
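
The merged extend_data field is interpreted according to the attribute flags, per ni_update_parent() above: for FILE_ATTRIBUTE_REPARSE_POINT entries it carries the reparse tag, otherwise the EA size. A hypothetical reader-side helper, sketching that convention only:

static u32 example_reparse_tag(const struct NTFS_DUP_INFO *dup)
{
	/* extend_data doubles as ReparseTag or EA size */
	if (dup->fa & FILE_ATTRIBUTE_REPARSE_POINT)
		return le32_to_cpu(dup->extend_data);
	return 0;	/* not a reparse point; extend_data is the EA size */
}
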
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index 36b8052660d5..1296e6fcc779 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -377,6 +377,13 @@ struct ntfs_inode {
*/
u8 mi_loaded;
+ /*
+ * Use this field to avoid any write(s).
+ * If the inode is found bad during initialization, use make_bad_inode();
+ * if it goes bad during later operations, set this field instead.
+ */
+ u8 ni_bad;
+
union {
struct ntfs_index dir;
struct {
@@ -577,8 +584,7 @@ int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
struct NTFS_DE *de);
int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni,
- struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *new_de,
- bool *is_bad);
+ struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *new_de);
bool ni_is_dirty(struct inode *inode);
@@ -702,10 +708,12 @@ struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref,
int ntfs_set_size(struct inode *inode, u64 new_size);
int ntfs_get_block(struct inode *inode, sector_t vbn,
struct buffer_head *bh_result, int create);
-int ntfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, u32 len, struct folio **foliop, void **fsdata);
-int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
- u32 len, u32 copied, struct folio *folio, void *fsdata);
+int ntfs_write_begin(const struct kiocb *iocb, struct address_space *mapping,
+ loff_t pos, u32 len, struct folio **foliop,
+ void **fsdata);
+int ntfs_write_end(const struct kiocb *iocb, struct address_space *mapping,
+ loff_t pos, u32 len, u32 copied, struct folio *folio,
+ void *fsdata);
int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc);
int ntfs_sync_inode(struct inode *inode);
int inode_read_data(struct inode *inode, void *data, size_t bytes);
@@ -874,7 +882,7 @@ int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry);
ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
extern const struct xattr_handler *const ntfs_xattr_handlers[];
-int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size);
+int ntfs_save_wsl_perm(struct inode *inode, __le32 *ea_size);
void ntfs_get_wsl_perm(struct inode *inode);
/* globals from lznt.c */
@@ -1025,6 +1033,11 @@ static inline bool is_compressed(const struct ntfs_inode *ni)
(ni->ni_flags & NI_FLAG_COMPRESSED_MASK);
}
+static inline bool is_bad_ni(const struct ntfs_inode *ni)
+{
+ return ni->ni_bad;
+}
+
static inline int ni_ext_compress_bits(const struct ntfs_inode *ni)
{
return 0xb + (ni->ni_flags & NI_FLAG_COMPRESSED_MASK);
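
Every ntfs3 entry point touched in this series opens with the same guard, so an inode marked bad at runtime fails fast with -EINVAL instead of being operated on or written back. The recurring pattern, in isolation (example_op is a hypothetical name):

static int example_op(struct inode *inode)
{
	struct ntfs_inode *ni = ntfs_i(inode);

	/* Avoid any operation if inode is bad. */
	if (unlikely(is_bad_ni(ni)))
		return -EINVAL;

	/* ... normal processing ... */
	return 0;
}
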
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 920a1ab47b63..ddff94c091b8 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -1223,7 +1223,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_export_op = &ntfs_export_ops;
sb->s_time_gran = NTFS_TIME_GRAN; // 100 nsec
sb->s_xattr = ntfs_xattr_handlers;
- sb->s_d_op = options->nocase ? &ntfs_dentry_ops : NULL;
+ if (options->nocase)
+ set_default_d_op(sb, &ntfs_dentry_ops);
options->nls = ntfs_load_nls(options->nls_name);
if (IS_ERR(options->nls)) {
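
Direct assignment of sb->s_d_op gives way to set_default_d_op(); the ocfs2 and orangefs hunks below make the same substitution. In sketch form:

/* before: sb->s_d_op = options->nocase ? &ntfs_dentry_ops : NULL; */
if (options->nocase)
	set_default_d_op(sb, &ntfs_dentry_ops);
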
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index e0055dcf8fe3..e519e21596a7 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -313,7 +313,7 @@ out:
static noinline int ntfs_set_ea(struct inode *inode, const char *name,
size_t name_len, const void *value,
size_t val_size, int flags, bool locked,
- __le16 *ea_size)
+ __le32 *ea_size)
{
struct ntfs_inode *ni = ntfs_i(inode);
struct ntfs_sb_info *sbi = ni->mi.sbi;
@@ -522,7 +522,7 @@ update_ea:
if (ea_info.size_pack != size_pack)
ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
if (ea_size)
- *ea_size = ea_info.size_pack;
+ *ea_size = ea_info.size;
mark_inode_dirty(&ni->vfs_inode);
out:
@@ -552,6 +552,10 @@ struct posix_acl *ntfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry,
int err;
void *buf;
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return ERR_PTR(-EINVAL);
+
/* Allocate PATH_MAX bytes. */
buf = __getname();
if (!buf)
@@ -600,6 +604,10 @@ static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap,
int flags;
umode_t mode;
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ntfs_i(inode))))
+ return -EINVAL;
+
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
@@ -730,6 +738,10 @@ ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
struct ntfs_inode *ni = ntfs_i(inode);
ssize_t ret;
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (!(ni->ni_flags & NI_FLAG_EA)) {
/* no xattr in file */
return 0;
@@ -751,6 +763,10 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de,
int err;
struct ntfs_inode *ni = ntfs_i(inode);
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
return -EIO;
@@ -950,7 +966,7 @@ out:
*
* save uid/gid/mode in xattr
*/
-int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size)
+int ntfs_save_wsl_perm(struct inode *inode, __le32 *ea_size)
{
int err;
__le32 value;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 40b6bce12951..2203438738f6 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1856,7 +1856,8 @@ out:
return ret;
}
-static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
+static int ocfs2_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
@@ -2047,7 +2048,8 @@ out:
return copied;
}
-static int ocfs2_write_end(struct file *file, struct address_space *mapping,
+static int ocfs2_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
{
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index fce9beb214f0..b05d4e9d13b2 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1483,12 +1483,13 @@ static void o2net_sc_send_keep_req(struct work_struct *work)
sc_put(sc);
}
-/* socket shutdown does a del_timer_sync against this as it tears down.
+/* socket shutdown does a timer_delete_sync against this as it tears down.
* we can't start this timer until we've got to the point in sc buildup
* where shutdown is going to be involved */
static void o2net_idle_timer(struct timer_list *t)
{
- struct o2net_sock_container *sc = from_timer(sc, t, sc_idle_timeout);
+ struct o2net_sock_container *sc = timer_container_of(sc, t,
+ sc_idle_timeout);
struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
#ifdef CONFIG_DEBUG_FS
unsigned long msecs = ktime_to_ms(ktime_get()) -
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 2056cf08ac1e..21d797ccccd0 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2800,7 +2800,7 @@ const struct inode_operations ocfs2_special_file_iops = {
*/
const struct file_operations ocfs2_fops = {
.llseek = ocfs2_file_llseek,
- .mmap = ocfs2_mmap,
+ .mmap_prepare = ocfs2_mmap_prepare,
.fsync = ocfs2_sync_file,
.release = ocfs2_file_release,
.open = ocfs2_file_open,
@@ -2850,7 +2850,7 @@ const struct file_operations ocfs2_dops = {
*/
const struct file_operations ocfs2_fops_no_plocks = {
.llseek = ocfs2_file_llseek,
- .mmap = ocfs2_mmap,
+ .mmap_prepare = ocfs2_mmap_prepare,
.fsync = ocfs2_sync_file,
.release = ocfs2_file_release,
.open = ocfs2_file_open,
diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
index 1ad7106741f8..3ad7baf67658 100644
--- a/fs/ocfs2/filecheck.c
+++ b/fs/ocfs2/filecheck.c
@@ -505,5 +505,5 @@ static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj,
ocfs2_filecheck_handle_entry(ent, entry);
exit:
- return (!ret ? count : ret);
+ return ret ?: count;
}
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7ae96fb8807a..db14c92302a1 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -62,7 +62,7 @@ static inline int o2info_coherent(struct ocfs2_info_request *req)
return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT));
}
-int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int ocfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
unsigned int flags;
@@ -83,7 +83,7 @@ int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
int ocfs2_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
unsigned int flags = fa->flags;
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index 48a5fdfe87a1..4a1c2313b429 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -11,9 +11,9 @@
#ifndef OCFS2_IOCTL_PROTO_H
#define OCFS2_IOCTL_PROTO_H
-int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int ocfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
int ocfs2_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 6a314e9f2b49..50e2faf64c19 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -159,8 +159,9 @@ static const struct vm_operations_struct ocfs2_file_vm_ops = {
.page_mkwrite = ocfs2_page_mkwrite,
};
-int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
+int ocfs2_mmap_prepare(struct vm_area_desc *desc)
{
+ struct file *file = desc->file;
int ret = 0, lock_level = 0;
ret = ocfs2_inode_lock_atime(file_inode(file),
@@ -171,7 +172,7 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
}
ocfs2_inode_unlock(file_inode(file), lock_level);
out:
- vma->vm_ops = &ocfs2_file_vm_ops;
+ desc->vm_ops = &ocfs2_file_vm_ops;
return 0;
}
diff --git a/fs/ocfs2/mmap.h b/fs/ocfs2/mmap.h
index 1051507cc684..d21c30de6b8c 100644
--- a/fs/ocfs2/mmap.h
+++ b/fs/ocfs2/mmap.h
@@ -2,6 +2,6 @@
#ifndef OCFS2_MMAP_H
#define OCFS2_MMAP_H
-int ocfs2_mmap(struct file *file, struct vm_area_struct *vma);
+int ocfs2_mmap_prepare(struct vm_area_desc *desc);
#endif /* OCFS2_MMAP_H */
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index e272429da3db..de7f12858729 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -674,7 +674,7 @@ out_put:
break;
}
out:
- kfree(rec);
+ ocfs2_free_quota_recovery(rec);
return status;
}
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index ddd761cf44c8..a28c127b9934 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -691,8 +691,7 @@ static void __exit ocfs2_stack_glue_exit(void)
memset(&locking_max_version, 0,
sizeof(struct ocfs2_protocol_version));
ocfs2_sysfs_exit();
- if (ocfs2_table_header)
- unregister_sysctl_table(ocfs2_table_header);
+ unregister_sysctl_table(ocfs2_table_header);
}
MODULE_AUTHOR("Oracle");
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 3d2533950bae..53daa4482406 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1962,7 +1962,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
sb->s_fs_info = osb;
sb->s_op = &ocfs2_sops;
- sb->s_d_op = &ocfs2_dentry_ops;
+ set_default_d_op(sb, &ocfs2_dentry_ops);
sb->s_export_op = &ocfs2_export_ops;
sb->s_qcop = &dquot_quotactl_sysfile_ops;
sb->dq_op = &ocfs2_quota_operations;
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 98358d405b6a..49a1de5a827f 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -310,9 +310,10 @@ static void omfs_write_failed(struct address_space *mapping, loff_t to)
}
}
-static int omfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int omfs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
int ret;
@@ -332,7 +333,7 @@ const struct file_operations omfs_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.fsync = generic_file_fsync,
.splice_read = filemap_splice_read,
};
diff --git a/fs/open.c b/fs/open.c
index 7828234a7caa..9655158c3885 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -281,6 +281,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
break;
case FALLOC_FL_COLLAPSE_RANGE:
case FALLOC_FL_INSERT_RANGE:
+ case FALLOC_FL_WRITE_ZEROES:
if (mode & FALLOC_FL_KEEP_SIZE)
return -EOPNOTSUPP;
break;
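
FALLOC_FL_WRITE_ZEROES thereby joins COLLAPSE_RANGE and INSERT_RANGE in the group of modes that reject FALLOC_FL_KEEP_SIZE. From userspace the valid form is the bare mode (a sketch; assumes the new flag is exported through linux/falloc.h):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

int example(int fd, off_t off, off_t len)
{
	/* adding FALLOC_FL_KEEP_SIZE here now yields EOPNOTSUPP */
	return fallocate(fd, FALLOC_FL_WRITE_ZEROES, off, len);
}
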
@@ -943,12 +944,12 @@ static int do_dentry_open(struct file *f,
goto cleanup_all;
/*
- * Set FMODE_NONOTIFY_* bits according to existing permission watches.
+ * Call fsnotify open permission hook and set FMODE_NONOTIFY_* bits
+ * according to existing permission watches.
* If FMODE_NONOTIFY mode was already set for an fanotify fd or for a
* pseudo file, this call will not change the mode.
*/
- file_set_fsnotify_mode_from_watchers(f);
- error = fsnotify_open_perm(f);
+ error = fsnotify_open_perm_and_set_mode(f);
if (error)
goto cleanup_all;
@@ -1204,14 +1205,11 @@ struct file *kernel_file_open(const struct path *path, int flags,
if (IS_ERR(f))
return f;
- f->f_path = *path;
- error = do_dentry_open(f, NULL);
+ error = vfs_open(path, f);
if (error) {
fput(f);
return ERR_PTR(error);
}
-
- fsnotify_open(f);
return f;
}
EXPORT_SYMBOL_GPL(kernel_file_open);
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 90c49c0de243..919f99b16834 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -398,8 +398,9 @@ static const struct vm_operations_struct orangefs_file_vm_ops = {
/*
* Memory map a region of a file.
*/
-static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int orangefs_file_mmap_prepare(struct vm_area_desc *desc)
{
+ struct file *file = desc->file;
int ret;
ret = orangefs_revalidate_mapping(file_inode(file));
@@ -410,10 +411,11 @@ static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma)
"orangefs_file_mmap: called on %pD\n", file);
/* set the sequential readahead hint */
- vm_flags_mod(vma, VM_SEQ_READ, VM_RAND_READ);
+ desc->vm_flags |= VM_SEQ_READ;
+ desc->vm_flags &= ~VM_RAND_READ;
file_accessed(file);
- vma->vm_ops = &orangefs_file_vm_ops;
+ desc->vm_ops = &orangefs_file_vm_ops;
return 0;
}
@@ -574,7 +576,7 @@ const struct file_operations orangefs_file_operations = {
.read_iter = orangefs_file_read_iter,
.write_iter = orangefs_file_write_iter,
.lock = orangefs_lock,
- .mmap = orangefs_file_mmap,
+ .mmap_prepare = orangefs_file_mmap_prepare,
.open = generic_file_open,
.splice_read = orangefs_file_splice_read,
.splice_write = iter_file_splice_write,
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 08a6f372a352..a01400cd41fd 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -285,9 +285,10 @@ static int orangefs_read_folio(struct file *file, struct folio *folio)
return ret;
}
-static int orangefs_write_begin(struct file *file,
- struct address_space *mapping, loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int orangefs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos,
+ unsigned len, struct folio **foliop,
+ void **fsdata)
{
struct orangefs_write_range *wr;
struct folio *folio;
@@ -340,9 +341,10 @@ okay:
return 0;
}
-static int orangefs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied, struct folio *folio,
- void *fsdata)
+static int orangefs_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
struct inode *inode = folio->mapping->host;
loff_t last_pos = pos + copied;
@@ -372,7 +374,7 @@ static int orangefs_write_end(struct file *file, struct address_space *mapping,
folio_unlock(folio);
folio_put(folio);
- mark_inode_dirty_sync(file_inode(file));
+ mark_inode_dirty_sync(file_inode(iocb->ki_filp));
return copied;
}
@@ -887,7 +889,7 @@ int orangefs_update_time(struct inode *inode, int flags)
return __orangefs_setattr(inode, &iattr);
}
-static int orangefs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+static int orangefs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
u64 val = 0;
int ret;
@@ -908,7 +910,7 @@ static int orangefs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
static int orangefs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
u64 val = 0;
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index f7095c91660c..1c375fb65018 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -396,7 +396,7 @@ static ssize_t orangefs_debug_read(struct file *file,
goto out;
mutex_lock(&orangefs_debug_lock);
- sprintf_ret = sprintf(buf, "%s", (char *)file->private_data);
+ sprintf_ret = scnprintf(buf, ORANGEFS_MAX_DEBUG_STRING_LEN, "%s", (char *)file->private_data);
mutex_unlock(&orangefs_debug_lock);
read_ret = simple_read_from_buffer(ubuf, count, ppos, buf, sprintf_ret);
@@ -769,8 +769,8 @@ static void do_k_string(void *k_mask, int index)
if (*mask & s_kmod_keyword_mask_map[index].mask_val) {
if ((strlen(kernel_debug_string) +
- strlen(s_kmod_keyword_mask_map[index].keyword))
- < ORANGEFS_MAX_DEBUG_STRING_LEN - 1) {
+ strlen(s_kmod_keyword_mask_map[index].keyword) + 1)
+ < ORANGEFS_MAX_DEBUG_STRING_LEN) {
strcat(kernel_debug_string,
s_kmod_keyword_mask_map[index].keyword);
strcat(kernel_debug_string, ",");
@@ -797,7 +797,7 @@ static void do_c_string(void *c_mask, int index)
(mask->mask2 & cdm_array[index].mask2)) {
if ((strlen(client_debug_string) +
strlen(cdm_array[index].keyword) + 1)
- < ORANGEFS_MAX_DEBUG_STRING_LEN - 2) {
+ < ORANGEFS_MAX_DEBUG_STRING_LEN) {
strcat(client_debug_string,
cdm_array[index].keyword);
strcat(client_debug_string, ",");
diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c
index 04e15dfa504a..369455b354ef 100644
--- a/fs/orangefs/orangefs-sysfs.c
+++ b/fs/orangefs/orangefs-sysfs.c
@@ -217,36 +217,31 @@ static ssize_t sysfs_int_show(struct kobject *kobj,
if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) {
if (!strcmp(attr->attr.name, "op_timeout_secs")) {
- rc = scnprintf(buf,
- PAGE_SIZE,
+ rc = sysfs_emit(buf,
"%d\n",
op_timeout_secs);
goto out;
} else if (!strcmp(attr->attr.name,
"slot_timeout_secs")) {
- rc = scnprintf(buf,
- PAGE_SIZE,
+ rc = sysfs_emit(buf,
"%d\n",
slot_timeout_secs);
goto out;
} else if (!strcmp(attr->attr.name,
"cache_timeout_msecs")) {
- rc = scnprintf(buf,
- PAGE_SIZE,
+ rc = sysfs_emit(buf,
"%d\n",
orangefs_cache_timeout_msecs);
goto out;
} else if (!strcmp(attr->attr.name,
"dcache_timeout_msecs")) {
- rc = scnprintf(buf,
- PAGE_SIZE,
+ rc = sysfs_emit(buf,
"%d\n",
orangefs_dcache_timeout_msecs);
goto out;
} else if (!strcmp(attr->attr.name,
"getattr_timeout_msecs")) {
- rc = scnprintf(buf,
- PAGE_SIZE,
+ rc = sysfs_emit(buf,
"%d\n",
orangefs_getattr_timeout_msecs);
goto out;
@@ -256,14 +251,12 @@ static ssize_t sysfs_int_show(struct kobject *kobj,
} else if (!strcmp(kobj->name, STATS_KOBJ_ID)) {
if (!strcmp(attr->attr.name, "reads")) {
- rc = scnprintf(buf,
- PAGE_SIZE,
+ rc = sysfs_emit(buf,
"%lu\n",
orangefs_stats.reads);
goto out;
} else if (!strcmp(attr->attr.name, "writes")) {
- rc = scnprintf(buf,
- PAGE_SIZE,
+ rc = sysfs_emit(buf,
"%lu\n",
orangefs_stats.writes);
goto out;
@@ -497,19 +490,18 @@ out:
if (strcmp(kobj->name, PC_KOBJ_ID)) {
if (new_op->upcall.req.param.op ==
ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE) {
- rc = scnprintf(buf, PAGE_SIZE, "%d %d\n",
+ rc = sysfs_emit(buf, "%d %d\n",
(int)new_op->downcall.resp.param.u.
value32[0],
(int)new_op->downcall.resp.param.u.
value32[1]);
} else {
- rc = scnprintf(buf, PAGE_SIZE, "%d\n",
+ rc = sysfs_emit(buf, "%d\n",
(int)new_op->downcall.resp.param.u.value64);
}
} else {
- rc = scnprintf(
+ rc = sysfs_emit(
buf,
- PAGE_SIZE,
"%s",
new_op->downcall.resp.perf_count.buffer);
}
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 64ca9498f550..f3da840758e7 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -416,7 +416,7 @@ static int orangefs_fill_sb(struct super_block *sb,
sb->s_xattr = orangefs_xattr_handlers;
sb->s_magic = ORANGEFS_SUPER_MAGIC;
sb->s_op = &orangefs_s_ops;
- sb->s_d_op = &orangefs_dentry_operations;
+ set_default_d_op(sb, &orangefs_dentry_operations);
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index d7310fcf3888..27396fe63f6d 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -171,14 +171,14 @@ out:
static int ovl_copy_fileattr(struct inode *inode, const struct path *old,
const struct path *new)
{
- struct fileattr oldfa = { .flags_valid = true };
- struct fileattr newfa = { .flags_valid = true };
+ struct file_kattr oldfa = { .flags_valid = true };
+ struct file_kattr newfa = { .flags_valid = true };
int err;
err = ovl_real_fileattr_get(old, &oldfa);
if (err) {
/* Ntfs-3g returns -EINVAL for "no fileattr support" */
- if (err == -ENOTTY || err == -EINVAL)
+ if (err == -EOPNOTSUPP || err == -EINVAL)
return 0;
pr_warn("failed to retrieve lower fileattr (%pd2, err=%i)\n",
old->dentry, err);
@@ -517,15 +517,12 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper,
/*
* Create and install index entry.
- *
- * Caller must hold i_mutex on indexdir.
*/
static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh,
struct dentry *upper)
{
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *indexdir = ovl_indexdir(dentry->d_sb);
- struct inode *dir = d_inode(indexdir);
struct dentry *index = NULL;
struct dentry *temp = NULL;
struct qstr name = { };
@@ -559,16 +556,20 @@ static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh,
if (err)
goto out;
+ err = ovl_parent_lock(indexdir, temp);
+ if (err)
+ goto out;
index = ovl_lookup_upper(ofs, name.name, indexdir, name.len);
if (IS_ERR(index)) {
err = PTR_ERR(index);
} else {
- err = ovl_do_rename(ofs, dir, temp, dir, index, 0);
+ err = ovl_do_rename(ofs, indexdir, temp, indexdir, index, 0);
dput(index);
}
+ ovl_parent_unlock(indexdir);
out:
if (err)
- ovl_cleanup(ofs, dir, temp);
+ ovl_cleanup(ofs, indexdir, temp);
dput(temp);
free_name:
kfree(name.name);
@@ -762,7 +763,6 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
{
struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
struct inode *inode;
- struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir);
struct path path = { .mnt = ovl_upper_mnt(ofs) };
struct dentry *temp, *upper, *trap;
struct ovl_cu_creds cc;
@@ -779,9 +779,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
return err;
ovl_start_write(c->dentry);
- inode_lock(wdir);
temp = ovl_create_temp(ofs, c->workdir, &cattr);
- inode_unlock(wdir);
ovl_end_write(c->dentry);
ovl_revert_cu_creds(&cc);
@@ -794,45 +792,47 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
*/
path.dentry = temp;
err = ovl_copy_up_data(c, &path);
+ ovl_start_write(c->dentry);
+ if (err)
+ goto cleanup_unlocked;
+
+ if (S_ISDIR(c->stat.mode) && c->indexed) {
+ err = ovl_create_index(c->dentry, c->origin_fh, temp);
+ if (err)
+ goto cleanup_unlocked;
+ }
+
/*
* We cannot hold lock_rename() throughout this helper, because of
* lock ordering with sb_writers, which shouldn't be held when calling
* ovl_copy_up_data(), so lock workdir and destdir and make sure that
* temp wasn't moved before copy up completion or cleanup.
*/
- ovl_start_write(c->dentry);
trap = lock_rename(c->workdir, c->destdir);
if (trap || temp->d_parent != c->workdir) {
/* temp or workdir moved underneath us? abort without cleanup */
dput(temp);
err = -EIO;
- if (IS_ERR(trap))
- goto out;
- goto unlock;
- } else if (err) {
- goto cleanup;
+ if (!IS_ERR(trap))
+ unlock_rename(c->workdir, c->destdir);
+ goto out;
}
err = ovl_copy_up_metadata(c, temp);
if (err)
goto cleanup;
- if (S_ISDIR(c->stat.mode) && c->indexed) {
- err = ovl_create_index(c->dentry, c->origin_fh, temp);
- if (err)
- goto cleanup;
- }
-
upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir,
c->destname.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto cleanup;
- err = ovl_do_rename(ofs, wdir, temp, udir, upper, 0);
+ err = ovl_do_rename(ofs, c->workdir, temp, c->destdir, upper, 0);
+ unlock_rename(c->workdir, c->destdir);
dput(upper);
if (err)
- goto cleanup;
+ goto cleanup_unlocked;
inode = d_inode(c->dentry);
if (c->metacopy_digest)
@@ -846,17 +846,17 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
ovl_inode_update(inode, temp);
if (S_ISDIR(inode->i_mode))
ovl_set_flag(OVL_WHITEOUTS, inode);
-unlock:
- unlock_rename(c->workdir, c->destdir);
out:
ovl_end_write(c->dentry);
return err;
cleanup:
- ovl_cleanup(ofs, wdir, temp);
+ unlock_rename(c->workdir, c->destdir);
+cleanup_unlocked:
+ ovl_cleanup(ofs, c->workdir, temp);
dput(temp);
- goto unlock;
+ goto out;
}
/* Copyup using O_TMPFILE which does not require cross dir locking */
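The net effect of the ovl_copy_up_workdir() hunks above is a three-phase lock choreography: create the temp under a short workdir lock, run the bulk data copy with no directory locks held, and take lock_rename() only around the final rename, revalidating temp->d_parent once the locks are in hand. A condensed sketch of that shape, assuming only the helpers this series introduces (ovl_create_temp() and ovl_cleanup() take the workdir lock internally); error handling is compressed and the metadata/index steps are elided:

static int copy_up_lock_shape(struct ovl_copy_up_ctx *c, struct ovl_fs *ofs,
			      struct ovl_cattr *attr, struct path *temppath)
{
	struct dentry *temp, *upper, *trap;
	int err;

	temp = ovl_create_temp(ofs, c->workdir, attr);	/* locks workdir itself */
	if (IS_ERR(temp))
		return PTR_ERR(temp);

	temppath->dentry = temp;
	err = ovl_copy_up_data(c, temppath);		/* no dir locks held */
	if (err)
		goto cleanup;

	trap = lock_rename(c->workdir, c->destdir);	/* locks only for rename */
	if (IS_ERR(trap)) {
		dput(temp);
		return PTR_ERR(trap);
	}
	if (trap || temp->d_parent != c->workdir) {
		/* temp or workdir moved underneath us: abort, no cleanup */
		unlock_rename(c->workdir, c->destdir);
		dput(temp);
		return -EIO;
	}

	upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir,
				 c->destname.len);
	err = PTR_ERR(upper);
	if (!IS_ERR(upper)) {
		err = ovl_do_rename(ofs, c->workdir, temp, c->destdir, upper, 0);
		dput(upper);
	}
	unlock_rename(c->workdir, c->destdir);
cleanup:
	if (err)
		ovl_cleanup(ofs, c->workdir, temp);	/* relocks workdir */
	dput(temp);
	return err;
}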
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index fe493f3ed6b6..70b8687dc45e 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -24,7 +24,8 @@ MODULE_PARM_DESC(redirect_max,
static int ovl_set_redirect(struct dentry *dentry, bool samedir);
-int ovl_cleanup(struct ovl_fs *ofs, struct inode *wdir, struct dentry *wdentry)
+static int ovl_cleanup_locked(struct ovl_fs *ofs, struct inode *wdir,
+ struct dentry *wdentry)
{
int err;
@@ -43,6 +44,21 @@ int ovl_cleanup(struct ovl_fs *ofs, struct inode *wdir, struct dentry *wdentry)
return err;
}
+int ovl_cleanup(struct ovl_fs *ofs, struct dentry *workdir,
+ struct dentry *wdentry)
+{
+ int err;
+
+ err = ovl_parent_lock(workdir, wdentry);
+ if (err)
+ return err;
+
+ ovl_cleanup_locked(ofs, workdir->d_inode, wdentry);
+ ovl_parent_unlock(workdir);
+
+ return 0;
+}
+
struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir)
{
struct dentry *temp;
@@ -62,7 +78,6 @@ struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir)
return temp;
}
-/* caller holds i_mutex on workdir */
static struct dentry *ovl_whiteout(struct ovl_fs *ofs)
{
int err;
@@ -70,47 +85,52 @@ static struct dentry *ovl_whiteout(struct ovl_fs *ofs)
struct dentry *workdir = ofs->workdir;
struct inode *wdir = workdir->d_inode;
+ guard(mutex)(&ofs->whiteout_lock);
+
if (!ofs->whiteout) {
+ inode_lock_nested(wdir, I_MUTEX_PARENT);
whiteout = ovl_lookup_temp(ofs, workdir);
- if (IS_ERR(whiteout))
- goto out;
-
- err = ovl_do_whiteout(ofs, wdir, whiteout);
- if (err) {
- dput(whiteout);
- whiteout = ERR_PTR(err);
- goto out;
+ if (!IS_ERR(whiteout)) {
+ err = ovl_do_whiteout(ofs, wdir, whiteout);
+ if (err) {
+ dput(whiteout);
+ whiteout = ERR_PTR(err);
+ }
}
+ inode_unlock(wdir);
+ if (IS_ERR(whiteout))
+ return whiteout;
ofs->whiteout = whiteout;
}
if (!ofs->no_shared_whiteout) {
+ inode_lock_nested(wdir, I_MUTEX_PARENT);
whiteout = ovl_lookup_temp(ofs, workdir);
- if (IS_ERR(whiteout))
- goto out;
-
- err = ovl_do_link(ofs, ofs->whiteout, wdir, whiteout);
- if (!err)
- goto out;
-
- if (err != -EMLINK) {
- pr_warn("Failed to link whiteout - disabling whiteout inode sharing(nlink=%u, err=%i)\n",
- ofs->whiteout->d_inode->i_nlink, err);
+ if (!IS_ERR(whiteout)) {
+ err = ovl_do_link(ofs, ofs->whiteout, wdir, whiteout);
+ if (err) {
+ dput(whiteout);
+ whiteout = ERR_PTR(err);
+ }
+ }
+ inode_unlock(wdir);
+ if (!IS_ERR(whiteout))
+ return whiteout;
+ if (PTR_ERR(whiteout) != -EMLINK) {
+ pr_warn("Failed to link whiteout - disabling whiteout inode sharing(nlink=%u, err=%lu)\n",
+ ofs->whiteout->d_inode->i_nlink,
+ PTR_ERR(whiteout));
ofs->no_shared_whiteout = true;
}
- dput(whiteout);
}
whiteout = ofs->whiteout;
ofs->whiteout = NULL;
-out:
return whiteout;
}
-/* Caller must hold i_mutex on both workdir and dir */
-int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir,
+int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct dentry *dir,
struct dentry *dentry)
{
- struct inode *wdir = ofs->workdir->d_inode;
struct dentry *whiteout;
int err;
int flags = 0;
@@ -123,24 +143,29 @@ int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir,
if (d_is_dir(dentry))
flags = RENAME_EXCHANGE;
- err = ovl_do_rename(ofs, wdir, whiteout, dir, dentry, flags);
+ err = ovl_lock_rename_workdir(ofs->workdir, whiteout, dir, dentry);
+ if (!err) {
+ err = ovl_do_rename(ofs, ofs->workdir, whiteout, dir, dentry, flags);
+ unlock_rename(ofs->workdir, dir);
+ }
if (err)
goto kill_whiteout;
if (flags)
- ovl_cleanup(ofs, wdir, dentry);
+ ovl_cleanup(ofs, ofs->workdir, dentry);
out:
dput(whiteout);
return err;
kill_whiteout:
- ovl_cleanup(ofs, wdir, whiteout);
+ ovl_cleanup(ofs, ofs->workdir, whiteout);
goto out;
}
-struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir,
+struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent,
struct dentry *newdentry, struct ovl_cattr *attr)
{
+ struct inode *dir = parent->d_inode;
int err;
if (IS_ERR(newdentry))
@@ -199,8 +224,12 @@ out:
struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
struct ovl_cattr *attr)
{
- return ovl_create_real(ofs, d_inode(workdir),
- ovl_lookup_temp(ofs, workdir), attr);
+ struct dentry *ret;
+ inode_lock(workdir->d_inode);
+ ret = ovl_create_real(ofs, workdir,
+ ovl_lookup_temp(ofs, workdir), attr);
+ inode_unlock(workdir->d_inode);
+ return ret;
}
static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper,
@@ -303,13 +332,13 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
int err;
inode_lock_nested(udir, I_MUTEX_PARENT);
- newdentry = ovl_create_real(ofs, udir,
+ newdentry = ovl_create_real(ofs, upperdir,
ovl_lookup_upper(ofs, dentry->d_name.name,
upperdir, dentry->d_name.len),
attr);
- err = PTR_ERR(newdentry);
+ inode_unlock(udir);
if (IS_ERR(newdentry))
- goto out_unlock;
+ return PTR_ERR(newdentry);
if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry) &&
!ovl_allow_offline_changes(ofs)) {
@@ -321,14 +350,12 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
err = ovl_instantiate(dentry, inode, newdentry, !!attr->hardlink, NULL);
if (err)
goto out_cleanup;
-out_unlock:
- inode_unlock(udir);
- return err;
+ return 0;
out_cleanup:
- ovl_cleanup(ofs, udir, newdentry);
+ ovl_cleanup(ofs, upperdir, newdentry);
dput(newdentry);
- goto out_unlock;
+ return err;
}
static struct dentry *ovl_clear_empty(struct dentry *dentry,
@@ -336,9 +363,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
{
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *workdir = ovl_workdir(dentry);
- struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
- struct inode *udir = upperdir->d_inode;
struct path upperpath;
struct dentry *upper;
struct dentry *opaquedir;
@@ -348,27 +373,25 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
if (WARN_ON(!workdir))
return ERR_PTR(-EROFS);
- err = ovl_lock_rename_workdir(workdir, upperdir);
- if (err)
- goto out;
-
ovl_path_upper(dentry, &upperpath);
err = vfs_getattr(&upperpath, &stat,
STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
if (err)
- goto out_unlock;
+ goto out;
err = -ESTALE;
if (!S_ISDIR(stat.mode))
- goto out_unlock;
+ goto out;
upper = upperpath.dentry;
- if (upper->d_parent->d_inode != udir)
- goto out_unlock;
opaquedir = ovl_create_temp(ofs, workdir, OVL_CATTR(stat.mode));
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
- goto out_unlock;
+ goto out;
+
+ err = ovl_lock_rename_workdir(workdir, opaquedir, upperdir, upper);
+ if (err)
+ goto out_cleanup_unlocked;
err = ovl_copy_xattr(dentry->d_sb, &upperpath, opaquedir);
if (err)
@@ -384,13 +407,13 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
if (err)
goto out_cleanup;
- err = ovl_do_rename(ofs, wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
+ err = ovl_do_rename(ofs, workdir, opaquedir, upperdir, upper, RENAME_EXCHANGE);
+ unlock_rename(workdir, upperdir);
if (err)
- goto out_cleanup;
+ goto out_cleanup_unlocked;
ovl_cleanup_whiteouts(ofs, upper, list);
- ovl_cleanup(ofs, wdir, upper);
- unlock_rename(workdir, upperdir);
+ ovl_cleanup(ofs, workdir, upper);
/* dentry's upper doesn't match now, get rid of it */
d_drop(dentry);
@@ -398,10 +421,10 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
return opaquedir;
out_cleanup:
- ovl_cleanup(ofs, wdir, opaquedir);
- dput(opaquedir);
-out_unlock:
unlock_rename(workdir, upperdir);
+out_cleanup_unlocked:
+ ovl_cleanup(ofs, workdir, opaquedir);
+ dput(opaquedir);
out:
return ERR_PTR(err);
}
@@ -420,9 +443,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
{
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *workdir = ovl_workdir(dentry);
- struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
- struct inode *udir = upperdir->d_inode;
struct dentry *upper;
struct dentry *newdentry;
int err;
@@ -439,15 +460,11 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
return err;
}
- err = ovl_lock_rename_workdir(workdir, upperdir);
- if (err)
- goto out;
-
- upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir,
- dentry->d_name.len);
+ upper = ovl_lookup_upper_unlocked(ofs, dentry->d_name.name, upperdir,
+ dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
- goto out_unlock;
+ goto out;
err = -ESTALE;
if (d_is_negative(upper) || !ovl_upper_is_whiteout(ofs, upper))
@@ -458,6 +475,10 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (IS_ERR(newdentry))
goto out_dput;
+ err = ovl_lock_rename_workdir(workdir, newdentry, upperdir, upper);
+ if (err)
+ goto out_cleanup_unlocked;
+
/*
* mode could have been mutilated due to umask (e.g. sgid directory)
*/
@@ -491,27 +512,27 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (err)
goto out_cleanup;
- err = ovl_do_rename(ofs, wdir, newdentry, udir, upper,
+ err = ovl_do_rename(ofs, workdir, newdentry, upperdir, upper,
RENAME_EXCHANGE);
+ unlock_rename(workdir, upperdir);
if (err)
- goto out_cleanup;
+ goto out_cleanup_unlocked;
- ovl_cleanup(ofs, wdir, upper);
+ ovl_cleanup(ofs, workdir, upper);
} else {
- err = ovl_do_rename(ofs, wdir, newdentry, udir, upper, 0);
+ err = ovl_do_rename(ofs, workdir, newdentry, upperdir, upper, 0);
+ unlock_rename(workdir, upperdir);
if (err)
- goto out_cleanup;
+ goto out_cleanup_unlocked;
}
ovl_dir_modified(dentry->d_parent, false);
err = ovl_instantiate(dentry, inode, newdentry, hardlink, NULL);
if (err) {
- ovl_cleanup(ofs, udir, newdentry);
+ ovl_cleanup(ofs, upperdir, newdentry);
dput(newdentry);
}
out_dput:
dput(upper);
-out_unlock:
- unlock_rename(workdir, upperdir);
out:
if (!hardlink) {
posix_acl_release(acl);
@@ -520,7 +541,9 @@ out:
return err;
out_cleanup:
- ovl_cleanup(ofs, wdir, newdentry);
+ unlock_rename(workdir, upperdir);
+out_cleanup_unlocked:
+ ovl_cleanup(ofs, workdir, newdentry);
dput(newdentry);
goto out_dput;
}
@@ -757,15 +780,11 @@ static int ovl_remove_and_whiteout(struct dentry *dentry,
goto out;
}
- err = ovl_lock_rename_workdir(workdir, upperdir);
- if (err)
- goto out_dput;
-
- upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir,
- dentry->d_name.len);
+ upper = ovl_lookup_upper_unlocked(ofs, dentry->d_name.name, upperdir,
+ dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
- goto out_unlock;
+ goto out_dput;
err = -ESTALE;
if ((opaquedir && upper != opaquedir) ||
@@ -774,17 +793,13 @@ static int ovl_remove_and_whiteout(struct dentry *dentry,
goto out_dput_upper;
}
- err = ovl_cleanup_and_whiteout(ofs, d_inode(upperdir), upper);
- if (err)
- goto out_d_drop;
+ err = ovl_cleanup_and_whiteout(ofs, upperdir, upper);
+ if (!err)
+ ovl_dir_modified(dentry->d_parent, true);
- ovl_dir_modified(dentry->d_parent, true);
-out_d_drop:
d_drop(dentry);
out_dput_upper:
dput(upper);
-out_unlock:
- unlock_rename(workdir, upperdir);
out_dput:
dput(opaquedir);
out:
@@ -1069,9 +1084,9 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
int err;
struct dentry *old_upperdir;
struct dentry *new_upperdir;
- struct dentry *olddentry;
- struct dentry *newdentry;
- struct dentry *trap;
+ struct dentry *olddentry = NULL;
+ struct dentry *newdentry = NULL;
+ struct dentry *trap, *de;
bool old_opaque;
bool new_opaque;
bool cleanup_whiteout = false;
@@ -1184,21 +1199,23 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
goto out_revert_creds;
}
- olddentry = ovl_lookup_upper(ofs, old->d_name.name, old_upperdir,
- old->d_name.len);
- err = PTR_ERR(olddentry);
- if (IS_ERR(olddentry))
+ de = ovl_lookup_upper(ofs, old->d_name.name, old_upperdir,
+ old->d_name.len);
+ err = PTR_ERR(de);
+ if (IS_ERR(de))
goto out_unlock;
+ olddentry = de;
err = -ESTALE;
if (!ovl_matches_upper(old, olddentry))
- goto out_dput_old;
+ goto out_unlock;
- newdentry = ovl_lookup_upper(ofs, new->d_name.name, new_upperdir,
- new->d_name.len);
- err = PTR_ERR(newdentry);
- if (IS_ERR(newdentry))
- goto out_dput_old;
+ de = ovl_lookup_upper(ofs, new->d_name.name, new_upperdir,
+ new->d_name.len);
+ err = PTR_ERR(de);
+ if (IS_ERR(de))
+ goto out_unlock;
+ newdentry = de;
old_opaque = ovl_dentry_is_opaque(old);
new_opaque = ovl_dentry_is_opaque(new);
@@ -1207,28 +1224,28 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
if (d_inode(new) && ovl_dentry_upper(new)) {
if (opaquedir) {
if (newdentry != opaquedir)
- goto out_dput;
+ goto out_unlock;
} else {
if (!ovl_matches_upper(new, newdentry))
- goto out_dput;
+ goto out_unlock;
}
} else {
if (!d_is_negative(newdentry)) {
if (!new_opaque || !ovl_upper_is_whiteout(ofs, newdentry))
- goto out_dput;
+ goto out_unlock;
} else {
if (flags & RENAME_EXCHANGE)
- goto out_dput;
+ goto out_unlock;
}
}
if (olddentry == trap)
- goto out_dput;
+ goto out_unlock;
if (newdentry == trap)
- goto out_dput;
+ goto out_unlock;
if (olddentry->d_inode == newdentry->d_inode)
- goto out_dput;
+ goto out_unlock;
err = 0;
if (ovl_type_merge_or_lower(old))
@@ -1236,7 +1253,7 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
else if (is_dir && !old_opaque && ovl_type_merge(new->d_parent))
err = ovl_set_opaque_xerr(old, olddentry, -EXDEV);
if (err)
- goto out_dput;
+ goto out_unlock;
if (!overwrite && ovl_type_merge_or_lower(new))
err = ovl_set_redirect(new, samedir);
@@ -1244,15 +1261,16 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
ovl_type_merge(old->d_parent))
err = ovl_set_opaque_xerr(new, newdentry, -EXDEV);
if (err)
- goto out_dput;
+ goto out_unlock;
- err = ovl_do_rename(ofs, old_upperdir->d_inode, olddentry,
- new_upperdir->d_inode, newdentry, flags);
+ err = ovl_do_rename(ofs, old_upperdir, olddentry,
+ new_upperdir, newdentry, flags);
+ unlock_rename(new_upperdir, old_upperdir);
if (err)
- goto out_dput;
+ goto out_revert_creds;
if (cleanup_whiteout)
- ovl_cleanup(ofs, old_upperdir->d_inode, newdentry);
+ ovl_cleanup(ofs, old_upperdir, newdentry);
if (overwrite && d_inode(new)) {
if (new_is_dir)
@@ -1271,12 +1289,6 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
if (d_inode(new) && ovl_dentry_upper(new))
ovl_copyattr(d_inode(new));
-out_dput:
- dput(newdentry);
-out_dput_old:
- dput(olddentry);
-out_unlock:
- unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
ovl_revert_creds(old_cred);
if (update_nlink)
@@ -1284,9 +1296,15 @@ out_revert_creds:
else
ovl_drop_write(old);
out:
+ dput(newdentry);
+ dput(olddentry);
dput(opaquedir);
ovl_cache_free(&list);
return err;
+
+out_unlock:
+ unlock_rename(new_upperdir, old_upperdir);
+ goto out_revert_creds;
}
static int ovl_create_tmpfile(struct file *file, struct dentry *dentry,
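The same unlock-early shape repeats through ovl_clear_empty(), ovl_create_over_whiteout() and ovl_remove_and_whiteout() above: look up with the unlocked variant, create the temp (which locks the workdir itself), and only then take both rename locks, revalidating everything. A minimal sketch of the idiom, assuming the helper signatures from this series:

static int exchange_idiom_sketch(struct ovl_fs *ofs, struct dentry *workdir,
				 struct dentry *upperdir, struct dentry *upper,
				 struct ovl_cattr *attr)
{
	struct dentry *newdentry;
	int err;

	/* 'upper' came from ovl_lookup_upper_unlocked(): no locks yet */
	newdentry = ovl_create_temp(ofs, workdir, attr);
	if (IS_ERR(newdentry))
		return PTR_ERR(newdentry);

	/* take both dir locks; fails if either dentry moved meanwhile */
	err = ovl_lock_rename_workdir(workdir, newdentry, upperdir, upper);
	if (err)
		goto cleanup;

	err = ovl_do_rename(ofs, workdir, newdentry, upperdir, upper,
			    RENAME_EXCHANGE);
	unlock_rename(workdir, upperdir);
	if (err)
		goto cleanup;

	ovl_cleanup(ofs, workdir, upper);	/* old entry now sits in workdir */
	dput(newdentry);
	return 0;

cleanup:
	ovl_cleanup(ofs, workdir, newdentry);	/* takes its own parent lock */
	dput(newdentry);
	return err;
}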
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 969b458100fe..f5b8877d5fe2 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -48,8 +48,8 @@ static struct file *ovl_open_realfile(const struct file *file,
if (!inode_owner_or_capable(real_idmap, realinode))
flags &= ~O_NOATIME;
- realfile = backing_file_open(&file->f_path, flags, realpath,
- current_cred());
+ realfile = backing_file_open(file_user_path(file),
+ flags, realpath, current_cred());
}
ovl_revert_creds(old_cred);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 6f0e15f86c21..ecb9f2019395 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -610,7 +610,7 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* Introducing security_inode_fileattr_get/set() hooks would solve this issue
* properly.
*/
-static int ovl_security_fileattr(const struct path *realpath, struct fileattr *fa,
+static int ovl_security_fileattr(const struct path *realpath, struct file_kattr *fa,
bool set)
{
struct file *file;
@@ -637,7 +637,7 @@ static int ovl_security_fileattr(const struct path *realpath, struct fileattr *f
return err;
}
-int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa)
+int ovl_real_fileattr_set(const struct path *realpath, struct file_kattr *fa)
{
int err;
@@ -649,7 +649,7 @@ int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa)
}
int ovl_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct path upperpath;
@@ -697,7 +697,7 @@ out:
}
/* Convert inode protection flags to fileattr flags */
-static void ovl_fileattr_prot_flags(struct inode *inode, struct fileattr *fa)
+static void ovl_fileattr_prot_flags(struct inode *inode, struct file_kattr *fa)
{
BUILD_BUG_ON(OVL_PROT_FS_FLAGS_MASK & ~FS_COMMON_FL);
BUILD_BUG_ON(OVL_PROT_FSX_FLAGS_MASK & ~FS_XFLAG_COMMON);
@@ -712,7 +712,7 @@ static void ovl_fileattr_prot_flags(struct inode *inode, struct fileattr *fa)
}
}
-int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa)
+int ovl_real_fileattr_get(const struct path *realpath, struct file_kattr *fa)
{
int err;
@@ -720,13 +720,10 @@ int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa)
if (err)
return err;
- err = vfs_fileattr_get(realpath->dentry, fa);
- if (err == -ENOIOCTLCMD)
- err = -ENOTTY;
- return err;
+ return vfs_fileattr_get(realpath->dentry, fa);
}
-int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int ovl_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct path realpath;
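The dropped -ENOIOCTLCMD to -ENOTTY translation in ovl_real_fileattr_get() pairs with the copy-up hunk at the top of this patch, which now matches -EOPNOTSUPP instead of -ENOTTY. A caller-side sketch of the resulting classification; the assumption (not shown in this diff) is that vfs_fileattr_get() itself now reports missing fileattr support as -EOPNOTSUPP:

static int fileattr_get_optional(const struct path *realpath,
				 struct file_kattr *fa)
{
	int err = ovl_real_fileattr_get(realpath, fa);

	/*
	 * "no fileattr support" is not a copy-up failure:
	 * -EOPNOTSUPP - the filesystem has no fileattrs at all;
	 * -EINVAL     - ntfs-3g's way of reporting the same thing.
	 */
	if (err == -EOPNOTSUPP || err == -EINVAL)
		return 0;
	return err;
}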
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index bf722daf19a9..76d6248b625e 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -16,6 +16,7 @@
struct ovl_lookup_data {
struct super_block *sb;
+ struct dentry *dentry;
const struct ovl_layer *layer;
struct qstr name;
bool is_dir;
@@ -24,6 +25,7 @@ struct ovl_lookup_data {
bool stop;
bool last;
char *redirect;
+ char *upperredirect;
int metacopy;
/* Referring to last redirect xattr */
bool absolute_redirect;
@@ -228,13 +230,26 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
struct dentry **ret, bool drop_negative)
{
struct ovl_fs *ofs = OVL_FS(d->sb);
- struct dentry *this;
+ struct dentry *this = NULL;
+ const char *warn;
struct path path;
int err;
bool last_element = !post[0];
bool is_upper = d->layer->idx == 0;
char val;
+ /*
+ * We allow filesystems that are case-folding capable, but we deny composing
+ * an ovl stack from case-folded directories. If someone has enabled case
+ * folding on a directory of an underlying layer, the guarantees of the ovl
+ * stack are void.
+ */
+ if (ovl_dentry_casefolded(base)) {
+ warn = "case folded parent";
+ err = -ESTALE;
+ goto out_warn;
+ }
+
this = ovl_lookup_positive_unlocked(d, name, base, namelen, drop_negative);
if (IS_ERR(this)) {
err = PTR_ERR(this);
@@ -244,10 +259,17 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
goto out_err;
}
+ if (ovl_dentry_casefolded(this)) {
+ warn = "case folded child";
+ err = -EREMOTE;
+ goto out_warn;
+ }
+
if (ovl_dentry_weird(this)) {
/* Don't support traversing automounts and other weirdness */
+ warn = "unsupported object type";
err = -EREMOTE;
- goto out_err;
+ goto out_warn;
}
path.dentry = this;
@@ -281,8 +303,9 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
} else {
if (ovl_lookup_trap_inode(d->sb, this)) {
/* Caught in a trap of overlapping layers */
+ warn = "overlapping layers";
err = -ELOOP;
- goto out_err;
+ goto out_warn;
}
if (last_element)
@@ -314,6 +337,10 @@ put_and_out:
this = NULL;
goto out;
+out_warn:
+ pr_warn_ratelimited("failed lookup in %s (%pd2, name='%.*s', err=%i): %s\n",
+ is_upper ? "upper" : "lower", base,
+ namelen, name, err, warn);
out_err:
dput(this);
return err;
@@ -1024,6 +1051,31 @@ int ovl_verify_lowerdata(struct dentry *dentry)
return ovl_maybe_validate_verity(dentry);
}
+/*
+ * Following redirects/metacopy can have security consequences: it's like a
+ * symlink into the lower layer without the permission checks.
+ *
+ * This is only a problem if the upper layer is untrusted (e.g. comes from a USB
+ * drive). This can allow a non-readable file or directory to become readable.
+ *
+ * Only following redirects when redirects are enabled disables this attack
+ * vector when it is not necessary.
+ */
+static bool ovl_check_follow_redirect(struct ovl_lookup_data *d)
+{
+ struct ovl_fs *ofs = OVL_FS(d->sb);
+
+ if (d->metacopy && !ofs->config.metacopy) {
+ pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n", d->dentry);
+ return false;
+ }
+ if ((d->redirect || d->upperredirect) && !ovl_redirect_follow(ofs)) {
+ pr_warn_ratelimited("refusing to follow redirect for (%pd2)\n", d->dentry);
+ return false;
+ }
+ return true;
+}
+
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
@@ -1039,7 +1091,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int ctr = 0;
struct inode *inode = NULL;
bool upperopaque = false;
- char *upperredirect = NULL;
+ bool check_redirect = (ovl_redirect_follow(ofs) || ofs->numdatalayer);
struct dentry *this;
unsigned int i;
int err;
@@ -1047,12 +1099,14 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
int metacopy_size = 0;
struct ovl_lookup_data d = {
.sb = dentry->d_sb,
+ .dentry = dentry,
.name = dentry->d_name,
.is_dir = false,
.opaque = false,
.stop = false,
- .last = ovl_redirect_follow(ofs) ? false : !ovl_numlower(poe),
+ .last = check_redirect ? false : !ovl_numlower(poe),
.redirect = NULL,
+ .upperredirect = NULL,
.metacopy = 0,
};
@@ -1094,8 +1148,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
if (d.redirect) {
err = -ENOMEM;
- upperredirect = kstrdup(d.redirect, GFP_KERNEL);
- if (!upperredirect)
+ d.upperredirect = kstrdup(d.redirect, GFP_KERNEL);
+ if (!d.upperredirect)
goto out_put_upper;
if (d.redirect[0] == '/')
poe = roe;
@@ -1113,7 +1167,12 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
for (i = 0; !d.stop && i < ovl_numlower(poe); i++) {
struct ovl_path lower = ovl_lowerstack(poe)[i];
- if (!ovl_redirect_follow(ofs))
+ if (!ovl_check_follow_redirect(&d)) {
+ err = -EPERM;
+ goto out_put;
+ }
+
+ if (!check_redirect)
d.last = i == ovl_numlower(poe) - 1;
else if (d.is_dir || !ofs->numdatalayer)
d.last = lower.layer->idx == ovl_numlower(roe);
@@ -1126,13 +1185,6 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
if (!this)
continue;
- if ((uppermetacopy || d.metacopy) && !ofs->config.metacopy) {
- dput(this);
- err = -EPERM;
- pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n", dentry);
- goto out_put;
- }
-
/*
* If no origin fh is stored in upper of a merge dir, store fh
* of lower dir and set upper parent "impure".
@@ -1185,23 +1237,6 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
ctr++;
}
- /*
- * Following redirects can have security consequences: it's like
- * a symlink into the lower layer without the permission checks.
- * This is only a problem if the upper layer is untrusted (e.g
- * comes from an USB drive). This can allow a non-readable file
- * or directory to become readable.
- *
- * Only following redirects when redirects are enabled disables
- * this attack vector when not necessary.
- */
- err = -EPERM;
- if (d.redirect && !ovl_redirect_follow(ofs)) {
- pr_warn_ratelimited("refusing to follow redirect for (%pd2)\n",
- dentry);
- goto out_put;
- }
-
if (d.stop)
break;
@@ -1212,10 +1247,16 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
}
}
- /* Defer lookup of lowerdata in data-only layers to first access */
+ /*
+ * Defer lookup of lowerdata in data-only layers to first access.
+ * Don't require redirect=follow and metacopy=on in this case.
+ */
if (d.metacopy && ctr && ofs->numdatalayer && d.absolute_redirect) {
d.metacopy = 0;
ctr++;
+ } else if (!ovl_check_follow_redirect(&d)) {
+ err = -EPERM;
+ goto out_put;
}
/*
@@ -1298,20 +1339,26 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
/*
* It's safe to assign upperredirect here: the previous
- * assignment of happens only if upperdentry is non-NULL, and
+ * assignment happens only if upperdentry is non-NULL, and
* this one only if upperdentry is NULL.
*/
- upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0);
- if (IS_ERR(upperredirect)) {
- err = PTR_ERR(upperredirect);
- upperredirect = NULL;
+ d.upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0);
+ if (IS_ERR(d.upperredirect)) {
+ err = PTR_ERR(d.upperredirect);
+ d.upperredirect = NULL;
goto out_free_oe;
}
+
err = ovl_check_metacopy_xattr(ofs, &upperpath, NULL);
if (err < 0)
goto out_free_oe;
- uppermetacopy = err;
+ d.metacopy = uppermetacopy = err;
metacopy_size = err;
+
+ if (!ovl_check_follow_redirect(&d)) {
+ err = -EPERM;
+ goto out_free_oe;
+ }
}
if (upperdentry || ctr) {
@@ -1319,7 +1366,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
.upperdentry = upperdentry,
.oe = oe,
.index = index,
- .redirect = upperredirect,
+ .redirect = d.upperredirect,
};
/* Store lowerdata redirect for lazy lookup */
@@ -1361,7 +1408,7 @@ out_put_upper:
kfree(origin_path);
}
dput(upperdentry);
- kfree(upperredirect);
+ kfree(d.upperredirect);
out:
kfree(d.redirect);
ovl_revert_creds(old_cred);
@@ -1371,7 +1418,7 @@ out:
bool ovl_lower_positive(struct dentry *dentry)
{
struct ovl_entry *poe = OVL_E(dentry->d_parent);
- struct qstr *name = &dentry->d_name;
+ const struct qstr *name = &dentry->d_name;
const struct cred *old_cred;
unsigned int i;
bool positive = false;
@@ -1394,9 +1441,15 @@ bool ovl_lower_positive(struct dentry *dentry)
struct dentry *this;
struct ovl_path *parentpath = &ovl_lowerstack(poe)[i];
+ /*
+ * We need to make a non-const copy of dentry->d_name, because
+ * lookup_one_positive_unlocked() will hash the name with the
+ * parentpath base, which is on another (lower) fs.
+ */
this = lookup_one_positive_unlocked(
mnt_idmap(parentpath->layer->mnt),
- name, parentpath->dentry);
+ &QSTR_LEN(name->name, name->len),
+ parentpath->dentry);
if (IS_ERR(this)) {
switch (PTR_ERR(this)) {
case -ENOENT:
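ovl_check_follow_redirect() centralizes a policy that was previously scattered across ovl_lookup(). A self-contained sketch of the rule it enforces, with illustrative field names rather than the real ovl_lookup_data layout:

struct follow_policy {
	bool metacopy_found, redirect_found;	/* discovered during lookup */
	bool metacopy_enabled, follow_enabled;	/* from the mount options */
};

static bool may_follow(const struct follow_policy *p)
{
	if (p->metacopy_found && !p->metacopy_enabled)
		return false;	/* metacopy=off: don't chase the data origin */
	if (p->redirect_found && !p->follow_enabled)
		return false;	/* redirect_dir=nofollow/off */
	return true;
}

The patch calls this check before each lower-layer lookup step, once after the loop, and once more when an upper redirect xattr is read, so a redirect or metacopy discovered at any stage is refused when the mount options do not allow following it.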
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 8baaba0a3fe5..bb0d7ded8e76 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -246,9 +246,11 @@ static inline struct dentry *ovl_do_mkdir(struct ovl_fs *ofs,
struct dentry *dentry,
umode_t mode)
{
- dentry = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode);
- pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, PTR_ERR_OR_ZERO(dentry));
- return dentry;
+ struct dentry *ret;
+
+ ret = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode);
+ pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, PTR_ERR_OR_ZERO(ret));
+ return ret;
}
static inline int ovl_do_mknod(struct ovl_fs *ofs,
@@ -353,19 +355,19 @@ static inline int ovl_do_remove_acl(struct ovl_fs *ofs, struct dentry *dentry,
return vfs_remove_acl(ovl_upper_mnt_idmap(ofs), dentry, acl_name);
}
-static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir,
- struct dentry *olddentry, struct inode *newdir,
+static inline int ovl_do_rename(struct ovl_fs *ofs, struct dentry *olddir,
+ struct dentry *olddentry, struct dentry *newdir,
struct dentry *newdentry, unsigned int flags)
{
int err;
struct renamedata rd = {
.old_mnt_idmap = ovl_upper_mnt_idmap(ofs),
- .old_dir = olddir,
- .old_dentry = olddentry,
+ .old_parent = olddir,
+ .old_dentry = olddentry,
.new_mnt_idmap = ovl_upper_mnt_idmap(ofs),
- .new_dir = newdir,
- .new_dentry = newdentry,
- .flags = flags,
+ .new_parent = newdir,
+ .new_dentry = newdentry,
+ .flags = flags,
};
pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags);
@@ -405,6 +407,15 @@ static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs,
return lookup_one(ovl_upper_mnt_idmap(ofs), &QSTR_LEN(name, len), base);
}
+static inline struct dentry *ovl_lookup_upper_unlocked(struct ovl_fs *ofs,
+ const char *name,
+ struct dentry *base,
+ int len)
+{
+ return lookup_one_unlocked(ovl_upper_mnt_idmap(ofs),
+ &QSTR_LEN(name, len), base);
+}
+
static inline bool ovl_open_flags_need_copy_up(int flags)
{
if (!flags)
@@ -414,6 +425,11 @@ static inline bool ovl_open_flags_need_copy_up(int flags)
}
/* util.c */
+int ovl_parent_lock(struct dentry *parent, struct dentry *child);
+static inline void ovl_parent_unlock(struct dentry *parent)
+{
+ inode_unlock(parent->d_inode);
+}
int ovl_get_write_access(struct dentry *dentry);
void ovl_put_write_access(struct dentry *dentry);
void ovl_start_write(struct dentry *dentry);
@@ -446,6 +462,12 @@ void ovl_dentry_init_reval(struct dentry *dentry, struct dentry *upperdentry,
void ovl_dentry_init_flags(struct dentry *dentry, struct dentry *upperdentry,
struct ovl_entry *oe, unsigned int mask);
bool ovl_dentry_weird(struct dentry *dentry);
+
+static inline bool ovl_dentry_casefolded(struct dentry *dentry)
+{
+ return sb_has_encoding(dentry->d_sb) && IS_CASEFOLDED(d_inode(dentry));
+}
+
enum ovl_path_type ovl_path_type(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
@@ -533,7 +555,8 @@ bool ovl_is_inuse(struct dentry *dentry);
bool ovl_need_index(struct dentry *dentry);
int ovl_nlink_start(struct dentry *dentry);
void ovl_nlink_end(struct dentry *dentry);
-int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir);
+int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *work,
+ struct dentry *upperdir, struct dentry *upper);
int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path,
struct ovl_metacopy *data);
int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d,
@@ -721,7 +744,7 @@ void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper,
void ovl_cache_free(struct list_head *list);
void ovl_dir_cache_free(struct inode *inode);
int ovl_check_d_type_supported(const struct path *realpath);
-int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir,
+int ovl_workdir_cleanup(struct ovl_fs *ofs, struct dentry *parent,
struct vfsmount *mnt, struct dentry *dentry, int level);
int ovl_indexdir_cleanup(struct ovl_fs *ofs);
@@ -815,7 +838,7 @@ void ovl_copyattr(struct inode *to);
void ovl_check_protattr(struct inode *inode, struct dentry *upper);
int ovl_set_protattr(struct inode *inode, struct dentry *upper,
- struct fileattr *fa);
+ struct file_kattr *fa);
static inline void ovl_copyflags(struct inode *from, struct inode *to)
{
@@ -826,7 +849,7 @@ static inline void ovl_copyflags(struct inode *from, struct inode *to)
/* dir.c */
extern const struct inode_operations ovl_dir_inode_operations;
-int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir,
+int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct dentry *dir,
struct dentry *dentry);
struct ovl_cattr {
dev_t rdev;
@@ -838,20 +861,20 @@ struct ovl_cattr {
#define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) })
struct dentry *ovl_create_real(struct ovl_fs *ofs,
- struct inode *dir, struct dentry *newdentry,
+ struct dentry *parent, struct dentry *newdentry,
struct ovl_cattr *attr);
-int ovl_cleanup(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry);
+int ovl_cleanup(struct ovl_fs *ofs, struct dentry *workdir, struct dentry *dentry);
struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir);
struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
struct ovl_cattr *attr);
/* file.c */
extern const struct file_operations ovl_file_operations;
-int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa);
-int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa);
-int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int ovl_real_fileattr_get(const struct path *realpath, struct file_kattr *fa);
+int ovl_real_fileattr_set(const struct path *realpath, struct file_kattr *fa);
+int ovl_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
int ovl_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
struct ovl_file;
struct ovl_file *ovl_file_alloc(struct file *realfile);
void ovl_file_free(struct ovl_file *of);
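The ovl_do_rename() change tracks a VFS-side rename of struct renamedata's directory members from inodes (old_dir/new_dir) to parent dentries (old_parent/new_parent). A minimal sketch of a direct caller under the new field names, exactly as the hunk above uses them:

static int rename_sketch(struct mnt_idmap *idmap,
			 struct dentry *old_parent, struct dentry *old_dentry,
			 struct dentry *new_parent, struct dentry *new_dentry)
{
	struct renamedata rd = {
		.old_mnt_idmap	= idmap,
		.old_parent	= old_parent,
		.old_dentry	= old_dentry,
		.new_mnt_idmap	= idmap,
		.new_parent	= new_parent,
		.new_dentry	= new_dentry,
		.flags		= 0,
	};

	return vfs_rename(&rd);
}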
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index cb449ab310a7..4c1bae935ced 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -51,7 +51,7 @@ struct ovl_path {
struct ovl_entry {
unsigned int __numlower;
- struct ovl_path __lowerstack[];
+ struct ovl_path __lowerstack[] __counted_by(__numlower);
};
/* private information held for overlayfs's superblock */
@@ -88,6 +88,7 @@ struct ovl_fs {
/* Shared whiteout cache */
struct dentry *whiteout;
bool no_shared_whiteout;
+ struct mutex whiteout_lock;
/* r/o snapshot of upperdir sb's only taken on volatile mounts */
errseq_t errseq;
};
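__counted_by(__numlower) lets FORTIFY_SOURCE and UBSAN bounds-check accesses to the flexible array against the runtime counter. A generic illustration (not overlayfs code); note the counter must be assigned before the array is indexed, which is the order ovl_alloc_entry() in the util.c hunk below already follows:

struct int_stack {
	unsigned int n;
	int slot[] __counted_by(n);
};

static struct int_stack *int_stack_alloc(unsigned int n)
{
	struct int_stack *s = kzalloc(struct_size(s, slot, n), GFP_KERNEL);

	if (s)
		s->n = n;	/* counter must be valid before slot[] is used */
	return s;
}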
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index 6759f7d040c8..f4e7fff909ac 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -282,13 +282,11 @@ static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path,
return invalfc(fc, "%s is not a directory", name);
/*
- * Root dentries of case-insensitive capable filesystems might
- * not have the dentry operations set, but still be incompatible
- * with overlayfs. Check explicitly to prevent post-mount
- * failures.
+ * Allow filesystems that are case-folding capable, but deny composing
+ * an ovl stack from case-folded directories.
*/
- if (sb_has_encoding(path->mnt->mnt_sb))
- return invalfc(fc, "case-insensitive capable filesystem on %s not supported", name);
+ if (ovl_dentry_casefolded(path->dentry))
+ return invalfc(fc, "case-insensitive directory on %s not supported", name);
if (ovl_dentry_weird(path->dentry))
return invalfc(fc, "filesystem on %s not supported", name);
@@ -797,6 +795,8 @@ int ovl_init_fs_context(struct fs_context *fc)
fc->s_fs_info = ofs;
fc->fs_private = ctx;
fc->ops = &ovl_context_ops;
+
+ mutex_init(&ofs->whiteout_lock);
return 0;
out_err:
@@ -871,18 +871,6 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx,
config->uuid = OVL_UUID_NULL;
}
- /* Resolve verity -> metacopy dependency */
- if (config->verity_mode && !config->metacopy) {
- /* Don't allow explicit specified conflicting combinations */
- if (set.metacopy) {
- pr_err("conflicting options: metacopy=off,verity=%s\n",
- ovl_verity_mode(config));
- return -EINVAL;
- }
- /* Otherwise automatically enable metacopy. */
- config->metacopy = true;
- }
-
/*
* This is to make the logic below simpler. It doesn't make any other
* difference, since redirect_dir=on is only used for upper.
@@ -890,18 +878,13 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx,
if (!config->upperdir && config->redirect_mode == OVL_REDIRECT_FOLLOW)
config->redirect_mode = OVL_REDIRECT_ON;
- /* Resolve verity -> metacopy -> redirect_dir dependency */
+ /* Resolve metacopy -> redirect_dir dependency */
if (config->metacopy && config->redirect_mode != OVL_REDIRECT_ON) {
if (set.metacopy && set.redirect) {
pr_err("conflicting options: metacopy=on,redirect_dir=%s\n",
ovl_redirect_mode(config));
return -EINVAL;
}
- if (config->verity_mode && set.redirect) {
- pr_err("conflicting options: verity=%s,redirect_dir=%s\n",
- ovl_verity_mode(config), ovl_redirect_mode(config));
- return -EINVAL;
- }
if (set.redirect) {
/*
* There was an explicit redirect_dir=... that resulted
@@ -970,7 +953,7 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx,
}
- /* Resolve userxattr -> !redirect && !metacopy && !verity dependency */
+ /* Resolve userxattr -> !redirect && !metacopy dependency */
if (config->userxattr) {
if (set.redirect &&
config->redirect_mode != OVL_REDIRECT_NOFOLLOW) {
@@ -982,11 +965,6 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx,
pr_err("conflicting options: userxattr,metacopy=on\n");
return -EINVAL;
}
- if (config->verity_mode) {
- pr_err("conflicting options: userxattr,verity=%s\n",
- ovl_verity_mode(config));
- return -EINVAL;
- }
/*
* Silently disable default setting of redirect and metacopy.
* This shall be the default in the future as well: these
@@ -1025,11 +1003,6 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx,
*/
}
- if (ctx->nr_data > 0 && !config->metacopy) {
- pr_err("lower data-only dirs require metacopy support.\n");
- return -EINVAL;
- }
-
return 0;
}
@@ -1078,17 +1051,16 @@ int ovl_show_options(struct seq_file *m, struct dentry *dentry)
seq_printf(m, ",redirect_dir=%s",
ovl_redirect_mode(&ofs->config));
if (ofs->config.index != ovl_index_def)
- seq_printf(m, ",index=%s", ofs->config.index ? "on" : "off");
+ seq_printf(m, ",index=%s", str_on_off(ofs->config.index));
if (ofs->config.uuid != ovl_uuid_def())
seq_printf(m, ",uuid=%s", ovl_uuid_mode(&ofs->config));
if (ofs->config.nfs_export != ovl_nfs_export_def)
- seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ?
- "on" : "off");
+ seq_printf(m, ",nfs_export=%s",
+ str_on_off(ofs->config.nfs_export));
if (ofs->config.xino != ovl_xino_def() && !ovl_same_fs(ofs))
seq_printf(m, ",xino=%s", ovl_xino_mode(&ofs->config));
if (ofs->config.metacopy != ovl_metacopy_def)
- seq_printf(m, ",metacopy=%s",
- ofs->config.metacopy ? "on" : "off");
+ seq_printf(m, ",metacopy=%s", str_on_off(ofs->config.metacopy));
if (ofs->config.ovl_volatile)
seq_puts(m, ",volatile");
if (ofs->config.userxattr)
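str_on_off() comes from <linux/string_choices.h> and simply returns "on" or "off", replacing the open-coded ternaries. A tiny sketch with a hypothetical wrapper (ovl_show_bool() is illustrative, not part of the patch):

#include <linux/seq_file.h>
#include <linux/string_choices.h>

static void ovl_show_bool(struct seq_file *m, const char *name, bool val)
{
	/* prints ",name=on" or ",name=off", like the ternaries it replaces */
	seq_printf(m, ",%s=%s", name, str_on_off(val));
}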
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 44e208da417c..b65cdfce31ce 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -13,6 +13,7 @@
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/ratelimit.h>
+#include <linux/overflow.h>
#include "overlayfs.h"
struct ovl_cache_entry {
@@ -147,9 +148,8 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
- size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
- p = kmalloc(size, GFP_KERNEL);
+ p = kmalloc(struct_size(p, name, len + 1), GFP_KERNEL);
if (!p)
return NULL;
@@ -1034,14 +1034,13 @@ void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper,
{
struct ovl_cache_entry *p;
- inode_lock_nested(upper->d_inode, I_MUTEX_CHILD);
list_for_each_entry(p, list, l_node) {
struct dentry *dentry;
if (WARN_ON(!p->is_whiteout || !p->is_upper))
continue;
- dentry = ovl_lookup_upper(ofs, p->name, upper, p->len);
+ dentry = ovl_lookup_upper_unlocked(ofs, p->name, upper, p->len);
if (IS_ERR(dentry)) {
pr_err("lookup '%s/%.*s' failed (%i)\n",
upper->d_name.name, p->len, p->name,
@@ -1049,10 +1048,9 @@ void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper,
continue;
}
if (dentry->d_inode)
- ovl_cleanup(ofs, upper->d_inode, dentry);
+ ovl_cleanup(ofs, upper, dentry);
dput(dentry);
}
- inode_unlock(upper->d_inode);
}
static bool ovl_check_d_type(struct dir_context *ctx, const char *name,
@@ -1098,7 +1096,6 @@ static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *pa
int level)
{
int err;
- struct inode *dir = path->dentry->d_inode;
LIST_HEAD(list);
struct ovl_cache_entry *p;
struct ovl_readdir_data rdd = {
@@ -1124,7 +1121,6 @@ static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *pa
if (err)
goto out;
- inode_lock_nested(dir, I_MUTEX_PARENT);
list_for_each_entry(p, &list, l_node) {
struct dentry *dentry;
@@ -1139,39 +1135,40 @@ static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *pa
err = -EINVAL;
break;
}
- dentry = ovl_lookup_upper(ofs, p->name, path->dentry, p->len);
+ dentry = ovl_lookup_upper_unlocked(ofs, p->name, path->dentry, p->len);
if (IS_ERR(dentry))
continue;
if (dentry->d_inode)
- err = ovl_workdir_cleanup(ofs, dir, path->mnt, dentry, level);
+ err = ovl_workdir_cleanup(ofs, path->dentry, path->mnt,
+ dentry, level);
dput(dentry);
if (err)
break;
}
- inode_unlock(dir);
out:
ovl_cache_free(&list);
return err;
}
-int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir,
+int ovl_workdir_cleanup(struct ovl_fs *ofs, struct dentry *parent,
struct vfsmount *mnt, struct dentry *dentry, int level)
{
int err;
- if (!d_is_dir(dentry) || level > 1) {
- return ovl_cleanup(ofs, dir, dentry);
- }
+ if (!d_is_dir(dentry) || level > 1)
+ return ovl_cleanup(ofs, parent, dentry);
- err = ovl_do_rmdir(ofs, dir, dentry);
+ err = ovl_parent_lock(parent, dentry);
+ if (err)
+ return err;
+ err = ovl_do_rmdir(ofs, parent->d_inode, dentry);
+ ovl_parent_unlock(parent);
if (err) {
struct path path = { .mnt = mnt, .dentry = dentry };
- inode_unlock(dir);
err = ovl_workdir_cleanup_recurse(ofs, &path, level + 1);
- inode_lock_nested(dir, I_MUTEX_PARENT);
if (!err)
- err = ovl_cleanup(ofs, dir, dentry);
+ err = ovl_cleanup(ofs, parent, dentry);
}
return err;
@@ -1182,7 +1179,6 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
int err;
struct dentry *indexdir = ofs->workdir;
struct dentry *index = NULL;
- struct inode *dir = indexdir->d_inode;
struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = indexdir };
LIST_HEAD(list);
struct ovl_cache_entry *p;
@@ -1196,7 +1192,6 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
if (err)
goto out;
- inode_lock_nested(dir, I_MUTEX_PARENT);
list_for_each_entry(p, &list, l_node) {
if (p->name[0] == '.') {
if (p->len == 1)
@@ -1204,7 +1199,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
if (p->len == 2 && p->name[1] == '.')
continue;
}
- index = ovl_lookup_upper(ofs, p->name, indexdir, p->len);
+ index = ovl_lookup_upper_unlocked(ofs, p->name, indexdir, p->len);
if (IS_ERR(index)) {
err = PTR_ERR(index);
index = NULL;
@@ -1212,7 +1207,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
}
/* Cleanup leftover from index create/cleanup attempt */
if (index->d_name.name[0] == '#') {
- err = ovl_workdir_cleanup(ofs, dir, path.mnt, index, 1);
+ err = ovl_workdir_cleanup(ofs, indexdir, path.mnt, index, 1);
if (err)
break;
goto next;
@@ -1222,7 +1217,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
goto next;
} else if (err == -ESTALE) {
/* Cleanup stale index entries */
- err = ovl_cleanup(ofs, dir, index);
+ err = ovl_cleanup(ofs, indexdir, index);
} else if (err != -ENOENT) {
/*
* Abort mount to avoid corrupting the index if
@@ -1235,10 +1230,10 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
* Whiteout orphan index to block future open by
* handle after overlay nlink dropped to zero.
*/
- err = ovl_cleanup_and_whiteout(ofs, dir, index);
+ err = ovl_cleanup_and_whiteout(ofs, indexdir, index);
} else {
/* Cleanup orphan index entries */
- err = ovl_cleanup(ofs, dir, index);
+ err = ovl_cleanup(ofs, indexdir, index);
}
if (err)
@@ -1249,7 +1244,6 @@ next:
index = NULL;
}
dput(index);
- inode_unlock(dir);
out:
ovl_cache_free(&list);
if (err)
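struct_size(p, name, len + 1) computes the flexible-array allocation size with saturation: on arithmetic overflow it evaluates to SIZE_MAX, so kmalloc() fails cleanly instead of under-allocating. Roughly the open-coded equivalent, as a sketch built on the check_*_overflow() helpers from <linux/overflow.h> that this hunk includes:

static size_t cache_entry_bytes(size_t len)
{
	size_t bytes;

	/* name[] is a char array, so no element-size multiplication needed */
	if (check_add_overflow(len, (size_t)1, &bytes) ||
	    check_add_overflow(offsetof(struct ovl_cache_entry, name), bytes,
			       &bytes))
		return SIZE_MAX;	/* forces the allocation to fail */
	return bytes;
}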
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index e19940d649ca..df85a76597e9 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -299,8 +299,8 @@ static struct dentry *ovl_workdir_create(struct ovl_fs *ofs,
int err;
bool retried = false;
- inode_lock_nested(dir, I_MUTEX_PARENT);
retry:
+ inode_lock_nested(dir, I_MUTEX_PARENT);
work = ovl_lookup_upper(ofs, name, ofs->workbasedir, strlen(name));
if (!IS_ERR(work)) {
@@ -311,23 +311,24 @@ retry:
if (work->d_inode) {
err = -EEXIST;
+ inode_unlock(dir);
if (retried)
goto out_dput;
if (persist)
- goto out_unlock;
+ return work;
retried = true;
- err = ovl_workdir_cleanup(ofs, dir, mnt, work, 0);
+ err = ovl_workdir_cleanup(ofs, ofs->workbasedir, mnt, work, 0);
dput(work);
- if (err == -EINVAL) {
- work = ERR_PTR(err);
- goto out_unlock;
- }
+ if (err == -EINVAL)
+ return ERR_PTR(err);
+
goto retry;
}
work = ovl_do_mkdir(ofs, dir, work, attr.ia_mode);
+ inode_unlock(dir);
err = PTR_ERR(work);
if (IS_ERR(work))
goto out_err;
@@ -365,11 +366,10 @@ retry:
if (err)
goto out_dput;
} else {
+ inode_unlock(dir);
err = PTR_ERR(work);
goto out_err;
}
-out_unlock:
- inode_unlock(dir);
return work;
out_dput:
@@ -377,8 +377,7 @@ out_dput:
out_err:
pr_warn("failed to create directory %s/%s (errno: %i); mounting read-only\n",
ofs->config.workdir, name, -err);
- work = NULL;
- goto out_unlock;
+ return NULL;
}
static int ovl_check_namelen(const struct path *path, struct ovl_fs *ofs,
@@ -557,37 +556,42 @@ out:
static int ovl_check_rename_whiteout(struct ovl_fs *ofs)
{
struct dentry *workdir = ofs->workdir;
- struct inode *dir = d_inode(workdir);
struct dentry *temp;
struct dentry *dest;
struct dentry *whiteout;
struct name_snapshot name;
int err;
- inode_lock_nested(dir, I_MUTEX_PARENT);
-
temp = ovl_create_temp(ofs, workdir, OVL_CATTR(S_IFREG | 0));
err = PTR_ERR(temp);
if (IS_ERR(temp))
- goto out_unlock;
+ return err;
+ err = ovl_parent_lock(workdir, temp);
+ if (err) {
+ dput(temp);
+ return err;
+ }
dest = ovl_lookup_temp(ofs, workdir);
err = PTR_ERR(dest);
if (IS_ERR(dest)) {
dput(temp);
- goto out_unlock;
+ ovl_parent_unlock(workdir);
+ return err;
}
/* Name is inline and stable - using snapshot as a copy helper */
take_dentry_name_snapshot(&name, temp);
- err = ovl_do_rename(ofs, dir, temp, dir, dest, RENAME_WHITEOUT);
+ err = ovl_do_rename(ofs, workdir, temp, workdir, dest, RENAME_WHITEOUT);
+ ovl_parent_unlock(workdir);
if (err) {
if (err == -EINVAL)
err = 0;
goto cleanup_temp;
}
- whiteout = ovl_lookup_upper(ofs, name.name.name, workdir, name.name.len);
+ whiteout = ovl_lookup_upper_unlocked(ofs, name.name.name,
+ workdir, name.name.len);
err = PTR_ERR(whiteout);
if (IS_ERR(whiteout))
goto cleanup_temp;
@@ -596,18 +600,15 @@ static int ovl_check_rename_whiteout(struct ovl_fs *ofs)
/* Best effort cleanup of whiteout and temp file */
if (err)
- ovl_cleanup(ofs, dir, whiteout);
+ ovl_cleanup(ofs, workdir, whiteout);
dput(whiteout);
cleanup_temp:
- ovl_cleanup(ofs, dir, temp);
+ ovl_cleanup(ofs, workdir, temp);
release_dentry_name_snapshot(&name);
dput(temp);
dput(dest);
-out_unlock:
- inode_unlock(dir);
-
return err;
}
@@ -621,8 +622,7 @@ static struct dentry *ovl_lookup_or_create(struct ovl_fs *ofs,
inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
child = ovl_lookup_upper(ofs, name, parent, len);
if (!IS_ERR(child) && !child->d_inode)
- child = ovl_create_real(ofs, parent->d_inode, child,
- OVL_CATTR(mode));
+ child = ovl_create_real(ofs, parent, child, OVL_CATTR(mode));
inode_unlock(parent->d_inode);
dput(parent);
@@ -1322,7 +1322,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
if (WARN_ON(fc->user_ns != current_user_ns()))
goto out_err;
- sb->s_d_op = &ovl_dentry_operations;
+ set_default_d_op(sb, &ovl_dentry_operations);
err = -ENOMEM;
if (!ofs->creator_cred)
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 0819c739cc2f..a33115e7384c 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -15,6 +15,7 @@
#include <linux/uuid.h>
#include <linux/namei.h>
#include <linux/ratelimit.h>
+#include <linux/overflow.h>
#include "overlayfs.h"
/* Get write access to upper mnt - may fail if upper sb was remounted ro */
@@ -145,9 +146,9 @@ void ovl_stack_free(struct ovl_path *stack, unsigned int n)
struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
{
- size_t size = offsetof(struct ovl_entry, __lowerstack[numlower]);
- struct ovl_entry *oe = kzalloc(size, GFP_KERNEL);
+ struct ovl_entry *oe;
+ oe = kzalloc(struct_size(oe, __lowerstack, numlower), GFP_KERNEL);
if (oe)
oe->__numlower = numlower;
@@ -205,10 +206,17 @@ bool ovl_dentry_weird(struct dentry *dentry)
if (!d_can_lookup(dentry) && !d_is_file(dentry) && !d_is_symlink(dentry))
return true;
- return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT |
- DCACHE_MANAGE_TRANSIT |
- DCACHE_OP_HASH |
- DCACHE_OP_COMPARE);
+ if (dentry->d_flags & (DCACHE_NEED_AUTOMOUNT | DCACHE_MANAGE_TRANSIT))
+ return true;
+
+ /*
+ * Allow filesystems that are case-folding capable, but deny composing
+ * an ovl stack from case-folded directories.
+ */
+ if (sb_has_encoding(dentry->d_sb))
+ return IS_CASEFOLDED(d_inode(dentry));
+
+ return dentry->d_flags & (DCACHE_OP_HASH | DCACHE_OP_COMPARE);
}
enum ovl_path_type ovl_path_type(struct dentry *dentry)
@@ -305,7 +313,9 @@ enum ovl_path_type ovl_path_realdata(struct dentry *dentry, struct path *path)
struct dentry *ovl_dentry_upper(struct dentry *dentry)
{
- return ovl_upperdentry_dereference(OVL_I(d_inode(dentry)));
+ struct inode *inode = d_inode(dentry);
+
+ return inode ? ovl_upperdentry_dereference(OVL_I(inode)) : NULL;
}
struct dentry *ovl_dentry_lower(struct dentry *dentry)
@@ -956,7 +966,7 @@ void ovl_check_protattr(struct inode *inode, struct dentry *upper)
}
int ovl_set_protattr(struct inode *inode, struct dentry *upper,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct ovl_fs *ofs = OVL_FS(inode->i_sb);
char buf[OVL_PROTATTR_MAX];
@@ -1068,7 +1078,6 @@ static void ovl_cleanup_index(struct dentry *dentry)
{
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *indexdir = ovl_indexdir(dentry->d_sb);
- struct inode *dir = indexdir->d_inode;
struct dentry *lowerdentry = ovl_dentry_lower(dentry);
struct dentry *upperdentry = ovl_dentry_upper(dentry);
struct dentry *index = NULL;
@@ -1104,21 +1113,18 @@ static void ovl_cleanup_index(struct dentry *dentry)
goto out;
}
- inode_lock_nested(dir, I_MUTEX_PARENT);
- index = ovl_lookup_upper(ofs, name.name, indexdir, name.len);
+ index = ovl_lookup_upper_unlocked(ofs, name.name, indexdir, name.len);
err = PTR_ERR(index);
if (IS_ERR(index)) {
index = NULL;
} else if (ovl_index_all(dentry->d_sb)) {
/* Whiteout orphan index to block future open by handle */
err = ovl_cleanup_and_whiteout(OVL_FS(dentry->d_sb),
- dir, index);
+ indexdir, index);
} else {
/* Cleanup orphan index entries */
- err = ovl_cleanup(ofs, dir, index);
+ err = ovl_cleanup(ofs, indexdir, index);
}
-
- inode_unlock(dir);
if (err)
goto fail;
@@ -1217,20 +1223,21 @@ void ovl_nlink_end(struct dentry *dentry)
ovl_inode_unlock(inode);
}
-int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir)
+int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *work,
+ struct dentry *upperdir, struct dentry *upper)
{
struct dentry *trap;
- /* Workdir should not be the same as upperdir */
- if (workdir == upperdir)
- goto err;
-
/* Workdir should not be subdir of upperdir and vice versa */
trap = lock_rename(workdir, upperdir);
if (IS_ERR(trap))
goto err;
if (trap)
goto err_unlock;
+ if (work && work->d_parent != workdir)
+ goto err_unlock;
+ if (upper && upper->d_parent != upperdir)
+ goto err_unlock;
return 0;
@@ -1541,3 +1548,13 @@ void ovl_copyattr(struct inode *inode)
i_size_write(inode, i_size_read(realinode));
spin_unlock(&inode->i_lock);
}
+
+int ovl_parent_lock(struct dentry *parent, struct dentry *child)
+{
+ inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
+ if (!child || child->d_parent == parent)
+ return 0;
+
+ inode_unlock(parent->d_inode);
+ return -EINVAL;
+}
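ovl_parent_lock() encodes the lookup-first, lock-late discipline used throughout this series: the child may come from an unlocked lookup, and the helper fails with -EINVAL if a concurrent rename moved it before the parent lock was taken. A minimal usage sketch mirroring the ovl_workdir_cleanup() hunk in readdir.c:

static int rmdir_sketch(struct ovl_fs *ofs, struct dentry *parent,
			struct dentry *victim)
{
	int err;

	/* 'victim' may come from an unlocked lookup; -EINVAL if it moved */
	err = ovl_parent_lock(parent, victim);
	if (err)
		return err;
	err = ovl_do_rmdir(ofs, parent->d_inode, victim);
	ovl_parent_unlock(parent);
	return err;
}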
diff --git a/fs/pidfs.c b/fs/pidfs.c
index c1f0a067be40..edc35522d75c 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -21,11 +21,23 @@
#include <linux/utsname.h>
#include <net/net_namespace.h>
#include <linux/coredump.h>
+#include <linux/xattr.h>
#include "internal.h"
#include "mount.h"
-static struct kmem_cache *pidfs_cachep __ro_after_init;
+#define PIDFS_PID_DEAD ERR_PTR(-ESRCH)
+
+static struct kmem_cache *pidfs_attr_cachep __ro_after_init;
+static struct kmem_cache *pidfs_xattr_cachep __ro_after_init;
+
+static struct path pidfs_root_path = {};
+
+void pidfs_get_root(struct path *path)
+{
+ *path = pidfs_root_path;
+ path_get(path);
+}
/*
* Stashes information that userspace needs to access even after the
@@ -37,17 +49,12 @@ struct pidfs_exit_info {
__u32 coredump_mask;
};
-struct pidfs_inode {
+struct pidfs_attr {
+ struct simple_xattrs *xattrs;
struct pidfs_exit_info __pei;
struct pidfs_exit_info *exit_info;
- struct inode vfs_inode;
};
-static inline struct pidfs_inode *pidfs_i(struct inode *inode)
-{
- return container_of(inode, struct pidfs_inode, vfs_inode);
-}
-
static struct rb_root pidfs_ino_tree = RB_ROOT;
#if BITS_PER_LONG == 32
@@ -125,6 +132,7 @@ void pidfs_add_pid(struct pid *pid)
pid->ino = pidfs_ino_nr;
pid->stashed = NULL;
+ pid->attr = NULL;
pidfs_ino_nr++;
write_seqcount_begin(&pidmap_lock_seq);
@@ -139,6 +147,33 @@ void pidfs_remove_pid(struct pid *pid)
write_seqcount_end(&pidmap_lock_seq);
}
+void pidfs_free_pid(struct pid *pid)
+{
+ struct pidfs_attr *attr __free(kfree) = no_free_ptr(pid->attr);
+ struct simple_xattrs *xattrs __free(kfree) = NULL;
+
+ /*
+ * Any dentry must've been wiped from the pid by now.
+ * Otherwise there's a reference count bug.
+ */
+ VFS_WARN_ON_ONCE(pid->stashed);
+
+ /*
+ * This can happen if an error occurred during e.g. task creation
+ * that caused us to never go through the exit path.
+ */
+ if (unlikely(!attr))
+ return;
+
+ /* This never had a pidfd created. */
+ if (IS_ERR(attr))
+ return;
+
+ xattrs = no_free_ptr(attr->xattrs);
+ if (xattrs)
+ simple_xattrs_free(xattrs, NULL);
+}
+
#ifdef CONFIG_PROC_FS
/**
* pidfd_show_fdinfo - print information about a pidfd
@@ -261,13 +296,13 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags)
static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
{
struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
- struct inode *inode = file_inode(file);
struct pid *pid = pidfd_pid(file);
size_t usize = _IOC_SIZE(cmd);
struct pidfd_info kinfo = {};
struct pidfs_exit_info *exit_info;
struct user_namespace *user_ns;
struct task_struct *task;
+ struct pidfs_attr *attr;
const struct cred *c;
__u64 mask;
@@ -286,8 +321,9 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
if (!pid_in_current_pidns(pid))
return -ESRCH;
+ attr = READ_ONCE(pid->attr);
if (mask & PIDFD_INFO_EXIT) {
- exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
+ exit_info = READ_ONCE(attr->exit_info);
if (exit_info) {
kinfo.mask |= PIDFD_INFO_EXIT;
#ifdef CONFIG_CGROUPS
@@ -300,7 +336,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
if (mask & PIDFD_INFO_COREDUMP) {
kinfo.mask |= PIDFD_INFO_COREDUMP;
- kinfo.coredump_mask = READ_ONCE(pidfs_i(inode)->__pei.coredump_mask);
+ kinfo.coredump_mask = READ_ONCE(attr->__pei.coredump_mask);
}
task = get_pid_task(pid, PIDTYPE_PID);
@@ -319,7 +355,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
if (!c)
return -ESRCH;
- if (!(kinfo.mask & PIDFD_INFO_COREDUMP)) {
+ if ((kinfo.mask & PIDFD_INFO_COREDUMP) && !(kinfo.coredump_mask)) {
task_lock(task);
if (task->mm)
kinfo.coredump_mask = pidfs_coredump_mask(task->mm->flags);
@@ -366,7 +402,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
kinfo.pid = task_pid_vnr(task);
kinfo.mask |= PIDFD_INFO_PID;
- if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1))
+ if (kinfo.pid == 0 || kinfo.tgid == 0)
return -ESRCH;
copy_out:
@@ -552,41 +588,61 @@ struct pid *pidfd_pid(const struct file *file)
* task has been reaped which cannot happen until we're out of
* release_task().
*
- * If this struct pid is referred to by a pidfd then
- * stashed_dentry_get() will return the dentry and inode for that struct
- * pid. Since we've taken a reference on it there's now an additional
- * reference from the exit path on it. Which is fine. We're going to put
- * it again in a second and we know that the pid is kept alive anyway.
+ * If this struct pid has at least once been referred to by a pidfd then
+ * pid->attr will be allocated. If not, we mark the struct pid as dead so
+ * anyone who is trying to register it with pidfs will fail to do so.
+ * Otherwise we would hand out pidfds for reaped tasks without having
+ * exit information available.
*
- * Worst case is that we've filled in the info and immediately free the
- * dentry and inode afterwards since the pidfd has been closed. Since
+ * Worst case is that we've filled in the info and the pid gets freed
+ * right away in free_pid() when no one holds a pidfd anymore. Since
* pidfs_exit() currently is placed after exit_task_work() we know that
- * it cannot be us aka the exiting task holding a pidfd to ourselves.
+ * it cannot be us aka the exiting task holding a pidfd to itself.
*/
void pidfs_exit(struct task_struct *tsk)
{
- struct dentry *dentry;
+ struct pid *pid = task_pid(tsk);
+ struct pidfs_attr *attr;
+ struct pidfs_exit_info *exit_info;
+#ifdef CONFIG_CGROUPS
+ struct cgroup *cgrp;
+#endif
might_sleep();
- dentry = stashed_dentry_get(&task_pid(tsk)->stashed);
- if (dentry) {
- struct inode *inode = d_inode(dentry);
- struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei;
-#ifdef CONFIG_CGROUPS
- struct cgroup *cgrp;
+ guard(spinlock_irq)(&pid->wait_pidfd.lock);
+ attr = pid->attr;
+ if (!attr) {
+ /*
+ * No one ever held a pidfd for this struct pid.
+ * Mark it as dead so no one can add a pidfs
+ * entry anymore. We're about to be reaped and
+ * so no exit information would be available.
+ */
+ pid->attr = PIDFS_PID_DEAD;
+ return;
+ }
- rcu_read_lock();
- cgrp = task_dfl_cgroup(tsk);
- exit_info->cgroupid = cgroup_id(cgrp);
- rcu_read_unlock();
+ /*
+ * If @pid->attr is set, someone might still legitimately hold a
+ * pidfd to @pid, or someone might concurrently still be getting
+ * a reference to an already stashed dentry from @pid->stashed.
+ * So defer cleaning @pid->attr until the last reference to @pid
+ * is put.
+ */
+
+ exit_info = &attr->__pei;
+
+#ifdef CONFIG_CGROUPS
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(tsk);
+ exit_info->cgroupid = cgroup_id(cgrp);
+ rcu_read_unlock();
#endif
- exit_info->exit_code = tsk->exit_code;
+ exit_info->exit_code = tsk->exit_code;
- /* Ensure that PIDFD_GET_INFO sees either all or nothing. */
- smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei);
- dput(dentry);
- }
+ /* Ensure that PIDFD_GET_INFO sees either all or nothing. */
+ smp_store_release(&attr->exit_info, &attr->__pei);
}
#ifdef CONFIG_COREDUMP
@@ -594,16 +650,15 @@ void pidfs_coredump(const struct coredump_params *cprm)
{
struct pid *pid = cprm->pid;
struct pidfs_exit_info *exit_info;
- struct dentry *dentry;
- struct inode *inode;
+ struct pidfs_attr *attr;
__u32 coredump_mask = 0;
- dentry = pid->stashed;
- if (WARN_ON_ONCE(!dentry))
- return;
+ attr = READ_ONCE(pid->attr);
- inode = d_inode(dentry);
- exit_info = &pidfs_i(inode)->__pei;
+ VFS_WARN_ON_ONCE(!attr);
+ VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD);
+
+ exit_info = &attr->__pei;
/* Note how we were coredumped. */
coredump_mask = pidfs_coredump_mask(cprm->mm_flags);
/* Note that we actually did coredump. */
@@ -634,9 +689,24 @@ static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
return anon_inode_getattr(idmap, path, stat, request_mask, query_flags);
}
+static ssize_t pidfs_listxattr(struct dentry *dentry, char *buf, size_t size)
+{
+ struct inode *inode = d_inode(dentry);
+ struct pid *pid = inode->i_private;
+ struct pidfs_attr *attr = pid->attr;
+ struct simple_xattrs *xattrs;
+
+ xattrs = READ_ONCE(attr->xattrs);
+ if (!xattrs)
+ return 0;
+
+ return simple_xattr_list(inode, xattrs, buf, size);
+}
+
static const struct inode_operations pidfs_inode_operations = {
- .getattr = pidfs_getattr,
- .setattr = pidfs_setattr,
+ .getattr = pidfs_getattr,
+ .setattr = pidfs_setattr,
+ .listxattr = pidfs_listxattr,
};
static void pidfs_evict_inode(struct inode *inode)
@@ -647,30 +717,9 @@ static void pidfs_evict_inode(struct inode *inode)
put_pid(pid);
}
-static struct inode *pidfs_alloc_inode(struct super_block *sb)
-{
- struct pidfs_inode *pi;
-
- pi = alloc_inode_sb(sb, pidfs_cachep, GFP_KERNEL);
- if (!pi)
- return NULL;
-
- memset(&pi->__pei, 0, sizeof(pi->__pei));
- pi->exit_info = NULL;
-
- return &pi->vfs_inode;
-}
-
-static void pidfs_free_inode(struct inode *inode)
-{
- kmem_cache_free(pidfs_cachep, pidfs_i(inode));
-}
-
static const struct super_operations pidfs_sops = {
- .alloc_inode = pidfs_alloc_inode,
.drop_inode = generic_delete_inode,
.evict_inode = pidfs_evict_inode,
- .free_inode = pidfs_free_inode,
.statfs = simple_statfs,
};
@@ -770,6 +819,8 @@ static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
if (ret < 0)
return ERR_PTR(ret);
+ VFS_WARN_ON_ONCE(!pid->attr);
+
mntput(path.mnt);
return path.dentry;
}
@@ -796,53 +847,8 @@ static int pidfs_export_permission(struct handle_to_path_ctx *ctx,
return 0;
}
-static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
- unsigned int flags)
-{
- enum pid_type type;
-
- if (flags & PIDFD_STALE)
- return true;
-
- /*
- * Make sure that if a pidfd is created PIDFD_INFO_EXIT
- * information will be available. So after an inode for the
- * pidfd has been allocated perform another check that the pid
- * is still alive. If it is exit information is available even
- * if the task gets reaped before the pidfd is returned to
- * userspace. The only exception are indicated by PIDFD_STALE:
- *
- * (1) The kernel is in the middle of task creation and thus no
- * task linkage has been established yet.
- * (2) The caller knows @pid has been registered in pidfs at a
- * time when the task was still alive.
- *
- * In both cases exit information will have been reported.
- */
- if (flags & PIDFD_THREAD)
- type = PIDTYPE_PID;
- else
- type = PIDTYPE_TGID;
-
- /*
- * Since pidfs_exit() is called before struct pid's task linkage
- * is removed the case where the task got reaped but a dentry
- * was already attached to struct pid and exit information was
- * recorded and published can be handled correctly.
- */
- if (unlikely(!pid_has_task(pid, type))) {
- struct inode *inode = d_inode(path->dentry);
- return !!READ_ONCE(pidfs_i(inode)->exit_info);
- }
-
- return true;
-}
-
static struct file *pidfs_export_open(struct path *path, unsigned int oflags)
{
- if (!pidfs_pid_valid(d_inode(path->dentry)->i_private, path, oflags))
- return ERR_PTR(-ESRCH);
-
/*
* Clear O_LARGEFILE as open_by_handle_at() forces it and raise
* O_RDWR as pidfds always are.
@@ -864,6 +870,8 @@ static int pidfs_init_inode(struct inode *inode, void *data)
inode->i_private = data;
inode->i_flags |= S_PRIVATE | S_ANON_INODE;
+ /* We allow setting xattrs. */
+ inode->i_flags &= ~S_IMMUTABLE;
inode->i_mode |= S_IRWXU;
inode->i_op = &pidfs_inode_operations;
inode->i_fop = &pidfs_file_operations;
@@ -878,9 +886,127 @@ static void pidfs_put_data(void *data)
put_pid(pid);
}
+/**
+ * pidfs_register_pid - register a struct pid in pidfs
+ * @pid: pid to pin
+ *
+ * Register a struct pid in pidfs.
+ *
+ * Return: On success zero, on error a negative error code is returned.
+ */
+int pidfs_register_pid(struct pid *pid)
+{
+ struct pidfs_attr *new_attr __free(kfree) = NULL;
+ struct pidfs_attr *attr;
+
+ might_sleep();
+
+ if (!pid)
+ return 0;
+
+ attr = READ_ONCE(pid->attr);
+ if (unlikely(attr == PIDFS_PID_DEAD))
+ return PTR_ERR(PIDFS_PID_DEAD);
+ if (attr)
+ return 0;
+
+ new_attr = kmem_cache_zalloc(pidfs_attr_cachep, GFP_KERNEL);
+ if (!new_attr)
+ return -ENOMEM;
+
+ /* Synchronize with pidfs_exit(). */
+ guard(spinlock_irq)(&pid->wait_pidfd.lock);
+
+ attr = pid->attr;
+ if (unlikely(attr == PIDFS_PID_DEAD))
+ return PTR_ERR(PIDFS_PID_DEAD);
+ if (unlikely(attr))
+ return 0;
+
+ pid->attr = no_free_ptr(new_attr);
+ return 0;
+}
+
+static struct dentry *pidfs_stash_dentry(struct dentry **stashed,
+ struct dentry *dentry)
+{
+ int ret;
+ struct pid *pid = d_inode(dentry)->i_private;
+
+ VFS_WARN_ON_ONCE(stashed != &pid->stashed);
+
+ ret = pidfs_register_pid(pid);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return stash_dentry(stashed, dentry);
+}
+
static const struct stashed_operations pidfs_stashed_ops = {
- .init_inode = pidfs_init_inode,
- .put_data = pidfs_put_data,
+ .stash_dentry = pidfs_stash_dentry,
+ .init_inode = pidfs_init_inode,
+ .put_data = pidfs_put_data,
+};
+
+static int pidfs_xattr_get(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *suffix, void *value, size_t size)
+{
+ struct pid *pid = inode->i_private;
+ struct pidfs_attr *attr = pid->attr;
+ const char *name;
+ struct simple_xattrs *xattrs;
+
+ xattrs = READ_ONCE(attr->xattrs);
+ if (!xattrs)
+ return 0;
+
+ name = xattr_full_name(handler, suffix);
+ return simple_xattr_get(xattrs, name, value, size);
+}
+
+static int pidfs_xattr_set(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap, struct dentry *unused,
+ struct inode *inode, const char *suffix,
+ const void *value, size_t size, int flags)
+{
+ struct pid *pid = inode->i_private;
+ struct pidfs_attr *attr = pid->attr;
+ const char *name;
+ struct simple_xattrs *xattrs;
+ struct simple_xattr *old_xattr;
+
+ /* Ensure we're the only one to set @attr->xattrs. */
+ WARN_ON_ONCE(!inode_is_locked(inode));
+
+ xattrs = READ_ONCE(attr->xattrs);
+ if (!xattrs) {
+ xattrs = kmem_cache_zalloc(pidfs_xattr_cachep, GFP_KERNEL);
+ if (!xattrs)
+ return -ENOMEM;
+
+ simple_xattrs_init(xattrs);
+ smp_store_release(&pid->attr->xattrs, xattrs);
+ }
+
+ name = xattr_full_name(handler, suffix);
+ old_xattr = simple_xattr_set(xattrs, name, value, size, flags);
+ if (IS_ERR(old_xattr))
+ return PTR_ERR(old_xattr);
+
+ simple_xattr_free(old_xattr);
+ return 0;
+}
+
+static const struct xattr_handler pidfs_trusted_xattr_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .get = pidfs_xattr_get,
+ .set = pidfs_xattr_set,
+};
+
+static const struct xattr_handler *const pidfs_xattr_handlers[] = {
+ &pidfs_trusted_xattr_handler,
+ NULL
};
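With the trusted handler registered, the xattr syscalls on a pidfd reach simple_xattr_{get,set}() above; a hedged userspace sketch (requires CAP_SYS_ADMIN for the trusted.* namespace, error handling omitted):

#include <sys/syscall.h>
#include <sys/xattr.h>
#include <unistd.h>

int tag_pidfd(pid_t pid)
{
	char buf[16];
	int pidfd = syscall(SYS_pidfd_open, pid, 0);

	if (pidfd < 0)
		return -1;
	/* stored in the pidfs_attr hanging off struct pid, not on disk */
	fsetxattr(pidfd, "trusted.tag", "v1", 2, 0);
	fgetxattr(pidfd, "trusted.tag", buf, sizeof(buf));
	close(pidfd);
	return 0;
}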
static int pidfs_init_fs_context(struct fs_context *fc)
@@ -891,9 +1017,12 @@ static int pidfs_init_fs_context(struct fs_context *fc)
if (!ctx)
return -ENOMEM;
+ fc->s_iflags |= SB_I_NOEXEC;
+ fc->s_iflags |= SB_I_NODEV;
ctx->ops = &pidfs_sops;
ctx->eops = &pidfs_export_operations;
ctx->dops = &pidfs_dentry_operations;
+ ctx->xattr = pidfs_xattr_handlers;
fc->s_fs_info = (void *)&pidfs_stashed_ops;
return 0;
}
@@ -921,8 +1050,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
if (ret < 0)
return ERR_PTR(ret);
- if (!pidfs_pid_valid(pid, &path, flags))
- return ERR_PTR(-ESRCH);
+ VFS_WARN_ON_ONCE(!pid->attr);
flags &= ~PIDFD_STALE;
flags |= O_RDWR;
@@ -934,79 +1062,21 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
return pidfd_file;
}
-/**
- * pidfs_register_pid - register a struct pid in pidfs
- * @pid: pid to pin
- *
- * Register a struct pid in pidfs. Needs to be paired with
- * pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
- *
- * Return: On success zero, on error a negative error code is returned.
- */
-int pidfs_register_pid(struct pid *pid)
-{
- struct path path __free(path_put) = {};
- int ret;
-
- might_sleep();
-
- if (!pid)
- return 0;
-
- ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
- if (unlikely(ret))
- return ret;
- /* Keep the dentry and only put the reference to the mount. */
- path.dentry = NULL;
- return 0;
-}
-
-/**
- * pidfs_get_pid - pin a struct pid through pidfs
- * @pid: pid to pin
- *
- * Similar to pidfs_register_pid() but only valid if the caller knows
- * there's a reference to the @pid through a dentry already that can't
- * go away.
- */
-void pidfs_get_pid(struct pid *pid)
-{
- if (!pid)
- return;
- WARN_ON_ONCE(!stashed_dentry_get(&pid->stashed));
-}
-
-/**
- * pidfs_put_pid - drop a pidfs reference
- * @pid: pid to drop
- *
- * Drop a reference to @pid via pidfs. This is only safe if the
- * reference has been taken via pidfs_get_pid().
- */
-void pidfs_put_pid(struct pid *pid)
-{
- might_sleep();
-
- if (!pid)
- return;
- VFS_WARN_ON_ONCE(!pid->stashed);
- dput(pid->stashed);
-}
-
-static void pidfs_inode_init_once(void *data)
-{
- struct pidfs_inode *pi = data;
-
- inode_init_once(&pi->vfs_inode);
-}
-
void __init pidfs_init(void)
{
- pidfs_cachep = kmem_cache_create("pidfs_cache", sizeof(struct pidfs_inode), 0,
+ pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0,
(SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
- SLAB_ACCOUNT | SLAB_PANIC),
- pidfs_inode_init_once);
+ SLAB_ACCOUNT | SLAB_PANIC), NULL);
+
+ pidfs_xattr_cachep = kmem_cache_create("pidfs_xattr_cache",
+ sizeof(struct simple_xattrs), 0,
+ (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
+ SLAB_ACCOUNT | SLAB_PANIC), NULL);
+
pidfs_mnt = kern_mount(&pidfs_type);
if (IS_ERR(pidfs_mnt))
panic("Failed to mount pidfs pseudo filesystem");
+
+ pidfs_root_path.mnt = pidfs_mnt;
+ pidfs_root_path.dentry = pidfs_mnt->mnt_root;
}
diff --git a/fs/pipe.c b/fs/pipe.c
index da45edd68c41..731622d0738d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -26,6 +26,7 @@
#include <linux/memcontrol.h>
#include <linux/watch_queue.h>
#include <linux/sysctl.h>
+#include <linux/sort.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
@@ -76,8 +77,6 @@ static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
* -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
*/
-#define cmp_int(l, r) ((l > r) - (l < r))
-
#ifdef CONFIG_PROVE_LOCKING
static int pipe_lock_cmp_fn(const struct lockdep_map *a,
const struct lockdep_map *b)
@@ -964,6 +963,11 @@ int create_pipe_files(struct file **res, int flags)
res[1] = f;
stream_open(inode, res[0]);
stream_open(inode, res[1]);
+
+ /* pipe groks IOCB_NOWAIT */
+ res[0]->f_mode |= FMODE_NOWAIT;
+ res[1]->f_mode |= FMODE_NOWAIT;
+
/*
* Disable permission and pre-content events, but enable legacy
* inotify events for legacy users.
@@ -998,9 +1002,6 @@ static int __do_pipe_flags(int *fd, struct file **files, int flags)
audit_fd_pair(fdr, fdw);
fd[0] = fdr;
fd[1] = fdw;
- /* pipe groks IOCB_NOWAIT */
- files[0]->f_mode |= FMODE_NOWAIT;
- files[1]->f_mode |= FMODE_NOWAIT;
return 0;
err_fdr:
diff --git a/fs/pnode.c b/fs/pnode.c
index ffd429b760d5..81f7599bdac4 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -21,17 +21,12 @@ static inline struct mount *next_peer(struct mount *p)
static inline struct mount *first_slave(struct mount *p)
{
- return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave);
-}
-
-static inline struct mount *last_slave(struct mount *p)
-{
- return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave);
+ return hlist_entry(p->mnt_slave_list.first, struct mount, mnt_slave);
}
static inline struct mount *next_slave(struct mount *p)
{
- return list_entry(p->mnt_slave.next, struct mount, mnt_slave);
+ return hlist_entry(p->mnt_slave.next, struct mount, mnt_slave);
}
static struct mount *get_peer_under_root(struct mount *mnt,
@@ -70,69 +65,90 @@ int get_dominating_id(struct mount *mnt, const struct path *root)
return 0;
}
-static int do_make_slave(struct mount *mnt)
+static inline bool will_be_unmounted(struct mount *m)
{
- struct mount *master, *slave_mnt;
+ return m->mnt.mnt_flags & MNT_UMOUNT;
+}
- if (list_empty(&mnt->mnt_share)) {
- if (IS_MNT_SHARED(mnt)) {
- mnt_release_group_id(mnt);
- CLEAR_MNT_SHARED(mnt);
- }
- master = mnt->mnt_master;
- if (!master) {
- struct list_head *p = &mnt->mnt_slave_list;
- while (!list_empty(p)) {
- slave_mnt = list_first_entry(p,
- struct mount, mnt_slave);
- list_del_init(&slave_mnt->mnt_slave);
- slave_mnt->mnt_master = NULL;
- }
- return 0;
- }
- } else {
+static struct mount *propagation_source(struct mount *mnt)
+{
+ do {
struct mount *m;
- /*
- * slave 'mnt' to a peer mount that has the
- * same root dentry. If none is available then
- * slave it to anything that is available.
- */
- for (m = master = next_peer(mnt); m != mnt; m = next_peer(m)) {
- if (m->mnt.mnt_root == mnt->mnt.mnt_root) {
- master = m;
- break;
- }
+ for (m = next_peer(mnt); m != mnt; m = next_peer(m)) {
+ if (!will_be_unmounted(m))
+ return m;
}
- list_del_init(&mnt->mnt_share);
- mnt->mnt_group_id = 0;
- CLEAR_MNT_SHARED(mnt);
+ mnt = mnt->mnt_master;
+ } while (mnt && will_be_unmounted(mnt));
+ return mnt;
+}
+
+static void transfer_propagation(struct mount *mnt, struct mount *to)
+{
+ struct hlist_node *p = NULL, *n;
+ struct mount *m;
+
+ hlist_for_each_entry_safe(m, n, &mnt->mnt_slave_list, mnt_slave) {
+ m->mnt_master = to;
+ if (!to)
+ hlist_del_init(&m->mnt_slave);
+ else
+ p = &m->mnt_slave;
}
- list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave)
- slave_mnt->mnt_master = master;
- list_move(&mnt->mnt_slave, &master->mnt_slave_list);
- list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev);
- INIT_LIST_HEAD(&mnt->mnt_slave_list);
- mnt->mnt_master = master;
- return 0;
+ if (p)
+ hlist_splice_init(&mnt->mnt_slave_list, p, &to->mnt_slave_list);
}
/*
- * vfsmount lock must be held for write
+ * EXCL[namespace_sem]
*/
void change_mnt_propagation(struct mount *mnt, int type)
{
+ struct mount *m = mnt->mnt_master;
+
if (type == MS_SHARED) {
set_mnt_shared(mnt);
return;
}
- do_make_slave(mnt);
- if (type != MS_SLAVE) {
- list_del_init(&mnt->mnt_slave);
+ if (IS_MNT_SHARED(mnt)) {
+ m = propagation_source(mnt);
+ if (list_empty(&mnt->mnt_share)) {
+ mnt_release_group_id(mnt);
+ } else {
+ list_del_init(&mnt->mnt_share);
+ mnt->mnt_group_id = 0;
+ }
+ CLEAR_MNT_SHARED(mnt);
+ transfer_propagation(mnt, m);
+ }
+ hlist_del_init(&mnt->mnt_slave);
+ if (type == MS_SLAVE) {
+ mnt->mnt_master = m;
+ if (m)
+ hlist_add_head(&mnt->mnt_slave, &m->mnt_slave_list);
+ } else {
mnt->mnt_master = NULL;
if (type == MS_UNBINDABLE)
- mnt->mnt.mnt_flags |= MNT_UNBINDABLE;
+ mnt->mnt_t_flags |= T_UNBINDABLE;
else
- mnt->mnt.mnt_flags &= ~MNT_UNBINDABLE;
+ mnt->mnt_t_flags &= ~T_UNBINDABLE;
+ }
+}
+
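For orientation, the type values handled above arrive from the propagation-type forms of mount(2); a sketch of the userspace side:

#include <sys/mount.h>

/* each call reaches change_mnt_propagation() with the matching type */
mount(NULL, "/mnt", NULL, MS_SHARED, NULL);     /* join/start a peer group */
mount(NULL, "/mnt", NULL, MS_SLAVE, NULL);      /* demote: receive-only */
mount(NULL, "/mnt", NULL, MS_PRIVATE, NULL);    /* drop propagation */
mount(NULL, "/mnt", NULL, MS_UNBINDABLE, NULL); /* private + no bind copies */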
+static struct mount *__propagation_next(struct mount *m,
+ struct mount *origin)
+{
+ while (1) {
+ struct mount *master = m->mnt_master;
+
+ if (master == origin->mnt_master) {
+ struct mount *next = next_peer(m);
+ return (next == origin) ? NULL : next;
+ } else if (m->mnt_slave.next)
+ return next_slave(m);
+
+ /* back at master */
+ m = master;
}
}
@@ -150,34 +166,24 @@ static struct mount *propagation_next(struct mount *m,
struct mount *origin)
{
/* are there any slaves of this mount? */
- if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
+ if (!IS_MNT_NEW(m) && !hlist_empty(&m->mnt_slave_list))
return first_slave(m);
- while (1) {
- struct mount *master = m->mnt_master;
-
- if (master == origin->mnt_master) {
- struct mount *next = next_peer(m);
- return (next == origin) ? NULL : next;
- } else if (m->mnt_slave.next != &master->mnt_slave_list)
- return next_slave(m);
-
- /* back at master */
- m = master;
- }
+ return __propagation_next(m, origin);
}
static struct mount *skip_propagation_subtree(struct mount *m,
struct mount *origin)
{
/*
- * Advance m such that propagation_next will not return
- * the slaves of m.
+ * Advance m past everything that gets propagation from it.
*/
- if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
- m = last_slave(m);
+ struct mount *p = __propagation_next(m, origin);
+
+ while (p && peers(m, p))
+ p = __propagation_next(p, origin);
- return m;
+ return p;
}
static struct mount *next_group(struct mount *m, struct mount *origin)
@@ -185,7 +191,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
while (1) {
while (1) {
struct mount *next;
- if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
+ if (!IS_MNT_NEW(m) && !hlist_empty(&m->mnt_slave_list))
return first_slave(m);
next = next_peer(m);
if (m->mnt_group_id == origin->mnt_group_id) {
@@ -198,7 +204,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
/* m is the last peer */
while (1) {
struct mount *master = m->mnt_master;
- if (m->mnt_slave.next != &master->mnt_slave_list)
+ if (m->mnt_slave.next)
return next_slave(m);
m = next_peer(master);
if (master->mnt_group_id == origin->mnt_group_id)
@@ -212,142 +218,113 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
}
}
-/* all accesses are serialized by namespace_sem */
-static struct mount *last_dest, *first_source, *last_source, *dest_master;
-static struct hlist_head *list;
-
-static inline bool peers(const struct mount *m1, const struct mount *m2)
-{
- return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id;
-}
-
-static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
+static bool need_secondary(struct mount *m, struct mountpoint *dest_mp)
{
- struct mount *child;
- int type;
/* skip ones added by this propagate_mnt() */
if (IS_MNT_NEW(m))
- return 0;
+ return false;
/* skip if mountpoint isn't visible in m */
if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root))
- return 0;
+ return false;
/* skip if m is in the anon_ns */
if (is_anon_ns(m->mnt_ns))
- return 0;
+ return false;
+ return true;
+}
- if (peers(m, last_dest)) {
- type = CL_MAKE_SHARED;
- } else {
- struct mount *n, *p;
- bool done;
- for (n = m; ; n = p) {
- p = n->mnt_master;
- if (p == dest_master || IS_MNT_MARKED(p))
- break;
+static struct mount *find_master(struct mount *m,
+ struct mount *last_copy,
+ struct mount *original)
+{
+ struct mount *p;
+
+ // ascend until there's a copy for something with the same master
+ for (;;) {
+ p = m->mnt_master;
+ if (!p || IS_MNT_MARKED(p))
+ break;
+ m = p;
+ }
+ while (!peers(last_copy, original)) {
+ struct mount *parent = last_copy->mnt_parent;
+ if (parent->mnt_master == p) {
+ if (!peers(parent, m))
+ last_copy = last_copy->mnt_master;
+ break;
}
- do {
- struct mount *parent = last_source->mnt_parent;
- if (peers(last_source, first_source))
- break;
- done = parent->mnt_master == p;
- if (done && peers(n, parent))
- break;
- last_source = last_source->mnt_master;
- } while (!done);
-
- type = CL_SLAVE;
- /* beginning of peer group among the slaves? */
- if (IS_MNT_SHARED(m))
- type |= CL_MAKE_SHARED;
+ last_copy = last_copy->mnt_master;
}
-
- child = copy_tree(last_source, last_source->mnt.mnt_root, type);
- if (IS_ERR(child))
- return PTR_ERR(child);
- read_seqlock_excl(&mount_lock);
- mnt_set_mountpoint(m, dest_mp, child);
- if (m->mnt_master != dest_master)
- SET_MNT_MARK(m->mnt_master);
- read_sequnlock_excl(&mount_lock);
- last_dest = m;
- last_source = child;
- hlist_add_head(&child->mnt_hash, list);
- return count_mounts(m->mnt_ns, child);
+ return last_copy;
}
-/*
- * mount 'source_mnt' under the destination 'dest_mnt' at
- * dentry 'dest_dentry'. And propagate that mount to
- * all the peer and slave mounts of 'dest_mnt'.
- * Link all the new mounts into a propagation tree headed at
- * source_mnt. Also link all the new mounts using ->mnt_list
- * headed at source_mnt's ->mnt_list
+/**
+ * propagate_mnt() - create secondary copies for tree attachment
+ * @dest_mnt: destination mount.
+ * @dest_mp: destination mountpoint.
+ * @source_mnt: source mount.
+ * @tree_list: list of secondaries to be attached.
*
- * @dest_mnt: destination mount.
- * @dest_dentry: destination dentry.
- * @source_mnt: source mount.
- * @tree_list : list of heads of trees to be attached.
+ * Create secondary copies for attaching a tree with root @source_mnt
+ * at mount @dest_mnt with mountpoint @dest_mp. Link all new mounts
+ * into a propagation graph. Set mountpoints for all secondaries,
+ * link their roots into @tree_list via ->mnt_hash.
*/
int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
- struct mount *source_mnt, struct hlist_head *tree_list)
+ struct mount *source_mnt, struct hlist_head *tree_list)
{
- struct mount *m, *n;
- int ret = 0;
-
- /*
- * we don't want to bother passing tons of arguments to
- * propagate_one(); everything is serialized by namespace_sem,
- * so globals will do just fine.
- */
- last_dest = dest_mnt;
- first_source = source_mnt;
- last_source = source_mnt;
- list = tree_list;
- dest_master = dest_mnt->mnt_master;
-
- /* all peers of dest_mnt, except dest_mnt itself */
- for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) {
- ret = propagate_one(n, dest_mp);
- if (ret)
- goto out;
- }
-
- /* all slave groups */
- for (m = next_group(dest_mnt, dest_mnt); m;
- m = next_group(m, dest_mnt)) {
- /* everything in that slave group */
- n = m;
+ struct mount *m, *n, *copy, *this;
+ int err = 0, type;
+
+ if (dest_mnt->mnt_master)
+ SET_MNT_MARK(dest_mnt->mnt_master);
+
+ /* iterate over peer groups, depth first */
+ for (m = dest_mnt; m && !err; m = next_group(m, dest_mnt)) {
+ if (m == dest_mnt) { // have one for dest_mnt itself
+ copy = source_mnt;
+ type = CL_MAKE_SHARED;
+ n = next_peer(m);
+ if (n == m)
+ continue;
+ } else {
+ type = CL_SLAVE;
+ /* beginning of peer group among the slaves? */
+ if (IS_MNT_SHARED(m))
+ type |= CL_MAKE_SHARED;
+ n = m;
+ }
do {
- ret = propagate_one(n, dest_mp);
- if (ret)
- goto out;
- n = next_peer(n);
- } while (n != m);
+ if (!need_secondary(n, dest_mp))
+ continue;
+ if (type & CL_SLAVE) // first in this peer group
+ copy = find_master(n, copy, source_mnt);
+ this = copy_tree(copy, copy->mnt.mnt_root, type);
+ if (IS_ERR(this)) {
+ err = PTR_ERR(this);
+ break;
+ }
+ read_seqlock_excl(&mount_lock);
+ mnt_set_mountpoint(n, dest_mp, this);
+ read_sequnlock_excl(&mount_lock);
+ if (n->mnt_master)
+ SET_MNT_MARK(n->mnt_master);
+ copy = this;
+ hlist_add_head(&this->mnt_hash, tree_list);
+ err = count_mounts(n->mnt_ns, this);
+ if (err)
+ break;
+ type = CL_MAKE_SHARED;
+ } while ((n = next_peer(n)) != m);
}
-out:
- read_seqlock_excl(&mount_lock);
+
hlist_for_each_entry(n, tree_list, mnt_hash) {
m = n->mnt_parent;
- if (m->mnt_master != dest_mnt->mnt_master)
+ if (m->mnt_master)
CLEAR_MNT_MARK(m->mnt_master);
}
- read_sequnlock_excl(&mount_lock);
- return ret;
-}
-
-static struct mount *find_topper(struct mount *mnt)
-{
- /* If there is exactly one mount covering mnt completely return it. */
- struct mount *child;
-
- if (!list_is_singular(&mnt->mnt_mounts))
- return NULL;
-
- child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child);
- if (child->mnt_mountpoint != mnt->mnt.mnt_root)
- return NULL;
-
- return child;
+ if (dest_mnt->mnt_master)
+ CLEAR_MNT_MARK(dest_mnt->mnt_master);
+ return err;
}
/*
@@ -407,12 +384,8 @@ bool propagation_would_overmount(const struct mount *from,
*/
int propagate_mount_busy(struct mount *mnt, int refcnt)
{
- struct mount *m, *child, *topper;
struct mount *parent = mnt->mnt_parent;
- if (mnt == parent)
- return do_refcount_check(mnt, refcnt);
-
/*
* quickly check if the current mount can be unmounted.
* If not, we don't have to go checking for all other
@@ -421,23 +394,27 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
if (!list_empty(&mnt->mnt_mounts) || do_refcount_check(mnt, refcnt))
return 1;
- for (m = propagation_next(parent, parent); m;
+ if (mnt == parent)
+ return 0;
+
+ for (struct mount *m = propagation_next(parent, parent); m;
m = propagation_next(m, parent)) {
- int count = 1;
- child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
- if (!child)
- continue;
+ struct list_head *head;
+ struct mount *child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
- /* Is there exactly one mount on the child that covers
- * it completely whose reference should be ignored?
- */
- topper = find_topper(child);
- if (topper)
- count += 1;
- else if (!list_empty(&child->mnt_mounts))
+ if (!child)
continue;
- if (do_refcount_check(child, count))
+ head = &child->mnt_mounts;
+ if (!list_empty(head)) {
+ /*
+ * a mount that covers child completely wouldn't prevent
+ * it being pulled out; any other would.
+ */
+ if (!list_is_singular(head) || !child->overmount)
+ continue;
+ }
+ if (do_refcount_check(child, 1))
return 1;
}
return 0;
@@ -463,181 +440,209 @@ void propagate_mount_unlock(struct mount *mnt)
}
}
-static void umount_one(struct mount *mnt, struct list_head *to_umount)
+static inline bool is_candidate(struct mount *m)
{
- CLEAR_MNT_MARK(mnt);
- mnt->mnt.mnt_flags |= MNT_UMOUNT;
- list_del_init(&mnt->mnt_child);
- list_del_init(&mnt->mnt_umounting);
- move_from_ns(mnt, to_umount);
+ return m->mnt_t_flags & T_UMOUNT_CANDIDATE;
}
-/*
- * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
- * parent propagates to.
- */
-static bool __propagate_umount(struct mount *mnt,
- struct list_head *to_umount,
- struct list_head *to_restore)
+static void umount_one(struct mount *m, struct list_head *to_umount)
{
- bool progress = false;
- struct mount *child;
-
- /*
- * The state of the parent won't change if this mount is
- * already unmounted or marked as without children.
- */
- if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED))
- goto out;
+ m->mnt.mnt_flags |= MNT_UMOUNT;
+ list_del_init(&m->mnt_child);
+ move_from_ns(m);
+ list_add_tail(&m->mnt_list, to_umount);
+}
- /* Verify topper is the only grandchild that has not been
- * speculatively unmounted.
- */
- list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
- if (child->mnt_mountpoint == mnt->mnt.mnt_root)
- continue;
- if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child))
- continue;
- /* Found a mounted child */
- goto children;
- }
+static void remove_from_candidate_list(struct mount *m)
+{
+ m->mnt_t_flags &= ~(T_MARKED | T_UMOUNT_CANDIDATE);
+ list_del_init(&m->mnt_list);
+}
- /* Mark mounts that can be unmounted if not locked */
- SET_MNT_MARK(mnt);
- progress = true;
+static void gather_candidates(struct list_head *set,
+ struct list_head *candidates)
+{
+ struct mount *m, *p, *q;
- /* If a mount is without children and not locked umount it. */
- if (!IS_MNT_LOCKED(mnt)) {
- umount_one(mnt, to_umount);
- } else {
-children:
- list_move_tail(&mnt->mnt_umounting, to_restore);
+ list_for_each_entry(m, set, mnt_list) {
+ if (is_candidate(m))
+ continue;
+ m->mnt_t_flags |= T_UMOUNT_CANDIDATE;
+ p = m->mnt_parent;
+ q = propagation_next(p, p);
+ while (q) {
+ struct mount *child = __lookup_mnt(&q->mnt,
+ m->mnt_mountpoint);
+ if (child) {
+ /*
+ * We might've already run into this one. That
+ * must've happened on earlier iteration of the
+ * outer loop; in that case we can skip those
+ * parents that get propagation from q - there
+ * will be nothing new on those as well.
+ */
+ if (is_candidate(child)) {
+ q = skip_propagation_subtree(q, p);
+ continue;
+ }
+ child->mnt_t_flags |= T_UMOUNT_CANDIDATE;
+ if (!will_be_unmounted(child))
+ list_add(&child->mnt_list, candidates);
+ }
+ q = propagation_next(q, p);
+ }
}
-out:
- return progress;
+ list_for_each_entry(m, set, mnt_list)
+ m->mnt_t_flags &= ~T_UMOUNT_CANDIDATE;
}
-static void umount_list(struct list_head *to_umount,
- struct list_head *to_restore)
+/*
+ * We know that some child of @m can't be unmounted. In all places where the
+ * chain of descent of @m has a child not overmounting the root of its parent,
+ * the parent can't be unmounted either.
+ */
+static void trim_ancestors(struct mount *m)
{
- struct mount *mnt, *child, *tmp;
- list_for_each_entry(mnt, to_umount, mnt_list) {
- list_for_each_entry_safe(child, tmp, &mnt->mnt_mounts, mnt_child) {
- /* topper? */
- if (child->mnt_mountpoint == mnt->mnt.mnt_root)
- list_move_tail(&child->mnt_umounting, to_restore);
- else
- umount_one(child, to_umount);
- }
+ struct mount *p;
+
+ for (p = m->mnt_parent; is_candidate(p); m = p, p = p->mnt_parent) {
+ if (IS_MNT_MARKED(m)) // all candidates beneath are overmounts
+ return;
+ SET_MNT_MARK(m);
+ if (m != p->overmount)
+ p->mnt_t_flags &= ~T_UMOUNT_CANDIDATE;
}
}
-static void restore_mounts(struct list_head *to_restore)
+/*
+ * Find and exclude all umount candidates forbidden by @m
+ * (see Documentation/filesystems/propagate_umount.txt)
+ * If we can immediately tell that @m is OK to unmount (unlocked
+ * and all children are already committed to unmounting) commit
+ * to unmounting it.
+ * Only @m itself might be taken from the candidates list;
+ * anything found by trim_ancestors() is marked non-candidate
+ * and left on the list.
+ */
+static void trim_one(struct mount *m, struct list_head *to_umount)
{
- /* Restore mounts to a clean working state */
- while (!list_empty(to_restore)) {
- struct mount *mnt, *parent;
- struct mountpoint *mp;
-
- mnt = list_first_entry(to_restore, struct mount, mnt_umounting);
- CLEAR_MNT_MARK(mnt);
- list_del_init(&mnt->mnt_umounting);
-
- /* Should this mount be reparented? */
- mp = mnt->mnt_mp;
- parent = mnt->mnt_parent;
- while (parent->mnt.mnt_flags & MNT_UMOUNT) {
- mp = parent->mnt_mp;
- parent = parent->mnt_parent;
- }
- if (parent != mnt->mnt_parent) {
- mnt_change_mountpoint(parent, mp, mnt);
- mnt_notify_add(mnt);
+ bool remove_this = false, found = false, umount_this = false;
+ struct mount *n;
+
+ if (!is_candidate(m)) { // trim_ancestors() left it on list
+ remove_from_candidate_list(m);
+ return;
+ }
+
+ list_for_each_entry(n, &m->mnt_mounts, mnt_child) {
+ if (!is_candidate(n)) {
+ found = true;
+ if (n != m->overmount) {
+ remove_this = true;
+ break;
+ }
}
}
+ if (found) {
+ trim_ancestors(m);
+ } else if (!IS_MNT_LOCKED(m) && list_empty(&m->mnt_mounts)) {
+ remove_this = true;
+ umount_this = true;
+ }
+ if (remove_this) {
+ remove_from_candidate_list(m);
+ if (umount_this)
+ umount_one(m, to_umount);
+ }
}
-static void cleanup_umount_visitations(struct list_head *visited)
+static void handle_locked(struct mount *m, struct list_head *to_umount)
{
- while (!list_empty(visited)) {
- struct mount *mnt =
- list_first_entry(visited, struct mount, mnt_umounting);
- list_del_init(&mnt->mnt_umounting);
+ struct mount *cutoff = m, *p;
+
+ if (!is_candidate(m)) { // trim_ancestors() left it on list
+ remove_from_candidate_list(m);
+ return;
+ }
+ for (p = m; is_candidate(p); p = p->mnt_parent) {
+ remove_from_candidate_list(p);
+ if (!IS_MNT_LOCKED(p))
+ cutoff = p->mnt_parent;
+ }
+ if (will_be_unmounted(p))
+ cutoff = p;
+ while (m != cutoff) {
+ umount_one(m, to_umount);
+ m = m->mnt_parent;
}
}
/*
- * collect all mounts that receive propagation from the mount in @list,
- * and return these additional mounts in the same list.
- * @list: the list of mounts to be unmounted.
+ * @m is not going away, and it overmounts the top of a stack of mounts
+ * that are going away. We know that all of those are fully overmounted
+ * by the one above (@m being the topmost of the chain), so @m can be slid
+ * in place where the bottom of the stack is attached.
*
- * vfsmount lock must be held for write
+ * NOTE: here we temporarily violate a constraint - two mounts end up with
+ * the same parent and mountpoint; that will be remedied as soon as we
+ * return from propagate_umount() - its caller (umount_tree()) will detach
+ * the stack from the parent it (and now @m) is attached to. umount_tree()
+ * might choose to keep unmounted pieces stuck to each other, but it always
+ * detaches them from the mounts that remain in the tree.
*/
-int propagate_umount(struct list_head *list)
+static void reparent(struct mount *m)
{
- struct mount *mnt;
- LIST_HEAD(to_restore);
- LIST_HEAD(to_umount);
- LIST_HEAD(visited);
-
- /* Find candidates for unmounting */
- list_for_each_entry_reverse(mnt, list, mnt_list) {
- struct mount *parent = mnt->mnt_parent;
- struct mount *m;
+ struct mount *p = m;
+ struct mountpoint *mp;
- /*
- * If this mount has already been visited it is known that it's
- * entire peer group and all of their slaves in the propagation
- * tree for the mountpoint has already been visited and there is
- * no need to visit them again.
- */
- if (!list_empty(&mnt->mnt_umounting))
- continue;
+ do {
+ mp = p->mnt_mp;
+ p = p->mnt_parent;
+ } while (will_be_unmounted(p));
- list_add_tail(&mnt->mnt_umounting, &visited);
- for (m = propagation_next(parent, parent); m;
- m = propagation_next(m, parent)) {
- struct mount *child = __lookup_mnt(&m->mnt,
- mnt->mnt_mountpoint);
- if (!child)
- continue;
+ mnt_change_mountpoint(p, mp, m);
+ mnt_notify_add(m);
+}
- if (!list_empty(&child->mnt_umounting)) {
- /*
- * If the child has already been visited it is
- * know that it's entire peer group and all of
- * their slaves in the propgation tree for the
- * mountpoint has already been visited and there
- * is no need to visit this subtree again.
- */
- m = skip_propagation_subtree(m, parent);
- continue;
- } else if (child->mnt.mnt_flags & MNT_UMOUNT) {
- /*
- * We have come across a partially unmounted
- * mount in a list that has not been visited
- * yet. Remember it has been visited and
- * continue about our merry way.
- */
- list_add_tail(&child->mnt_umounting, &visited);
- continue;
- }
+/**
+ * propagate_umount - apply propagation rules to the set of mounts for umount()
+ * @set: the list of mounts to be unmounted.
+ *
+ * Collect all mounts that receive propagation from the mounts in @set and have
+ * no obstacles to being unmounted. Add these additional mounts to the set.
+ *
+ * See Documentation/filesystems/propagate_umount.txt if you do anything in
+ * this area.
+ *
+ * Locks held:
+ * mount_lock (write_seqlock), namespace_sem (exclusive).
+ */
+void propagate_umount(struct list_head *set)
+{
+ struct mount *m, *p;
+ LIST_HEAD(to_umount); // committed to unmounting
+ LIST_HEAD(candidates); // undecided umount candidates
- /* Check the child and parents while progress is made */
- while (__propagate_umount(child,
- &to_umount, &to_restore)) {
- /* Is the parent a umount candidate? */
- child = child->mnt_parent;
- if (list_empty(&child->mnt_umounting))
- break;
- }
- }
+ // collect all candidates
+ gather_candidates(set, &candidates);
+
+ // reduce the set until it's non-shifting
+ list_for_each_entry_safe(m, p, &candidates, mnt_list)
+ trim_one(m, &to_umount);
+
+ // ... and non-revealing
+ while (!list_empty(&candidates)) {
+ m = list_first_entry(&candidates, struct mount, mnt_list);
+ handle_locked(m, &to_umount);
}
- umount_list(&to_umount, &to_restore);
- restore_mounts(&to_restore);
- cleanup_umount_visitations(&visited);
- list_splice_tail(&to_umount, list);
+ // now to_umount consists of all acceptable candidates
+ // deal with reparenting of remaining overmounts on those
+ list_for_each_entry(m, &to_umount, mnt_list) {
+ if (m->overmount)
+ reparent(m->overmount);
+ }
- return 0;
+ // and fold them into the set
+ list_splice_tail_init(&to_umount, set);
}
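A hedged end-to-end illustration of what the new algorithm computes (run as root; /a, /b and the tmpfs contents are hypothetical):

#include <sys/mount.h>
#include <sys/stat.h>

int demo(void)
{
	mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL); /* isolate */
	mount("none", "/a", "tmpfs", 0, NULL);
	mount(NULL, "/a", NULL, MS_SHARED, NULL);
	mount("/a", "/b", NULL, MS_BIND, NULL);   /* /b joins /a's peer group */
	mkdir("/a/sub", 0700);                    /* visible at /b/sub as well */
	mount("none", "/a/sub", "tmpfs", 0, NULL); /* propagates to /b/sub */
	/* the umount propagates too: propagate_umount() puts /b/sub
	 * into the same set, so both instances go away together */
	return umount("/a/sub");
}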
diff --git a/fs/pnode.h b/fs/pnode.h
index 34b6247af01d..00ab153e3e9d 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -10,14 +10,14 @@
#include <linux/list.h>
#include "mount.h"
-#define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED)
+#define IS_MNT_SHARED(m) ((m)->mnt_t_flags & T_SHARED)
#define IS_MNT_SLAVE(m) ((m)->mnt_master)
#define IS_MNT_NEW(m) (!(m)->mnt_ns)
-#define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED)
-#define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
-#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
-#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED)
-#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
+#define CLEAR_MNT_SHARED(m) ((m)->mnt_t_flags &= ~T_SHARED)
+#define IS_MNT_UNBINDABLE(m) ((m)->mnt_t_flags & T_UNBINDABLE)
+#define IS_MNT_MARKED(m) ((m)->mnt_t_flags & T_MARKED)
+#define SET_MNT_MARK(m) ((m)->mnt_t_flags |= T_MARKED)
+#define CLEAR_MNT_MARK(m) ((m)->mnt_t_flags &= ~T_MARKED)
#define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED)
#define CL_EXPIRE 0x01
@@ -25,21 +25,26 @@
#define CL_COPY_UNBINDABLE 0x04
#define CL_MAKE_SHARED 0x08
#define CL_PRIVATE 0x10
-#define CL_SHARED_TO_SLAVE 0x20
#define CL_COPY_MNT_NS_FILE 0x40
-#define CL_COPY_ALL (CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE)
-
+/*
+ * EXCL[namespace_sem]
+ */
static inline void set_mnt_shared(struct mount *mnt)
{
- mnt->mnt.mnt_flags &= ~MNT_SHARED_MASK;
- mnt->mnt.mnt_flags |= MNT_SHARED;
+ mnt->mnt_t_flags &= ~T_SHARED_MASK;
+ mnt->mnt_t_flags |= T_SHARED;
+}
+
+static inline bool peers(const struct mount *m1, const struct mount *m2)
+{
+ return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id;
}
void change_mnt_propagation(struct mount *, int);
int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
struct hlist_head *);
-int propagate_umount(struct list_head *);
+void propagate_umount(struct list_head *);
int propagate_mount_busy(struct mount *, int);
void propagate_mount_unlock(struct mount *);
void mnt_release_group_id(struct mount *);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index fe33a5843fbd..62d35631ba8c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -827,7 +827,13 @@ static const struct file_operations proc_single_file_operations = {
.release = single_release,
};
-
+/*
+ * proc_mem_open() can return errno, NULL or mm_struct*.
+ *
+ * - Returns NULL if the task has no mm (PF_KTHREAD or PF_EXITING)
+ * - Returns mm_struct* on success
+ * - Returns error code on failure
+ */
struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
{
struct task_struct *task = get_proc_task(inode);
@@ -854,8 +860,8 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
{
struct mm_struct *mm = proc_mem_open(inode, mode);
- if (IS_ERR(mm))
- return PTR_ERR(mm);
+ if (IS_ERR_OR_NULL(mm))
+ return mm ? PTR_ERR(mm) : -ESRCH;
file->private_data = mm;
return 0;
@@ -2698,8 +2704,7 @@ static struct dentry *proc_pident_instantiate(struct dentry *dentry,
inode->i_fop = p->fop;
ei->op = p->op;
pid_update_inode(task, inode);
- d_set_d_op(dentry, &pid_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return d_splice_alias_ops(inode, dentry, &pid_dentry_operations);
}
static struct dentry *proc_pident_lookup(struct inode *dir,
@@ -3285,7 +3290,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
}
#endif /* CONFIG_KSM */
-#ifdef CONFIG_STACKLEAK_METRICS
+#ifdef CONFIG_KSTACK_ERASE_METRICS
static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
@@ -3298,7 +3303,7 @@ static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
prev_depth, depth);
return 0;
}
-#endif /* CONFIG_STACKLEAK_METRICS */
+#endif /* CONFIG_KSTACK_ERASE_METRICS */
/*
* Thread groups
@@ -3405,7 +3410,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_LIVEPATCH
ONE("patch_state", S_IRUSR, proc_pid_patch_state),
#endif
-#ifdef CONFIG_STACKLEAK_METRICS
+#ifdef CONFIG_KSTACK_ERASE_METRICS
ONE("stack_depth", S_IRUGO, proc_stack_depth),
#endif
#ifdef CONFIG_PROC_PID_ARCH_STATUS
@@ -3495,8 +3500,7 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry,
set_nlink(inode, nlink_tgid);
pid_update_inode(task, inode);
- d_set_d_op(dentry, &pid_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return d_splice_alias_ops(inode, dentry, &pid_dentry_operations);
}
struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
@@ -3798,8 +3802,7 @@ static struct dentry *proc_task_instantiate(struct dentry *dentry,
set_nlink(inode, nlink_tid);
pid_update_inode(task, inode);
- d_set_d_op(dentry, &pid_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return d_splice_alias_ops(inode, dentry, &pid_dentry_operations);
}
static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 37aa778d1af7..9eeccff49b2a 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -352,18 +352,9 @@ static int proc_fd_getattr(struct mnt_idmap *idmap,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
- int rv = 0;
generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
-
- /* If it's a directory, put the number of open fds there */
- if (S_ISDIR(inode->i_mode)) {
- rv = proc_readfd_count(inode, &stat->size);
- if (rv < 0)
- return rv;
- }
-
- return rv;
+ return proc_readfd_count(inode, &stat->size);
}
const struct inode_operations proc_fd_inode_operations = {
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index a3e22803cddf..76e800e38c8f 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -254,8 +254,11 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry,
inode = proc_get_inode(dir->i_sb, de);
if (!inode)
return ERR_PTR(-ENOMEM);
- d_set_d_op(dentry, de->proc_dops);
- return d_splice_alias(inode, dentry);
+ if (de->flags & PROC_ENTRY_FORCE_LOOKUP)
+ return d_splice_alias_ops(inode, dentry,
+ &proc_net_dentry_ops);
+ return d_splice_alias_ops(inode, dentry,
+ &proc_misc_dentry_ops);
}
read_unlock(&proc_subdir_lock);
return ERR_PTR(-ENOENT);
@@ -448,9 +451,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
INIT_LIST_HEAD(&ent->pde_openers);
proc_set_user(ent, (*parent)->uid, (*parent)->gid);
- ent->proc_dops = &proc_misc_dentry_ops;
/* Revalidate everything under /proc/${pid}/net */
- if ((*parent)->proc_dops == &proc_net_dentry_ops)
+ if ((*parent)->flags & PROC_ENTRY_FORCE_LOOKUP)
pde_force_lookup(ent);
out:
@@ -569,6 +571,8 @@ static void pde_set_flags(struct proc_dir_entry *pde)
if (pde->proc_ops->proc_compat_ioctl)
pde->flags |= PROC_ENTRY_proc_compat_ioctl;
#endif
+ if (pde->proc_ops->proc_lseek)
+ pde->flags |= PROC_ENTRY_proc_lseek;
}
struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index a3eb3b740f76..129490151be1 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -42,7 +42,7 @@ static void proc_evict_inode(struct inode *inode)
head = ei->sysctl;
if (head) {
- RCU_INIT_POINTER(ei->sysctl, NULL);
+ WRITE_ONCE(ei->sysctl, NULL);
proc_sys_evict_inode(inode, head);
}
}
@@ -473,7 +473,7 @@ static int proc_reg_open(struct inode *inode, struct file *file)
typeof_member(struct proc_ops, proc_open) open;
struct pde_opener *pdeo;
- if (!pde->proc_ops->proc_lseek)
+ if (!pde_has_proc_lseek(pde))
file->f_mode &= ~FMODE_LSEEK;
if (pde_is_permanent(pde)) {
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 96122e91c645..e737401d7383 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -44,7 +44,6 @@ struct proc_dir_entry {
const struct proc_ops *proc_ops;
const struct file_operations *proc_dir_ops;
};
- const struct dentry_operations *proc_dops;
union {
const struct seq_operations *seq_ops;
int (*single_show)(struct seq_file *, void *);
@@ -99,6 +98,11 @@ static inline bool pde_has_proc_compat_ioctl(const struct proc_dir_entry *pde)
#endif
}
+static inline bool pde_has_proc_lseek(const struct proc_dir_entry *pde)
+{
+ return pde->flags & PROC_ENTRY_proc_lseek;
+}
+
extern struct kmem_cache *proc_dir_entry_cache;
void pde_free(struct proc_dir_entry *pde);
@@ -379,6 +383,11 @@ struct proc_maps_private {
struct task_struct *task;
struct mm_struct *mm;
struct vma_iterator iter;
+ loff_t last_pos;
+#ifdef CONFIG_PER_VMA_LOCK
+ bool mmap_locked;
+ struct vm_area_struct *locked_vma;
+#endif
#ifdef CONFIG_NUMA
struct mempolicy *task_mempolicy;
#endif
@@ -403,7 +412,7 @@ extern const struct dentry_operations proc_net_dentry_ops;
static inline void pde_force_lookup(struct proc_dir_entry *pde)
{
/* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */
- pde->proc_dops = &proc_net_dentry_ops;
+ pde->flags |= PROC_ENTRY_FORCE_LOOKUP;
}
/*
@@ -414,7 +423,6 @@ static inline void pde_force_lookup(struct proc_dir_entry *pde)
static inline struct dentry *proc_splice_unmountable(struct inode *inode,
struct dentry *dentry, const struct dentry_operations *d_ops)
{
- d_set_d_op(dentry, d_ops);
dont_mount(dentry);
- return d_splice_alias(inode, dentry);
+ return d_splice_alias_ops(inode, dentry, d_ops);
}
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index bc2bc60c36cc..a458f1e112fd 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -121,8 +121,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "NFS_Unstable: ", 0);
show_val_kb(m, "Bounce: ", 0);
- show_val_kb(m, "WritebackTmp: ",
- global_node_page_state(NR_WRITEBACK_TEMP));
+ show_val_kb(m, "WritebackTmp: ", 0);
show_val_kb(m, "CommitLimit: ", vm_commit_limit());
show_val_kb(m, "Committed_AS: ", committed);
seq_printf(m, "VmallocTotal: %8lu kB\n",
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index c610224faf10..4403a2e20c16 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -111,8 +111,7 @@ static struct dentry *proc_ns_instantiate(struct dentry *dentry,
ei->ns_ops = ns_ops;
pid_update_inode(task, inode);
- d_set_d_op(dentry, &pid_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return d_splice_alias_ops(inode, dentry, &pid_dentry_operations);
}
static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 23fc771100ae..ba3568e97fd1 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -22,6 +22,12 @@
#define KPMMASK (KPMSIZE - 1)
#define KPMBITS (KPMSIZE * BITS_PER_BYTE)
+enum kpage_operation {
+ KPAGE_FLAGS,
+ KPAGE_COUNT,
+ KPAGE_CGROUP,
+};
+
static inline unsigned long get_max_dump_pfn(void)
{
#ifdef CONFIG_SPARSEMEM
@@ -37,19 +43,33 @@ static inline unsigned long get_max_dump_pfn(void)
#endif
}
-/* /proc/kpagecount - an array exposing page mapcounts
- *
- * Each entry is a u64 representing the corresponding
- * physical page mapcount.
- */
-static ssize_t kpagecount_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+static u64 get_kpage_count(const struct page *page)
+{
+ struct page_snapshot ps;
+ u64 ret;
+
+ snapshot_page(&ps, page);
+
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ ret = folio_precise_page_mapcount(&ps.folio_snapshot,
+ &ps.page_snapshot);
+ else
+ ret = folio_average_page_mapcount(&ps.folio_snapshot);
+
+ return ret;
+}
+
+static ssize_t kpage_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos,
+ enum kpage_operation op)
{
const unsigned long max_dump_pfn = get_max_dump_pfn();
u64 __user *out = (u64 __user *)buf;
+ struct page *page;
unsigned long src = *ppos;
unsigned long pfn;
ssize_t ret = 0;
+ u64 info;
pfn = src / KPMSIZE;
if (src & KPMMASK || count & KPMMASK)
@@ -59,24 +79,31 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
while (count > 0) {
- struct page *page;
- u64 mapcount = 0;
-
/*
* TODO: ZONE_DEVICE support requires to identify
* memmaps that were actually initialized.
*/
page = pfn_to_online_page(pfn);
- if (page) {
- struct folio *folio = page_folio(page);
- if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
- mapcount = folio_precise_page_mapcount(folio, page);
- else
- mapcount = folio_average_page_mapcount(folio);
- }
-
- if (put_user(mapcount, out)) {
+ if (page) {
+ switch (op) {
+ case KPAGE_FLAGS:
+ info = stable_page_flags(page);
+ break;
+ case KPAGE_COUNT:
+ info = get_kpage_count(page);
+ break;
+ case KPAGE_CGROUP:
+ info = page_cgroup_ino(page);
+ break;
+ default:
+ info = 0;
+ break;
+ }
+ } else
+ info = 0;
+
+ if (put_user(info, out)) {
ret = -EFAULT;
break;
}
@@ -94,17 +121,23 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
return ret;
}
+/* /proc/kpagecount - an array exposing page mapcounts
+ *
+ * Each entry is a u64 representing the corresponding
+ * physical page mapcount.
+ */
+static ssize_t kpagecount_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return kpage_read(file, buf, count, ppos, KPAGE_COUNT);
+}
+
static const struct proc_ops kpagecount_proc_ops = {
.proc_flags = PROC_ENTRY_PERMANENT,
.proc_lseek = mem_lseek,
.proc_read = kpagecount_read,
};
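The consolidated reader keeps the ABI intact: /proc/kpagecount remains a pfn-indexed array of u64s, so userspace addresses it with pread() at pfn * 8 (a sketch; modern kernels require CAP_SYS_ADMIN):

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

static int64_t mapcount_of(uint64_t pfn)
{
	uint64_t val;
	int fd = open("/proc/kpagecount", O_RDONLY);

	if (fd < 0)
		return -1;
	/* KPMSIZE == sizeof(u64): the entry for pfn N lives at offset N * 8 */
	if (pread(fd, &val, sizeof(val), pfn * sizeof(val)) != sizeof(val))
		val = (uint64_t)-1;
	close(fd);
	return (int64_t)val;
}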
-/* /proc/kpageflags - an array exposing page flags
- *
- * Each entry is a u64 representing the corresponding
- * physical page flags.
- */
static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit)
{
@@ -114,6 +147,7 @@ static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit)
u64 stable_page_flags(const struct page *page)
{
const struct folio *folio;
+ struct page_snapshot ps;
unsigned long k;
unsigned long mapping;
bool is_anon;
@@ -125,20 +159,22 @@ u64 stable_page_flags(const struct page *page)
*/
if (!page)
return 1 << KPF_NOPAGE;
- folio = page_folio(page);
+
+ snapshot_page(&ps, page);
+ folio = &ps.folio_snapshot;
k = folio->flags;
mapping = (unsigned long)folio->mapping;
- is_anon = mapping & PAGE_MAPPING_ANON;
+ is_anon = mapping & FOLIO_MAPPING_ANON;
/*
* pseudo flags for the well known (anonymous) memory mapped pages
*/
- if (page_mapped(page))
+ if (folio_mapped(folio))
u |= 1 << KPF_MMAP;
if (is_anon) {
u |= 1 << KPF_ANON;
- if (mapping & PAGE_MAPPING_KSM)
+ if (mapping & FOLIO_MAPPING_KSM)
u |= 1 << KPF_KSM;
}
@@ -146,7 +182,7 @@ u64 stable_page_flags(const struct page *page)
* compound pages: export both head/tail info
* they together define a compound page's start/end pos and order
*/
- if (page == &folio->page)
+ if (ps.idx == 0)
u |= kpf_copy_bit(k, KPF_COMPOUND_HEAD, PG_head);
else
u |= 1 << KPF_COMPOUND_TAIL;
@@ -156,25 +192,19 @@ u64 stable_page_flags(const struct page *page)
folio_test_large_rmappable(folio)) {
/* Note: we indicate any THPs here, not just PMD-sized ones */
u |= 1 << KPF_THP;
- } else if (is_huge_zero_folio(folio)) {
+ } else if (is_huge_zero_pfn(ps.pfn)) {
u |= 1 << KPF_ZERO_PAGE;
u |= 1 << KPF_THP;
- } else if (is_zero_folio(folio)) {
+ } else if (is_zero_pfn(ps.pfn)) {
u |= 1 << KPF_ZERO_PAGE;
}
- /*
- * Caveats on high order pages: PG_buddy and PG_slab will only be set
- * on the head page.
- */
- if (PageBuddy(page))
- u |= 1 << KPF_BUDDY;
- else if (page_count(page) == 0 && is_free_buddy_page(page))
+ if (ps.flags & PAGE_SNAPSHOT_PG_BUDDY)
u |= 1 << KPF_BUDDY;
- if (PageOffline(page))
+ if (folio_test_offline(folio))
u |= 1 << KPF_OFFLINE;
- if (PageTable(page))
+ if (folio_test_pgtable(folio))
u |= 1 << KPF_PGTABLE;
if (folio_test_slab(folio))
u |= 1 << KPF_SLAB;
@@ -182,7 +212,7 @@ u64 stable_page_flags(const struct page *page)
#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
u |= kpf_copy_bit(k, KPF_IDLE, PG_idle);
#else
- if (folio_test_idle(folio))
+ if (ps.flags & PAGE_SNAPSHOT_PG_IDLE)
u |= 1 << KPF_IDLE;
#endif
@@ -208,7 +238,7 @@ u64 stable_page_flags(const struct page *page)
if (u & (1 << KPF_HUGE))
u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison);
else
- u |= kpf_copy_bit(page->flags, KPF_HWPOISON, PG_hwpoison);
+ u |= kpf_copy_bit(ps.page_snapshot.flags, KPF_HWPOISON, PG_hwpoison);
#endif
u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved);
@@ -225,47 +255,17 @@ u64 stable_page_flags(const struct page *page)
#endif
return u;
-};
+}
+/* /proc/kpageflags - an array exposing page flags
+ *
+ * Each entry is a u64 representing the corresponding
+ * physical page flags.
+ */
static ssize_t kpageflags_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+ size_t count, loff_t *ppos)
{
- const unsigned long max_dump_pfn = get_max_dump_pfn();
- u64 __user *out = (u64 __user *)buf;
- unsigned long src = *ppos;
- unsigned long pfn;
- ssize_t ret = 0;
-
- pfn = src / KPMSIZE;
- if (src & KPMMASK || count & KPMMASK)
- return -EINVAL;
- if (src >= max_dump_pfn * KPMSIZE)
- return 0;
- count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
-
- while (count > 0) {
- /*
- * TODO: ZONE_DEVICE support requires to identify
- * memmaps that were actually initialized.
- */
- struct page *page = pfn_to_online_page(pfn);
-
- if (put_user(stable_page_flags(page), out)) {
- ret = -EFAULT;
- break;
- }
-
- pfn++;
- out++;
- count -= KPMSIZE;
-
- cond_resched();
- }
-
- *ppos += (char __user *)out - buf;
- if (!ret)
- ret = (char __user *)out - buf;
- return ret;
+ return kpage_read(file, buf, count, ppos, KPAGE_FLAGS);
}
static const struct proc_ops kpageflags_proc_ops = {
@@ -276,53 +276,10 @@ static const struct proc_ops kpageflags_proc_ops = {
#ifdef CONFIG_MEMCG
static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+ size_t count, loff_t *ppos)
{
- const unsigned long max_dump_pfn = get_max_dump_pfn();
- u64 __user *out = (u64 __user *)buf;
- struct page *ppage;
- unsigned long src = *ppos;
- unsigned long pfn;
- ssize_t ret = 0;
- u64 ino;
-
- pfn = src / KPMSIZE;
- if (src & KPMMASK || count & KPMMASK)
- return -EINVAL;
- if (src >= max_dump_pfn * KPMSIZE)
- return 0;
- count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
-
- while (count > 0) {
- /*
- * TODO: ZONE_DEVICE support requires to identify
- * memmaps that were actually initialized.
- */
- ppage = pfn_to_online_page(pfn);
-
- if (ppage)
- ino = page_cgroup_ino(ppage);
- else
- ino = 0;
-
- if (put_user(ino, out)) {
- ret = -EFAULT;
- break;
- }
-
- pfn++;
- out++;
- count -= KPMSIZE;
-
- cond_resched();
- }
-
- *ppos += (char __user *)out - buf;
- if (!ret)
- ret = (char __user *)out - buf;
- return ret;
+ return kpage_read(file, buf, count, ppos, KPAGE_CGROUP);
}
-
static const struct proc_ops kpagecgroup_proc_ops = {
.proc_flags = PROC_ENTRY_PERMANENT,
.proc_lseek = mem_lseek,
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index cc9d74a06ff0..49ab74e0bfde 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -540,9 +540,8 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
- d_set_d_op(dentry, &proc_sys_dentry_operations);
inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
- err = d_splice_alias(inode, dentry);
+ err = d_splice_alias_ops(inode, dentry, &proc_sys_dentry_operations);
out:
if (h)
@@ -699,9 +698,9 @@ static bool proc_sys_fill_cache(struct file *file,
return false;
if (d_in_lookup(child)) {
struct dentry *res;
- d_set_d_op(child, &proc_sys_dentry_operations);
inode = proc_sys_make_inode(dir->d_sb, head, table);
- res = d_splice_alias(inode, child);
+ res = d_splice_alias_ops(inode, child,
+ &proc_sys_dentry_operations);
d_lookup_done(child);
if (unlikely(res)) {
dput(child);
@@ -918,17 +917,21 @@ static int proc_sys_compare(const struct dentry *dentry,
struct ctl_table_header *head;
struct inode *inode;
- /* Although proc doesn't have negative dentries, rcu-walk means
- * that inode here can be NULL */
- /* AV: can it, indeed? */
- inode = d_inode_rcu(dentry);
- if (!inode)
- return 1;
if (name->len != len)
return 1;
if (memcmp(name->name, str, len))
return 1;
- head = rcu_dereference(PROC_I(inode)->sysctl);
+
+ // false positive is fine here - we'll recheck anyway
+ if (d_in_lookup(dentry))
+ return 0;
+
+ inode = d_inode_rcu(dentry);
+ // we just might have run into dentry in the middle of __dentry_kill()
+ if (!inode)
+ return 1;
+
+ head = READ_ONCE(PROC_I(inode)->sysctl);
return !head || !sysctl_is_seen(head);
}
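
The proc_sysctl hunks replace the d_set_d_op() + d_splice_alias() pair
with d_splice_alias_ops(), which attaches the dentry operations at
splice time instead of ahead of it. A hedged sketch of the same pattern
in a filesystem ->lookup (the myfs_* names are hypothetical):

    static struct dentry *myfs_lookup(struct inode *dir, struct dentry *dentry,
                                      unsigned int flags)
    {
        struct inode *inode = myfs_make_inode(dir->i_sb); /* hypothetical helper */

        /* dentry ops are installed together with the alias */
        return d_splice_alias_ops(inode, dentry, &myfs_dentry_operations);
    }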
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 06a297a27ba3..ed86ac710384 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -363,12 +363,12 @@ static const struct inode_operations proc_root_inode_operations = {
* This is the root "inode" in the /proc tree..
*/
struct proc_dir_entry proc_root = {
- .low_ino = PROC_ROOT_INO,
- .namelen = 5,
- .mode = S_IFDIR | S_IRUGO | S_IXUGO,
- .nlink = 2,
+ .low_ino = PROCFS_ROOT_INO,
+ .namelen = 5,
+ .mode = S_IFDIR | S_IRUGO | S_IXUGO,
+ .nlink = 2,
.refcnt = REFCOUNT_INIT(1),
- .proc_iops = &proc_root_inode_operations,
+ .proc_iops = &proc_root_inode_operations,
.proc_dir_ops = &proc_root_operations,
.parent = &proc_root,
.subdir = RB_ROOT,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 994cde10e3f4..3d6d8a9f13fc 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -29,6 +29,9 @@
#include <asm/tlbflush.h>
#include "internal.h"
+#define SENTINEL_VMA_END -1
+#define SENTINEL_VMA_GATE -2
+
#define SEQ_PUT_DEC(str, val) \
seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8)
void task_mem(struct seq_file *m, struct mm_struct *mm)
@@ -36,9 +39,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
unsigned long text, lib, swap, anon, file, shmem;
unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
- anon = get_mm_counter(mm, MM_ANONPAGES);
- file = get_mm_counter(mm, MM_FILEPAGES);
- shmem = get_mm_counter(mm, MM_SHMEMPAGES);
+ anon = get_mm_counter_sum(mm, MM_ANONPAGES);
+ file = get_mm_counter_sum(mm, MM_FILEPAGES);
+ shmem = get_mm_counter_sum(mm, MM_SHMEMPAGES);
/*
* Note: to minimize their overhead, mm maintains hiwater_vm and
@@ -59,7 +62,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
text = min(text, mm->exec_vm << PAGE_SHIFT);
lib = (mm->exec_vm << PAGE_SHIFT) - text;
- swap = get_mm_counter(mm, MM_SWAPENTS);
+ swap = get_mm_counter_sum(mm, MM_SWAPENTS);
SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
@@ -92,12 +95,12 @@ unsigned long task_statm(struct mm_struct *mm,
unsigned long *shared, unsigned long *text,
unsigned long *data, unsigned long *resident)
{
- *shared = get_mm_counter(mm, MM_FILEPAGES) +
- get_mm_counter(mm, MM_SHMEMPAGES);
+ *shared = get_mm_counter_sum(mm, MM_FILEPAGES) +
+ get_mm_counter_sum(mm, MM_SHMEMPAGES);
*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
>> PAGE_SHIFT;
*data = mm->data_vm + mm->stack_vm;
- *resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
+ *resident = *shared + get_mm_counter_sum(mm, MM_ANONPAGES);
return mm->total_vm;
}
@@ -127,15 +130,134 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
}
#endif
-static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
- loff_t *ppos)
+#ifdef CONFIG_PER_VMA_LOCK
+
+static void unlock_vma(struct proc_maps_private *priv)
+{
+ if (priv->locked_vma) {
+ vma_end_read(priv->locked_vma);
+ priv->locked_vma = NULL;
+ }
+}
+
+static const struct seq_operations proc_pid_maps_op;
+
+static inline bool lock_vma_range(struct seq_file *m,
+ struct proc_maps_private *priv)
+{
+ /*
+ * smaps and numa_maps perform a page table walk and therefore require
+ * mmap_lock, but maps can be read by locking just the vma and walking
+ * the vma tree under RCU read protection.
+ */
+ if (m->op != &proc_pid_maps_op) {
+ if (mmap_read_lock_killable(priv->mm))
+ return false;
+
+ priv->mmap_locked = true;
+ } else {
+ rcu_read_lock();
+ priv->locked_vma = NULL;
+ priv->mmap_locked = false;
+ }
+
+ return true;
+}
+
+static inline void unlock_vma_range(struct proc_maps_private *priv)
{
- struct vm_area_struct *vma = vma_next(&priv->iter);
+ if (priv->mmap_locked) {
+ mmap_read_unlock(priv->mm);
+ } else {
+ unlock_vma(priv);
+ rcu_read_unlock();
+ }
+}
+
+static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
+ loff_t last_pos)
+{
+ struct vm_area_struct *vma;
+
+ if (priv->mmap_locked)
+ return vma_next(&priv->iter);
+
+ unlock_vma(priv);
+ vma = lock_next_vma(priv->mm, &priv->iter, last_pos);
+ if (!IS_ERR_OR_NULL(vma))
+ priv->locked_vma = vma;
+
+ return vma;
+}
+
+static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
+ loff_t pos)
+{
+ if (priv->mmap_locked)
+ return false;
+
+ rcu_read_unlock();
+ mmap_read_lock(priv->mm);
+ /* Reinitialize the iterator after taking mmap_lock */
+ vma_iter_set(&priv->iter, pos);
+ priv->mmap_locked = true;
+
+ return true;
+}
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline bool lock_vma_range(struct seq_file *m,
+ struct proc_maps_private *priv)
+{
+ return mmap_read_lock_killable(priv->mm) == 0;
+}
+
+static inline void unlock_vma_range(struct proc_maps_private *priv)
+{
+ mmap_read_unlock(priv->mm);
+}
+
+static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
+ loff_t last_pos)
+{
+ return vma_next(&priv->iter);
+}
+
+static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
+ loff_t pos)
+{
+ return false;
+}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
+static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos)
+{
+ struct proc_maps_private *priv = m->private;
+ struct vm_area_struct *vma;
+
+retry:
+ vma = get_next_vma(priv, *ppos);
+ /* EINTR or EAGAIN is possible */
+ if (IS_ERR(vma)) {
+ if (PTR_ERR(vma) == -EAGAIN && fallback_to_mmap_lock(priv, *ppos))
+ goto retry;
+
+ return vma;
+ }
+
+ /* Store previous position to be able to restart if needed */
+ priv->last_pos = *ppos;
if (vma) {
- *ppos = vma->vm_start;
+ /*
+ * Track the end of the reported vma to ensure position changes
+ * even if previous vma was merged with the next vma and we
+ * found the extended vma with the same vm_start.
+ */
+ *ppos = vma->vm_end;
} else {
- *ppos = -2UL;
+ *ppos = SENTINEL_VMA_GATE;
vma = get_gate_vma(priv->mm);
}
@@ -145,11 +267,11 @@ static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
static void *m_start(struct seq_file *m, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
- unsigned long last_addr = *ppos;
+ loff_t last_addr = *ppos;
struct mm_struct *mm;
/* See m_next(). Zero at the start or after lseek. */
- if (last_addr == -1UL)
+ if (last_addr == SENTINEL_VMA_END)
return NULL;
priv->task = get_proc_task(priv->inode);
@@ -163,28 +285,34 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
return NULL;
}
- if (mmap_read_lock_killable(mm)) {
+ if (!lock_vma_range(m, priv)) {
mmput(mm);
put_task_struct(priv->task);
priv->task = NULL;
return ERR_PTR(-EINTR);
}
- vma_iter_init(&priv->iter, mm, last_addr);
+ /*
+ * Reset current position if last_addr was set before
+ * and it's not a sentinel.
+ */
+ if (last_addr > 0)
+ *ppos = last_addr = priv->last_pos;
+ vma_iter_init(&priv->iter, mm, (unsigned long)last_addr);
hold_task_mempolicy(priv);
- if (last_addr == -2UL)
+ if (last_addr == SENTINEL_VMA_GATE)
return get_gate_vma(mm);
- return proc_get_vma(priv, ppos);
+ return proc_get_vma(m, ppos);
}
static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
{
- if (*ppos == -2UL) {
- *ppos = -1UL;
+ if (*ppos == SENTINEL_VMA_GATE) {
+ *ppos = SENTINEL_VMA_END;
return NULL;
}
- return proc_get_vma(m->private, ppos);
+ return proc_get_vma(m, ppos);
}
static void m_stop(struct seq_file *m, void *v)
@@ -196,7 +324,7 @@ static void m_stop(struct seq_file *m, void *v)
return;
release_task_mempolicy(priv);
- mmap_read_unlock(mm);
+ unlock_vma_range(priv);
mmput(mm);
put_task_struct(priv->task);
priv->task = NULL;
@@ -212,8 +340,8 @@ static int proc_maps_open(struct inode *inode, struct file *file,
priv->inode = inode;
priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
- if (IS_ERR(priv->mm)) {
- int err = PTR_ERR(priv->mm);
+ if (IS_ERR_OR_NULL(priv->mm)) {
+ int err = priv->mm ? PTR_ERR(priv->mm) : -ESRCH;
seq_release_private(inode, file);
return err;
@@ -1325,8 +1453,8 @@ static int smaps_rollup_open(struct inode *inode, struct file *file)
priv->inode = inode;
priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
- if (IS_ERR(priv->mm)) {
- ret = PTR_ERR(priv->mm);
+ if (IS_ERR_OR_NULL(priv->mm)) {
+ ret = priv->mm ? PTR_ERR(priv->mm) : -ESRCH;
single_release(inode, file);
goto out_free;
@@ -2069,8 +2197,8 @@ static int pagemap_open(struct inode *inode, struct file *file)
struct mm_struct *mm;
mm = proc_mem_open(inode, PTRACE_MODE_READ);
- if (IS_ERR(mm))
- return PTR_ERR(mm);
+ if (IS_ERR_OR_NULL(mm))
+ return mm ? PTR_ERR(mm) : -ESRCH;
file->private_data = mm;
return 0;
}
@@ -2087,7 +2215,8 @@ static int pagemap_release(struct inode *inode, struct file *file)
#define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \
PAGE_IS_FILE | PAGE_IS_PRESENT | \
PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \
- PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY)
+ PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY | \
+ PAGE_IS_GUARD)
#define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)
struct pagemap_scan_private {
@@ -2128,12 +2257,14 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
if (!pte_swp_uffd_wp_any(pte))
categories |= PAGE_IS_WRITTEN;
- if (p->masks_of_interest & PAGE_IS_FILE) {
- swp = pte_to_swp_entry(pte);
- if (is_pfn_swap_entry(swp) &&
- !folio_test_anon(pfn_swap_entry_folio(swp)))
- categories |= PAGE_IS_FILE;
- }
+ swp = pte_to_swp_entry(pte);
+ if (is_guard_swp_entry(swp))
+ categories |= PAGE_IS_GUARD;
+ else if ((p->masks_of_interest & PAGE_IS_FILE) &&
+ is_pfn_swap_entry(swp) &&
+ !folio_test_anon(pfn_swap_entry_folio(swp)))
+ categories |= PAGE_IS_FILE;
+
if (pte_swp_soft_dirty(pte))
categories |= PAGE_IS_SOFT_DIRTY;
}
@@ -2179,7 +2310,7 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
categories |= PAGE_IS_FILE;
}
- if (is_zero_pfn(pmd_pfn(pmd)))
+ if (is_huge_zero_pmd(pmd))
categories |= PAGE_IS_PFNZERO;
if (pmd_soft_dirty(pmd))
categories |= PAGE_IS_SOFT_DIRTY;
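
Taken together, the task_mmu.c hunks let /proc/<pid>/maps walk VMAs
under RCU with per-VMA locks, falling back to mmap_lock only on
contention. The control flow of proc_get_vma() above, restated as a
sketch:

    retry:
        vma = get_next_vma(priv, *ppos);
        if (IS_ERR(vma)) {
            /* lock_next_vma() may fail with -EINTR or -EAGAIN */
            if (PTR_ERR(vma) == -EAGAIN && fallback_to_mmap_lock(priv, *ppos))
                goto retry;             /* reiterate under mmap_lock */
            return vma;
        }
        priv->last_pos = *ppos;         /* restart point for the next m_start() */
        if (vma)
            *ppos = vma->vm_end;        /* vm_end, not vm_start: survives merging */
        else
            *ppos = SENTINEL_VMA_GATE;  /* next: report the gate vma */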
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index bce674533000..59bfd61d653a 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -260,8 +260,8 @@ static int maps_open(struct inode *inode, struct file *file,
priv->inode = inode;
priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
- if (IS_ERR(priv->mm)) {
- int err = PTR_ERR(priv->mm);
+ if (IS_ERR_OR_NULL(priv->mm)) {
+ int err = priv->mm ? PTR_ERR(priv->mm) : -ESRCH;
seq_release_private(inode, file);
return err;
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index bb3b769edc71..1a2e1185426c 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -300,7 +300,7 @@ static struct dentry *psinfo_lock_root(void)
return NULL;
root = pstore_sb->s_root;
- inode_lock(d_inode(root));
+ inode_lock_nested(d_inode(root), I_MUTEX_PARENT);
return root;
}
@@ -318,8 +318,7 @@ int pstore_put_backend_records(struct pstore_info *psi)
list_for_each_entry_safe(pos, tmp, &records_list, list) {
if (pos->record->psi == psi) {
list_del_init(&pos->list);
- d_invalidate(pos->dentry);
- simple_unlink(d_inode(root), pos->dentry);
+ locked_recursive_removal(pos->dentry, NULL);
pos->dentry = NULL;
}
}
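
The pstore change takes the root directory lock with the I_MUTEX_PARENT
lockdep subclass, presumably because locked_recursive_removal() locks
the victim's inode underneath it. A sketch of the general annotation
pattern (parent here is illustrative):

    inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
    /* child inode locks taken in here nest under the parent's
     * subclass, letting lockdep tell the two levels apart */
    inode_unlock(d_inode(parent));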
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index b45c7edc3225..b11f5b20b78b 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -41,7 +41,7 @@ static unsigned long ramfs_mmu_get_unmapped_area(struct file *file,
const struct file_operations ramfs_file_operations = {
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.fsync = noop_fsync,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 7a6d980e614d..77b8ca2757e0 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -28,7 +28,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
unsigned long len,
unsigned long pgoff,
unsigned long flags);
-static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
+static int ramfs_nommu_mmap_prepare(struct vm_area_desc *desc);
static unsigned ramfs_mmap_capabilities(struct file *file)
{
@@ -38,7 +38,7 @@ static unsigned ramfs_mmap_capabilities(struct file *file)
const struct file_operations ramfs_file_operations = {
.mmap_capabilities = ramfs_mmap_capabilities,
- .mmap = ramfs_nommu_mmap,
+ .mmap_prepare = ramfs_nommu_mmap_prepare,
.get_unmapped_area = ramfs_nommu_get_unmapped_area,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
@@ -262,12 +262,12 @@ out:
/*
* set up a mapping for shared memory segments
*/
-static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
+static int ramfs_nommu_mmap_prepare(struct vm_area_desc *desc)
{
- if (!is_nommu_shared_mapping(vma->vm_flags))
+ if (!is_nommu_shared_mapping(desc->vm_flags))
return -ENOSYS;
- file_accessed(file);
- vma->vm_ops = &generic_file_vm_ops;
+ file_accessed(desc->file);
+ desc->vm_ops = &generic_file_vm_ops;
return 0;
}
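
Both ramfs hunks follow the same .mmap to .mmap_prepare conversion: the
hook now receives a struct vm_area_desc describing the prospective
mapping and configures it before the VMA exists. The general shape, as
a sketch modelled on the nommu variant above:

    static int example_mmap_prepare(struct vm_area_desc *desc)
    {
        if (!is_nommu_shared_mapping(desc->vm_flags))
            return -ENOSYS;         /* nommu: only shared mappings work */
        file_accessed(desc->file);
        desc->vm_ops = &generic_file_vm_ops;
        return 0;
    }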
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 775fa905fda0..f8874c3b8c1e 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -269,6 +269,7 @@ static int ramfs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = RAMFS_MAGIC;
sb->s_op = &ramfs_ops;
+ sb->s_d_flags = DCACHE_DONTCACHE;
sb->s_time_gran = 1;
inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0);
diff --git a/fs/read_write.c b/fs/read_write.c
index 0ef70e128c4a..c5b6265d984b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -28,7 +28,7 @@
const struct file_operations generic_ro_fops = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
- .mmap = generic_file_readonly_mmap,
+ .mmap_prepare = generic_file_readonly_mmap_prepare,
.splice_read = filemap_splice_read,
};
@@ -237,7 +237,7 @@ EXPORT_SYMBOL(generic_llseek_cookie);
* @offset: file offset to seek to
* @whence: type of seek
*
- * This is a generic implemenation of ->llseek useable for all normal local
+ * This is a generic implementation of ->llseek usable for all normal local
* filesystems. It just updates the file offset to the value specified by
* @offset and @whence.
*/
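
Typical wiring, as in generic_ro_fops above (a sketch):

    const struct file_operations example_fops = {
        .llseek    = generic_file_llseek,
        .read_iter = generic_file_read_iter,
    };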
diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c
index 6ed2dfd4dbbd..d98e0d2de09f 100644
--- a/fs/resctrl/ctrlmondata.c
+++ b/fs/resctrl/ctrlmondata.c
@@ -594,9 +594,10 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
struct rmid_read rr = {0};
struct rdt_mon_domain *d;
struct rdtgroup *rdtgrp;
+ int domid, cpu, ret = 0;
struct rdt_resource *r;
+ struct cacheinfo *ci;
struct mon_data *md;
- int domid, ret = 0;
rdtgrp = rdtgroup_kn_lock_live(of->kn);
if (!rdtgrp) {
@@ -623,10 +624,14 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
* one that matches this cache id.
*/
list_for_each_entry(d, &r->mon_domains, hdr.list) {
- if (d->ci->id == domid) {
- rr.ci = d->ci;
+ if (d->ci_id == domid) {
+ rr.ci_id = d->ci_id;
+ cpu = cpumask_any(&d->hdr.cpu_mask);
+ ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
+ if (!ci)
+ continue;
mon_event_read(&rr, r, NULL, rdtgrp,
- &d->ci->shared_cpu_map, evtid, false);
+ &ci->shared_cpu_map, evtid, false);
goto checkresult;
}
}
diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h
index 9a8cf6f11151..0a1eedba2b03 100644
--- a/fs/resctrl/internal.h
+++ b/fs/resctrl/internal.h
@@ -98,7 +98,7 @@ struct mon_data {
* domains in @r sharing L3 @ci.id
* @evtid: Which monitor event to read.
* @first: Initialize MBM counter when true.
- * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains.
+ * @ci_id: Cacheinfo id for L3. Only set when @d is NULL. Used when summing domains.
* @err: Error encountered when reading counter.
* @val: Returned value of event counter. If @rgrp is a parent resource group,
* @val includes the sum of event counts from its child resource groups.
@@ -112,7 +112,7 @@ struct rmid_read {
struct rdt_mon_domain *d;
enum resctrl_event_id evtid;
bool first;
- struct cacheinfo *ci;
+ unsigned int ci_id;
int err;
u64 val;
void *arch_mon_ctx;
diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c
index bde2801289d3..f5637855c3ac 100644
--- a/fs/resctrl/monitor.c
+++ b/fs/resctrl/monitor.c
@@ -361,6 +361,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
{
int cpu = smp_processor_id();
struct rdt_mon_domain *d;
+ struct cacheinfo *ci;
struct mbm_state *m;
int err, ret;
u64 tval = 0;
@@ -388,7 +389,8 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
}
/* Summing domains that share a cache, must be on a CPU for that cache. */
- if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map))
+ ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
+ if (!ci || ci->id != rr->ci_id)
return -EINVAL;
/*
@@ -400,7 +402,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
*/
ret = -EINVAL;
list_for_each_entry(d, &rr->r->mon_domains, hdr.list) {
- if (d->ci->id != rr->ci->id)
+ if (d->ci_id != rr->ci_id)
continue;
err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
rr->evtid, &tval, rr->arch_mon_ctx);
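
The resctrl hunks replace the cached struct cacheinfo pointer with a
plain cache id: readers re-resolve the cacheinfo on the CPU they are
running on and match by id, as in __mon_event_count() above:

    ci = get_cpu_cacheinfo_level(smp_processor_id(), RESCTRL_L3_CACHE);
    if (!ci || ci->id != rr->ci_id)
        return -EINVAL;     /* not on a CPU that shares this L3 */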
diff --git a/fs/resctrl/pseudo_lock.c b/fs/resctrl/pseudo_lock.c
index ccc2f9213b4b..87bbc2605de1 100644
--- a/fs/resctrl/pseudo_lock.c
+++ b/fs/resctrl/pseudo_lock.c
@@ -764,13 +764,9 @@ static ssize_t pseudo_lock_measure_trigger(struct file *file,
if (ret == 0) {
if (sel != 1 && sel != 2 && sel != 3)
return -EINVAL;
- ret = debugfs_file_get(file->f_path.dentry);
- if (ret)
- return ret;
ret = pseudo_lock_measure_cycles(rdtgrp, sel);
if (ret == 0)
ret = count;
- debugfs_file_put(file->f_path.dentry);
}
return ret;
diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c
index cc37f58b47dd..77d08229d855 100644
--- a/fs/resctrl/rdtgroup.c
+++ b/fs/resctrl/rdtgroup.c
@@ -536,6 +536,8 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
goto unlock;
}
+ rdt_last_cmd_clear();
+
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
ret = -EINVAL;
@@ -3034,7 +3036,7 @@ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
char name[32];
snc_mode = r->mon_scope == RESCTRL_L3_NODE;
- sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
+ sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id);
if (snc_mode)
sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id);
@@ -3059,7 +3061,7 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d,
return -EPERM;
list_for_each_entry(mevt, &r->evt_list, list) {
- domid = do_sum ? d->ci->id : d->hdr.id;
+ domid = do_sum ? d->ci_id : d->hdr.id;
priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum);
if (WARN_ON_ONCE(!priv))
return -EINVAL;
@@ -3087,7 +3089,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
lockdep_assert_held(&rdtgroup_mutex);
snc_mode = r->mon_scope == RESCTRL_L3_NODE;
- sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
+ sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id);
kn = kernfs_find_and_get(parent_kn, name);
if (kn) {
/*
@@ -3472,6 +3474,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
goto out_unlock;
}
+ rdt_last_cmd_clear();
+
/*
* Check that the parent directory for a monitor group is a "mon_groups"
* directory.
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
index 4520ca413867..4b77c6dc4418 100644
--- a/fs/romfs/mmap-nommu.c
+++ b/fs/romfs/mmap-nommu.c
@@ -61,9 +61,9 @@ static unsigned long romfs_get_unmapped_area(struct file *file,
* permit a R/O mapping to be made directly through onto an MTD device if
* possible
*/
-static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
+static int romfs_mmap_prepare(struct vm_area_desc *desc)
{
- return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -ENOSYS;
+ return is_nommu_shared_mapping(desc->vm_flags) ? 0 : -ENOSYS;
}
static unsigned romfs_mmap_capabilities(struct file *file)
@@ -79,7 +79,7 @@ const struct file_operations romfs_ro_fops = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.splice_read = filemap_splice_read,
- .mmap = romfs_mmap,
+ .mmap_prepare = romfs_mmap_prepare,
.get_unmapped_area = romfs_get_unmapped_area,
.mmap_capabilities = romfs_mmap_capabilities,
};
diff --git a/fs/select.c b/fs/select.c
index 9fb650d03d52..082cf60c7e23 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -192,7 +192,7 @@ static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *k
* and is paired with smp_store_mb() in poll_schedule_timeout.
*/
smp_wmb();
- pwq->triggered = 1;
+ WRITE_ONCE(pwq->triggered, 1);
/*
* Perform the default wake up operation using a dummy
@@ -237,7 +237,7 @@ static int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
int rc = -EINTR;
set_current_state(state);
- if (!pwq->triggered)
+ if (!READ_ONCE(pwq->triggered))
rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
__set_current_state(TASK_RUNNING);
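
The select.c change pairs the existing barriers with marked accesses so
the flag itself cannot be torn or cached by the compiler. In miniature
(a sketch of the pattern, not new code):

    /* waker: order prior stores before publishing the flag */
    smp_wmb();
    WRITE_ONCE(pwq->triggered, 1);

    /* waiter: test the flag only after set_current_state() */
    set_current_state(TASK_INTERRUPTIBLE);
    if (!READ_ONCE(pwq->triggered))
        schedule();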
diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c
index 89d2dbbb742c..b69daeb1301b 100644
--- a/fs/smb/client/cached_dir.c
+++ b/fs/smb/client/cached_dir.c
@@ -155,6 +155,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
struct cached_fids *cfids;
const char *npath;
int retries = 0, cur_sleep = 1;
+ __le32 lease_flags = 0;
if (cifs_sb->root == NULL)
return -ENOENT;
@@ -194,6 +195,7 @@ replay_again:
* from @cfids->entries. Caller will put last reference if the latter.
*/
if (cfid->has_lease && cfid->time) {
+ cfid->last_access_time = jiffies;
spin_unlock(&cfids->cfid_list_lock);
*ret_cfid = cfid;
kfree(utf16_path);
@@ -201,6 +203,8 @@ replay_again:
}
spin_unlock(&cfids->cfid_list_lock);
+ pfid = &cfid->fid;
+
/*
* Skip any prefix paths in @path as lookup_noperm_positive_unlocked() ends up
* calling ->lookup() which already adds those through
@@ -222,6 +226,25 @@ replay_again:
rc = -ENOENT;
goto out;
}
+ if (dentry->d_parent && server->dialect >= SMB30_PROT_ID) {
+ struct cached_fid *parent_cfid;
+
+ spin_lock(&cfids->cfid_list_lock);
+ list_for_each_entry(parent_cfid, &cfids->entries, entry) {
+ if (parent_cfid->dentry == dentry->d_parent) {
+ cifs_dbg(FYI, "found a parent cached file handle\n");
+ if (parent_cfid->has_lease && parent_cfid->time) {
+ lease_flags
+ |= SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE;
+ memcpy(pfid->parent_lease_key,
+ parent_cfid->fid.lease_key,
+ SMB2_LEASE_KEY_SIZE);
+ }
+ break;
+ }
+ }
+ spin_unlock(&cfids->cfid_list_lock);
+ }
}
cfid->dentry = dentry;
cfid->tcon = tcon;
@@ -236,7 +259,6 @@ replay_again:
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- pfid = &cfid->fid;
server->ops->new_lease_key(pfid);
memset(rqst, 0, sizeof(rqst));
@@ -256,6 +278,7 @@ replay_again:
FILE_READ_EA,
.disposition = FILE_OPEN,
.fid = pfid,
+ .lease_flags = lease_flags,
.replay = !!(retries),
};
@@ -341,6 +364,7 @@ replay_again:
cfid->file_all_info_is_valid = true;
cfid->time = jiffies;
+ cfid->last_access_time = jiffies;
spin_unlock(&cfids->cfid_list_lock);
/* At this point the directory handle is fully cached */
rc = 0;
@@ -487,8 +511,17 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb)
spin_lock(&cfids->cfid_list_lock);
list_for_each_entry(cfid, &cfids->entries, entry) {
tmp_list = kmalloc(sizeof(*tmp_list), GFP_ATOMIC);
- if (tmp_list == NULL)
- break;
+ if (tmp_list == NULL) {
+ /*
+ * If the kmalloc() fails, we won't drop all
+ * dentries, and unmounting is likely to trigger
+ * a 'Dentry still in use' error.
+ */
+ cifs_tcon_dbg(VFS, "Out of memory while dropping dentries\n");
+ spin_unlock(&cfids->cfid_list_lock);
+ spin_unlock(&cifs_sb->tlink_tree_lock);
+ goto done;
+ }
spin_lock(&cfid->fid_lock);
tmp_list->dentry = cfid->dentry;
cfid->dentry = NULL;
@@ -500,6 +533,7 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb)
}
spin_unlock(&cifs_sb->tlink_tree_lock);
+done:
list_for_each_entry_safe(tmp_list, q, &entry, entry) {
list_del(&tmp_list->entry);
dput(tmp_list->dentry);
@@ -585,7 +619,7 @@ static void cached_dir_put_work(struct work_struct *work)
queue_work(serverclose_wq, &cfid->close_work);
}
-int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16])
+bool cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16])
{
struct cached_fids *cfids = tcon->cfids;
struct cached_fid *cfid;
@@ -698,8 +732,8 @@ static void cfids_laundromat_worker(struct work_struct *work)
spin_lock(&cfids->cfid_list_lock);
list_for_each_entry_safe(cfid, q, &cfids->entries, entry) {
- if (cfid->time &&
- time_after(jiffies, cfid->time + HZ * dir_cache_timeout)) {
+ if (cfid->last_access_time &&
+ time_after(jiffies, cfid->last_access_time + HZ * dir_cache_timeout)) {
cfid->on_list = false;
list_move(&cfid->entry, &entry);
cfids->num_entries--;
diff --git a/fs/smb/client/cached_dir.h b/fs/smb/client/cached_dir.h
index 1dfe79d947a6..46b5a2fdf15b 100644
--- a/fs/smb/client/cached_dir.h
+++ b/fs/smb/client/cached_dir.h
@@ -14,19 +14,18 @@ struct cached_dirent {
char *name;
int namelen;
loff_t pos;
-
struct cifs_fattr fattr;
};
struct cached_dirents {
bool is_valid:1;
bool is_failed:1;
- struct dir_context *ctx; /*
- * Only used to make sure we only take entries
- * from a single context. Never dereferenced.
- */
+ struct file *file; /*
+ * Used to associate the cache with a single
+ * open file instance.
+ */
struct mutex de_mutex;
- int pos; /* Expected ctx->pos */
+ loff_t pos; /* Expected ctx->pos */
struct list_head entries;
};
@@ -39,6 +38,7 @@ struct cached_fid {
bool on_list:1;
bool file_all_info_is_valid:1;
unsigned long time; /* jiffies of when lease was taken */
+ unsigned long last_access_time; /* jiffies of when last accessed */
struct kref refcount;
struct cifs_fid fid;
spinlock_t fid_lock;
@@ -80,6 +80,6 @@ extern void drop_cached_dir_by_name(const unsigned int xid,
struct cifs_sb_info *cifs_sb);
extern void close_all_cached_dirs(struct cifs_sb_info *cifs_sb);
extern void invalidate_all_cached_dirs(struct cifs_tcon *tcon);
-extern int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]);
+extern bool cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]);
#endif /* _CACHED_DIR_H */
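
With last_access_time in place, a cached handle kept warm by steady use
is no longer evicted just because it was opened long ago: the hit path
stamps it (cfid->last_access_time = jiffies) and the laundromat expires
on that stamp rather than the open time:

    if (cfid->last_access_time &&
        time_after(jiffies, cfid->last_access_time + HZ * dir_cache_timeout))
        /* evict the cached directory handle */;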
diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c
index e03c890de0a0..f1cea365b6f1 100644
--- a/fs/smb/client/cifs_debug.c
+++ b/fs/smb/client/cifs_debug.c
@@ -26,6 +26,7 @@
#include "smbdirect.h"
#endif
#include "cifs_swn.h"
+#include "cached_dir.h"
void
cifs_dump_mem(char *label, void *data, int length)
@@ -280,6 +281,54 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
return 0;
}
+static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v)
+{
+ struct list_head *stmp, *tmp, *tmp1;
+ struct TCP_Server_Info *server;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
+ struct cached_fids *cfids;
+ struct cached_fid *cfid;
+ LIST_HEAD(entry);
+
+ seq_puts(m, "# Version:1\n");
+ seq_puts(m, "# Format:\n");
+ seq_puts(m, "# <tree id> <sess id> <persistent fid> <path>\n");
+
+ spin_lock(&cifs_tcp_ses_lock);
+ list_for_each(stmp, &cifs_tcp_ses_list) {
+ server = list_entry(stmp, struct TCP_Server_Info,
+ tcp_ses_list);
+ list_for_each(tmp, &server->smb_ses_list) {
+ ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
+ list_for_each(tmp1, &ses->tcon_list) {
+ tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
+ cfids = tcon->cfids;
+ spin_lock(&cfids->cfid_list_lock); /* check lock ordering */
+ seq_printf(m, "Num entries: %d\n", cfids->num_entries);
+ list_for_each_entry(cfid, &cfids->entries, entry) {
+ seq_printf(m, "0x%x 0x%llx 0x%llx %s",
+ tcon->tid,
+ ses->Suid,
+ cfid->fid.persistent_fid,
+ cfid->path);
+ if (cfid->file_all_info_is_valid)
+ seq_printf(m, "\tvalid file info");
+ if (cfid->dirents.is_valid)
+ seq_printf(m, ", valid dirents");
+ seq_printf(m, "\n");
+ }
+ spin_unlock(&cfids->cfid_list_lock);
+
+ }
+ }
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
+ seq_putc(m, '\n');
+ return 0;
+}
+
static __always_inline const char *compression_alg_str(__le16 alg)
{
switch (alg) {
@@ -362,6 +411,10 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
c = 0;
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ struct smbdirect_socket_parameters *sp;
+#endif
+
/* channel info will be printed as a part of sessions below */
if (SERVER_IS_CHAN(server))
continue;
@@ -383,25 +436,26 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
seq_printf(m, "\nSMBDirect transport not available");
goto skip_rdma;
}
+ sp = &server->smbd_conn->socket.parameters;
seq_printf(m, "\nSMBDirect (in hex) protocol version: %x "
"transport status: %x",
server->smbd_conn->protocol,
- server->smbd_conn->transport_status);
+ server->smbd_conn->socket.status);
seq_printf(m, "\nConn receive_credit_max: %x "
"send_credit_target: %x max_send_size: %x",
- server->smbd_conn->receive_credit_max,
- server->smbd_conn->send_credit_target,
- server->smbd_conn->max_send_size);
+ sp->recv_credit_max,
+ sp->send_credit_target,
+ sp->max_send_size);
seq_printf(m, "\nConn max_fragmented_recv_size: %x "
"max_fragmented_send_size: %x max_receive_size:%x",
- server->smbd_conn->max_fragmented_recv_size,
- server->smbd_conn->max_fragmented_send_size,
- server->smbd_conn->max_receive_size);
+ sp->max_fragmented_recv_size,
+ sp->max_fragmented_send_size,
+ sp->max_recv_size);
seq_printf(m, "\nConn keep_alive_interval: %x "
"max_readwrite_size: %x rdma_readwrite_threshold: %x",
- server->smbd_conn->keep_alive_interval,
- server->smbd_conn->max_readwrite_size,
+ sp->keepalive_interval_msec * 1000,
+ sp->max_read_write_size,
server->smbd_conn->rdma_readwrite_threshold);
seq_printf(m, "\nDebug count_get_receive_buffer: %x "
"count_put_receive_buffer: %x count_send_empty: %x",
@@ -858,6 +912,9 @@ cifs_proc_init(void)
proc_create_single("open_files", 0400, proc_fs_cifs,
cifs_debug_files_proc_show);
+ proc_create_single("open_dirs", 0400, proc_fs_cifs,
+ cifs_debug_dirs_proc_show);
+
proc_create("Stats", 0644, proc_fs_cifs, &cifs_stats_proc_ops);
proc_create("cifsFYI", 0644, proc_fs_cifs, &cifsFYI_proc_ops);
proc_create("traceSMB", 0644, proc_fs_cifs, &traceSMB_proc_ops);
@@ -902,6 +959,7 @@ cifs_proc_clean(void)
remove_proc_entry("DebugData", proc_fs_cifs);
remove_proc_entry("open_files", proc_fs_cifs);
+ remove_proc_entry("open_dirs", proc_fs_cifs);
remove_proc_entry("cifsFYI", proc_fs_cifs);
remove_proc_entry("traceSMB", proc_fs_cifs);
remove_proc_entry("Stats", proc_fs_cifs);
@@ -1100,7 +1158,7 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
if ((count < 1) || (count > 11))
return -EINVAL;
- memset(flags_string, 0, 12);
+ memset(flags_string, 0, sizeof(flags_string));
if (copy_from_user(flags_string, buffer, count))
return -EFAULT;
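
Given the seq_printf() calls above, /proc/fs/cifs/open_dirs output looks
roughly like this (illustrative values only):

    # Version:1
    # Format:
    # <tree id> <sess id> <persistent fid> <path>
    Num entries: 1
    0x5 0xae64e11 0x400000001 \dir	valid file info, valid dirents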
diff --git a/fs/smb/client/cifs_ioctl.h b/fs/smb/client/cifs_ioctl.h
index 26327442e383..b51ce64fcccf 100644
--- a/fs/smb/client/cifs_ioctl.h
+++ b/fs/smb/client/cifs_ioctl.h
@@ -61,7 +61,7 @@ struct smb_query_info {
struct smb3_key_debug_info {
__u64 Suid;
__u16 cipher_type;
- __u8 auth_key[16]; /* SMB2_NTLMV2_SESSKEY_SIZE */
+ __u8 auth_key[SMB2_NTLMV2_SESSKEY_SIZE];
__u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE];
__u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE];
} __packed;
diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c
index 35892df7335c..3cc686246908 100644
--- a/fs/smb/client/cifsencrypt.c
+++ b/fs/smb/client/cifsencrypt.c
@@ -343,7 +343,7 @@ static struct ntlmssp2_name *find_next_av(struct cifs_ses *ses,
len = AV_LEN(av);
if (AV_TYPE(av) == NTLMSSP_AV_EOL)
return NULL;
- if (!len || (u8 *)av + sizeof(*av) + len > end)
+ if ((u8 *)av + sizeof(*av) + len > end)
return NULL;
return av;
}
@@ -363,7 +363,7 @@ static int find_av_name(struct cifs_ses *ses, u16 type, char **name, u16 maxlen)
av_for_each_entry(ses, av) {
len = AV_LEN(av);
- if (AV_TYPE(av) != type)
+ if (AV_TYPE(av) != type || !len)
continue;
if (!IS_ALIGNED(len, sizeof(__le16))) {
cifs_dbg(VFS | ONCE, "%s: bad length(%u) for type %u\n",
@@ -532,17 +532,67 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash, struct shash_
return rc;
}
+/*
+ * Set up NTLMv2 response blob with SPN (cifs/<hostname>) appended to the
+ * existing list of AV pairs.
+ */
+static int set_auth_key_response(struct cifs_ses *ses)
+{
+ size_t baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp);
+ size_t len, spnlen, tilen = 0, num_avs = 2 /* SPN + EOL */;
+ struct TCP_Server_Info *server = ses->server;
+ char *spn __free(kfree) = NULL;
+ struct ntlmssp2_name *av;
+ char *rsp = NULL;
+ int rc;
+
+ spnlen = strlen(server->hostname);
+ len = sizeof("cifs/") + spnlen;
+ spn = kmalloc(len, GFP_KERNEL);
+ if (!spn) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ spnlen = scnprintf(spn, len, "cifs/%.*s",
+ (int)spnlen, server->hostname);
+
+ av_for_each_entry(ses, av)
+ tilen += sizeof(*av) + AV_LEN(av);
+
+ len = baselen + tilen + spnlen * sizeof(__le16) + num_avs * sizeof(*av);
+ rsp = kmalloc(len, GFP_KERNEL);
+ if (!rsp) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ memcpy(rsp + baselen, ses->auth_key.response, tilen);
+ av = (void *)(rsp + baselen + tilen);
+ av->type = cpu_to_le16(NTLMSSP_AV_TARGET_NAME);
+ av->length = cpu_to_le16(spnlen * sizeof(__le16));
+ cifs_strtoUTF16((__le16 *)av->data, spn, spnlen, ses->local_nls);
+ av = (void *)((__u8 *)av + sizeof(*av) + AV_LEN(av));
+ av->type = cpu_to_le16(NTLMSSP_AV_EOL);
+ av->length = 0;
+
+ rc = 0;
+ ses->auth_key.len = len;
+out:
+ ses->auth_key.response = rsp;
+ return rc;
+}
+
int
setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
{
struct shash_desc *hmacmd5 = NULL;
- int rc;
- int baselen;
- unsigned int tilen;
+ unsigned char *tiblob = NULL; /* target info blob */
struct ntlmv2_resp *ntlmv2;
char ntlmv2_hash[16];
- unsigned char *tiblob = NULL; /* target info blob */
__le64 rsp_timestamp;
+ __u64 cc;
+ int rc;
if (nls_cp == NULL) {
cifs_dbg(VFS, "%s called with nls_cp==NULL\n", __func__);
@@ -588,32 +638,25 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
* (as Windows 7 does)
*/
rsp_timestamp = find_timestamp(ses);
+ get_random_bytes(&cc, sizeof(cc));
- baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp);
- tilen = ses->auth_key.len;
- tiblob = ses->auth_key.response;
+ cifs_server_lock(ses->server);
- ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL);
- if (!ses->auth_key.response) {
- rc = -ENOMEM;
+ tiblob = ses->auth_key.response;
+ rc = set_auth_key_response(ses);
+ if (rc) {
ses->auth_key.len = 0;
- goto setup_ntlmv2_rsp_ret;
+ goto unlock;
}
- ses->auth_key.len += baselen;
ntlmv2 = (struct ntlmv2_resp *)
(ses->auth_key.response + CIFS_SESS_KEY_SIZE);
ntlmv2->blob_signature = cpu_to_le32(0x00000101);
ntlmv2->reserved = 0;
ntlmv2->time = rsp_timestamp;
-
- get_random_bytes(&ntlmv2->client_chal, sizeof(ntlmv2->client_chal));
+ ntlmv2->client_chal = cc;
ntlmv2->reserved2 = 0;
- memcpy(ses->auth_key.response + baselen, tiblob, tilen);
-
- cifs_server_lock(ses->server);
-
rc = cifs_alloc_hash("hmac(md5)", &hmacmd5);
if (rc) {
cifs_dbg(VFS, "Could not allocate HMAC-MD5, rc=%d\n", rc);
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index fb04e263611c..0fdadd668a81 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -70,7 +70,6 @@ bool require_gcm_256; /* false by default */
bool enable_negotiate_signing; /* false by default */
unsigned int global_secflags = CIFSSEC_DEF;
/* unsigned int ntlmv2_support = 0; */
-unsigned int sign_CIFS_PDUs = 1;
/*
* Global transaction id (XID) information
@@ -261,9 +260,9 @@ cifs_read_super(struct super_block *sb)
}
if (tcon->nocase)
- sb->s_d_op = &cifs_ci_dentry_ops;
+ set_default_d_op(sb, &cifs_ci_dentry_ops);
else
- sb->s_d_op = &cifs_dentry_ops;
+ set_default_d_op(sb, &cifs_dentry_ops);
sb->s_root = d_make_root(inode);
if (!sb->s_root) {
@@ -1526,7 +1525,7 @@ const struct file_operations cifs_file_ops = {
.flock = cifs_flock,
.fsync = cifs_fsync,
.flush = cifs_flush,
- .mmap = cifs_file_mmap,
+ .mmap_prepare = cifs_file_mmap_prepare,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
.llseek = cifs_llseek,
@@ -1546,7 +1545,7 @@ const struct file_operations cifs_file_strict_ops = {
.flock = cifs_flock,
.fsync = cifs_strict_fsync,
.flush = cifs_flush,
- .mmap = cifs_file_strict_mmap,
+ .mmap_prepare = cifs_file_strict_mmap_prepare,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
.llseek = cifs_llseek,
@@ -1566,7 +1565,7 @@ const struct file_operations cifs_file_direct_ops = {
.flock = cifs_flock,
.fsync = cifs_fsync,
.flush = cifs_flush,
- .mmap = cifs_file_mmap,
+ .mmap_prepare = cifs_file_mmap_prepare,
.splice_read = copy_splice_read,
.splice_write = iter_file_splice_write,
.unlocked_ioctl = cifs_ioctl,
@@ -1584,7 +1583,7 @@ const struct file_operations cifs_file_nobrl_ops = {
.release = cifs_close,
.fsync = cifs_fsync,
.flush = cifs_flush,
- .mmap = cifs_file_mmap,
+ .mmap_prepare = cifs_file_mmap_prepare,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
.llseek = cifs_llseek,
@@ -1602,7 +1601,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
.release = cifs_close,
.fsync = cifs_strict_fsync,
.flush = cifs_flush,
- .mmap = cifs_file_strict_mmap,
+ .mmap_prepare = cifs_file_strict_mmap_prepare,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
.llseek = cifs_llseek,
@@ -1620,7 +1619,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
.release = cifs_close,
.fsync = cifs_fsync,
.flush = cifs_flush,
- .mmap = cifs_file_mmap,
+ .mmap_prepare = cifs_file_mmap_prepare,
.splice_read = copy_splice_read,
.splice_write = iter_file_splice_write,
.unlocked_ioctl = cifs_ioctl,
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index ca435a3841b8..487f39cff77e 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -103,8 +103,8 @@ extern int cifs_lock(struct file *, int, struct file_lock *);
extern int cifs_fsync(struct file *, loff_t, loff_t, int);
extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int);
extern int cifs_flush(struct file *, fl_owner_t id);
-extern int cifs_file_mmap(struct file *file, struct vm_area_struct *vma);
-extern int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma);
+int cifs_file_mmap_prepare(struct vm_area_desc *desc);
+int cifs_file_strict_mmap_prepare(struct vm_area_desc *desc);
extern const struct file_operations cifs_dir_ops;
extern int cifs_readdir(struct file *file, struct dir_context *ctx);
@@ -145,6 +145,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 54
-#define CIFS_VERSION "2.54"
+#define SMB3_PRODUCT_BUILD 55
+#define CIFS_VERSION "2.55"
#endif /* _CIFSFS_H */
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 3b32116b0b49..19dd901fe8ab 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -556,7 +556,7 @@ struct smb_version_operations {
void (*set_oplock_level)(struct cifsInodeInfo *cinode, __u32 oplock, __u16 epoch,
bool *purge_cache);
/* create lease context buffer for CREATE request */
- char * (*create_lease_buf)(u8 *lease_key, u8 oplock);
+ char * (*create_lease_buf)(u8 *lease_key, u8 oplock, u8 *parent_lease_key, __le32 le_flags);
/* parse lease context buffer and return oplock/epoch info */
__u8 (*parse_lease_buf)(void *buf, __u16 *epoch, char *lkey);
ssize_t (*copychunk_range)(const unsigned int,
@@ -627,12 +627,14 @@ struct smb_version_operations {
bool (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv);
struct reparse_data_buffer * (*get_reparse_point_buffer)(const struct kvec *rsp_iov,
u32 *plen);
- int (*create_reparse_symlink)(const unsigned int xid,
- struct inode *inode,
- struct dentry *dentry,
- struct cifs_tcon *tcon,
- const char *full_path,
- const char *symname);
+ struct inode * (*create_reparse_inode)(struct cifs_open_info_data *data,
+ struct super_block *sb,
+ const unsigned int xid,
+ struct cifs_tcon *tcon,
+ const char *full_path,
+ bool directory,
+ struct kvec *reparse_iov,
+ struct kvec *xattr_iov);
};
struct smb_version_values {
@@ -709,6 +711,7 @@ inc_rfc1001_len(void *buf, int count)
struct TCP_Server_Info {
struct list_head tcp_ses_list;
struct list_head smb_ses_list;
+ struct list_head rlist; /* reconnect list */
spinlock_t srv_lock; /* protect anything here that is not protected */
__u64 conn_id; /* connection identifier (useful for debugging) */
int srv_count; /* reference counter */
@@ -773,8 +776,10 @@ struct TCP_Server_Info {
char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
__u32 sequence_number; /* for signing, protected by srv_mutex */
__u32 reconnect_instance; /* incremented on each reconnect */
+ __le32 session_key_id; /* retrieved from negotiate response and sent in session setup request */
struct session_key session_key;
unsigned long lstrp; /* when we got last response from this server */
+ unsigned long neg_start; /* when negotiate started (jiffies) */
struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
#define CIFS_NEGFLAVOR_UNENCAP 1 /* wct == 17, but no ext_sec */
#define CIFS_NEGFLAVOR_EXTENDED 2 /* wct == 17, ext_sec bit set */
@@ -1084,6 +1089,7 @@ struct cifs_chan {
};
#define CIFS_SES_FLAG_SCALE_CHANNELS (0x1)
+#define CIFS_SES_FLAGS_PENDING_QUERY_INTERFACES (0x2)
/*
* Session structure. One of these for each uid session with a particular host
@@ -1300,6 +1306,7 @@ struct cifs_tcon {
bool use_persistent:1; /* use persistent instead of durable handles */
bool no_lease:1; /* Do not request leases on files or directories */
bool use_witness:1; /* use witness protocol */
+ bool dummy:1; /* dummy tcon used for reconnecting channels */
__le32 capabilities;
__u32 share_flags;
__u32 maximal_access;
@@ -1441,6 +1448,7 @@ struct cifs_open_parms {
bool reconnect:1;
bool replay:1; /* indicates that this open is for a replay */
struct kvec *ea_cctx;
+ __le32 lease_flags;
};
struct cifs_fid {
@@ -1448,6 +1456,7 @@ struct cifs_fid {
__u64 persistent_fid; /* persist file id for smb2 */
__u64 volatile_fid; /* volatile file id for smb2 */
__u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for smb2 */
+ __u8 parent_lease_key[SMB2_LEASE_KEY_SIZE];
__u8 create_guid[16];
__u32 access;
struct cifs_pending_open *pending_open;
@@ -1988,8 +1997,7 @@ require use of the stronger protocol */
* TCP_Server_Info-> TCP_Server_Info cifs_get_tcp_session
* reconnect_mutex
* TCP_Server_Info->srv_mutex TCP_Server_Info cifs_get_tcp_session
- * cifs_ses->session_mutex cifs_ses sesInfoAlloc
- * cifs_tcon
+ * cifs_ses->session_mutex cifs_ses sesInfoAlloc
* cifs_tcon->open_file_lock cifs_tcon->openFileList tconInfoAlloc
* cifs_tcon->pending_opens
* cifs_tcon->stat_lock cifs_tcon->bytes_read tconInfoAlloc
@@ -2008,21 +2016,25 @@ require use of the stronger protocol */
* ->oplock_credits
* ->reconnect_instance
* cifs_ses->ses_lock (anything that is not protected by another lock and can change)
+ * sesInfoAlloc
* cifs_ses->iface_lock cifs_ses->iface_list sesInfoAlloc
* ->iface_count
* ->iface_last_update
- * cifs_ses->chan_lock cifs_ses->chans
+ * cifs_ses->chan_lock cifs_ses->chans sesInfoAlloc
* ->chans_need_reconnect
* ->chans_in_reconnect
* cifs_tcon->tc_lock (anything that is not protected by another lock and can change)
+ * tcon_info_alloc
* inode->i_rwsem, taken by fs/netfs/locking.c e.g. should be taken before cifsInodeInfo locks
* cifsInodeInfo->open_file_lock cifsInodeInfo->openFileList cifs_alloc_inode
* cifsInodeInfo->writers_lock cifsInodeInfo->writers cifsInodeInfo_alloc
* cifsInodeInfo->lock_sem cifsInodeInfo->llist cifs_init_once
* ->can_cache_brlcks
* cifsInodeInfo->deferred_lock cifsInodeInfo->deferred_closes cifsInodeInfo_alloc
- * cached_fids->cfid_list_lock cifs_tcon->cfids->entries init_cached_dirs
- * cifsFileInfo->fh_mutex cifsFileInfo cifs_new_fileinfo
+ * cached_fids->cfid_list_lock cifs_tcon->cfids->entries init_cached_dirs
+ * cached_fid->fid_lock (anything that is not protected by another lock and can change)
+ * init_cached_dir
+ * cifsFileInfo->fh_mutex cifsFileInfo cifs_new_fileinfo
* cifsFileInfo->file_info_lock cifsFileInfo->count cifs_new_fileinfo
* ->invalidHandle initiate_cifs_search
* ->oplock_break_cancelled
diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h
index 1b79fe07476f..d9cf7db0ac35 100644
--- a/fs/smb/client/cifspdu.h
+++ b/fs/smb/client/cifspdu.h
@@ -597,7 +597,7 @@ typedef union smb_com_session_setup_andx {
__le16 MaxBufferSize;
__le16 MaxMpxCount;
__le16 VcNumber;
- __u32 SessionKey;
+ __le32 SessionKey;
__le16 SecurityBlobLength;
__u32 Reserved;
__le32 Capabilities; /* see below */
@@ -616,7 +616,7 @@ typedef union smb_com_session_setup_andx {
__le16 MaxBufferSize;
__le16 MaxMpxCount;
__le16 VcNumber;
- __u32 SessionKey;
+ __le32 SessionKey;
__le16 CaseInsensitivePasswordLength; /* ASCII password len */
__le16 CaseSensitivePasswordLength; /* Unicode password length*/
__u32 Reserved; /* see below */
@@ -654,7 +654,7 @@ typedef union smb_com_session_setup_andx {
__le16 MaxBufferSize;
__le16 MaxMpxCount;
__le16 VcNumber;
- __u32 SessionKey;
+ __le32 SessionKey;
__le16 PasswordLength;
__u32 Reserved; /* encrypt key len and offset */
__le16 ByteCount;
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index ecf774a8f1ca..40ec0634377f 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -136,6 +136,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid,
struct smb_hdr *out_buf,
int *bytes_returned);
+void smb2_query_server_interfaces(struct work_struct *work);
void
cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server,
bool all_channels);
@@ -151,8 +152,7 @@ extern bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 eof,
bool from_readdir);
extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
unsigned int bytes_written);
-void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result,
- bool was_async);
+void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result);
extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, int);
extern int cifs_get_writable_file(struct cifsInodeInfo *cifs_inode,
int flags,
@@ -483,6 +483,14 @@ extern int cifs_query_reparse_point(const unsigned int xid,
const char *full_path,
u32 *tag, struct kvec *rsp,
int *rsp_buftype);
+extern struct inode *cifs_create_reparse_inode(struct cifs_open_info_data *data,
+ struct super_block *sb,
+ const unsigned int xid,
+ struct cifs_tcon *tcon,
+ const char *full_path,
+ bool directory,
+ struct kvec *reparse_iov,
+ struct kvec *xattr_iov);
extern int CIFSSMB_set_compression(const unsigned int xid,
struct cifs_tcon *tcon, __u16 fid);
extern int CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms,
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index f55457b4b82e..6c890db06593 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -498,6 +498,7 @@ CIFSSMBNegotiate(const unsigned int xid,
server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
cifs_dbg(NOISY, "Max buf = %d\n", ses->server->maxBuf);
server->capabilities = le32_to_cpu(pSMBr->Capabilities);
+ server->session_key_id = pSMBr->SessionKey;
server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
server->timeAdj *= 60;
@@ -1333,7 +1334,12 @@ cifs_readv_callback(struct mid_q_entry *mid)
cifs_stats_bytes_read(tcon, rdata->got_bytes);
break;
case MID_REQUEST_SUBMITTED:
+ trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_req_submitted);
+ goto do_retry;
case MID_RETRY_NEEDED:
+ trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_retry_needed);
+do_retry:
+ __set_bit(NETFS_SREQ_NEED_RETRY, &rdata->subreq.flags);
rdata->result = -EAGAIN;
if (server->sign && rdata->got_bytes)
/* reset bytes number since we can not check a sign */
@@ -1342,8 +1348,14 @@ cifs_readv_callback(struct mid_q_entry *mid)
task_io_account_read(rdata->got_bytes);
cifs_stats_bytes_read(tcon, rdata->got_bytes);
break;
+ case MID_RESPONSE_MALFORMED:
+ trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_malformed);
+ rdata->result = -EIO;
+ break;
default:
+ trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_unknown);
rdata->result = -EIO;
+ break;
}
if (rdata->result == -ENODATA) {
@@ -1712,10 +1724,21 @@ cifs_writev_callback(struct mid_q_entry *mid)
}
break;
case MID_REQUEST_SUBMITTED:
+ trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_req_submitted);
+ __set_bit(NETFS_SREQ_NEED_RETRY, &wdata->subreq.flags);
+ result = -EAGAIN;
+ break;
case MID_RETRY_NEEDED:
+ trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_retry_needed);
+ __set_bit(NETFS_SREQ_NEED_RETRY, &wdata->subreq.flags);
result = -EAGAIN;
break;
+ case MID_RESPONSE_MALFORMED:
+ trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_malformed);
+ result = -EIO;
+ break;
default:
+ trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_unknown);
result = -EIO;
break;
}
@@ -1725,7 +1748,7 @@ cifs_writev_callback(struct mid_q_entry *mid)
server->credits, server->in_flight,
0, cifs_trace_rw_credits_write_response_clear);
wdata->credits.value = 0;
- cifs_write_subrequest_terminated(wdata, result, true);
+ cifs_write_subrequest_terminated(wdata, result);
release_mid(mid);
trace_smb3_rw_credits(credits.rreq_debug_id, credits.rreq_debug_index, 0,
server->credits, server->in_flight,
@@ -1813,7 +1836,7 @@ async_writev_out:
out:
if (rc) {
add_credits_and_wake_if(wdata->server, &wdata->credits, 0);
- cifs_write_subrequest_terminated(wdata, rc, false);
+ cifs_write_subrequest_terminated(wdata, rc);
}
}
@@ -2753,10 +2776,10 @@ int cifs_query_reparse_point(const unsigned int xid,
io_req->TotalParameterCount = 0;
io_req->TotalDataCount = 0;
- io_req->MaxParameterCount = cpu_to_le32(2);
+ io_req->MaxParameterCount = cpu_to_le32(0);
/* BB find exact data count max from sess structure BB */
io_req->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
- io_req->MaxSetupCount = 4;
+ io_req->MaxSetupCount = 1;
io_req->Reserved = 0;
io_req->ParameterOffset = 0;
io_req->DataCount = 0;
@@ -2783,6 +2806,22 @@ int cifs_query_reparse_point(const unsigned int xid,
goto error;
}
+ /* SetupCount must be 1, otherwise the offset to ByteCount is incorrect. */
+ if (io_rsp->SetupCount != 1) {
+ rc = -EIO;
+ goto error;
+ }
+
+ /*
+ * ReturnedDataLen is the output length of the executed IOCTL.
+ * DataCount is the output length transferred over the network.
+ * Check that we received the full FSCTL_GET_REPARSE_POINT buffer.
+ */
+ if (data_count != le16_to_cpu(io_rsp->ReturnedDataLen)) {
+ rc = -EIO;
+ goto error;
+ }
+
end = 2 + get_bcc(&io_rsp->hdr) + (__u8 *)&io_rsp->ByteCount;
start = (__u8 *)&io_rsp->hdr.Protocol + data_offset;
if (start >= end) {
@@ -2812,6 +2851,134 @@ error:
return rc;
}
+struct inode *cifs_create_reparse_inode(struct cifs_open_info_data *data,
+ struct super_block *sb,
+ const unsigned int xid,
+ struct cifs_tcon *tcon,
+ const char *full_path,
+ bool directory,
+ struct kvec *reparse_iov,
+ struct kvec *xattr_iov)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ struct cifs_open_parms oparms;
+ TRANSACT_IOCTL_REQ *io_req;
+ struct inode *new = NULL;
+ struct kvec in_iov[2];
+ struct kvec out_iov;
+ struct cifs_fid fid;
+ int io_req_len;
+ int oplock = 0;
+ int buf_type = 0;
+ int rc;
+
+ cifs_tcon_dbg(FYI, "%s: path=%s\n", __func__, full_path);
+
+ /*
+ * If the server filesystem does not support reparse points then do not
+ * attempt to create one. This prevents leaving an unusable empty
+ * object on the server.
+ */
+ if (!(le32_to_cpu(tcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS))
+ return ERR_PTR(-EOPNOTSUPP);
+
+#ifndef CONFIG_CIFS_XATTR
+ if (xattr_iov)
+ return ERR_PTR(-EOPNOTSUPP);
+#endif
+
+ oparms = CIFS_OPARMS(cifs_sb, tcon, full_path,
+ FILE_READ_ATTRIBUTES | FILE_WRITE_DATA | FILE_WRITE_EA,
+ FILE_CREATE,
+ (directory ? CREATE_NOT_FILE : CREATE_NOT_DIR) | OPEN_REPARSE_POINT,
+ ACL_NO_MODE);
+ oparms.fid = &fid;
+
+ rc = CIFS_open(xid, &oparms, &oplock, NULL);
+ if (rc)
+ return ERR_PTR(rc);
+
+#ifdef CONFIG_CIFS_XATTR
+ if (xattr_iov) {
+ struct smb2_file_full_ea_info *ea;
+
+ ea = &((struct smb2_create_ea_ctx *)xattr_iov->iov_base)->ea;
+ while (1) {
+ rc = CIFSSMBSetEA(xid,
+ tcon,
+ full_path,
+ &ea->ea_data[0],
+ &ea->ea_data[ea->ea_name_length+1],
+ le16_to_cpu(ea->ea_value_length),
+ cifs_sb->local_nls,
+ cifs_sb);
+ if (rc)
+ goto out_close;
+ if (le32_to_cpu(ea->next_entry_offset) == 0)
+ break;
+ ea = (struct smb2_file_full_ea_info *)((u8 *)ea +
+ le32_to_cpu(ea->next_entry_offset));
+ }
+ }
+#endif
+
+ rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **)&io_req, NULL);
+ if (rc)
+ goto out_close;
+
+ inc_rfc1001_len(io_req, sizeof(io_req->Pad));
+
+ io_req_len = be32_to_cpu(io_req->hdr.smb_buf_length) + sizeof(io_req->hdr.smb_buf_length);
+
+ /* NT IOCTL response contains a one-word output setup buffer holding the output data size. */
+ io_req->MaxSetupCount = 1;
+ /* NT IOCTL response does not contain output parameters. */
+ io_req->MaxParameterCount = cpu_to_le32(0);
+ /* FSCTL_SET_REPARSE_POINT response contains empty output data. */
+ io_req->MaxDataCount = cpu_to_le32(0);
+
+ io_req->TotalParameterCount = cpu_to_le32(0);
+ io_req->TotalDataCount = cpu_to_le32(reparse_iov->iov_len);
+ io_req->ParameterCount = io_req->TotalParameterCount;
+ io_req->ParameterOffset = cpu_to_le32(0);
+ io_req->DataCount = io_req->TotalDataCount;
+ io_req->DataOffset = cpu_to_le32(offsetof(typeof(*io_req), Data) -
+ sizeof(io_req->hdr.smb_buf_length));
+ io_req->SetupCount = 4;
+ io_req->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL);
+ io_req->FunctionCode = cpu_to_le32(FSCTL_SET_REPARSE_POINT);
+ io_req->Fid = fid.netfid;
+ io_req->IsFsctl = 1;
+ io_req->IsRootFlag = 0;
+ io_req->ByteCount = cpu_to_le16(le32_to_cpu(io_req->DataCount) + sizeof(io_req->Pad));
+
+ inc_rfc1001_len(io_req, reparse_iov->iov_len);
+
+ in_iov[0].iov_base = (char *)io_req;
+ in_iov[0].iov_len = io_req_len;
+ in_iov[1] = *reparse_iov;
+ rc = SendReceive2(xid, tcon->ses, in_iov, ARRAY_SIZE(in_iov), &buf_type,
+ CIFS_NO_RSP_BUF, &out_iov);
+
+ cifs_buf_release(io_req);
+
+ if (!rc)
+ rc = cifs_get_inode_info(&new, full_path, data, sb, xid, NULL);
+
+out_close:
+ CIFSSMBClose(xid, tcon, fid.netfid);
+
+ /*
+ * If CREATE was successful but FSCTL_SET_REPARSE_POINT failed then
+ * remove the intermediate object created by CREATE. Otherwise an
+ * empty object would stay on the server after the failed reparse call.
+ */
+ if (rc)
+ CIFSSMBDelFile(xid, tcon, full_path, cifs_sb, NULL);
+
+ return rc ? ERR_PTR(rc) : new;
+}
+
int
CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
__u16 fid)
@@ -3981,6 +4148,12 @@ findFirstRetry:
pSMB->FileName[name_len] = 0;
pSMB->FileName[name_len+1] = 0;
name_len += 2;
+ } else if (!searchName[0]) {
+ pSMB->FileName[0] = CIFS_DIR_SEP(cifs_sb);
+ pSMB->FileName[1] = 0;
+ pSMB->FileName[2] = 0;
+ pSMB->FileName[3] = 0;
+ name_len = 4;
}
} else {
name_len = copy_path_name(pSMB->FileName, searchName);
@@ -3992,6 +4165,10 @@ findFirstRetry:
pSMB->FileName[name_len] = '*';
pSMB->FileName[name_len+1] = 0;
name_len += 2;
+ } else if (!searchName[0]) {
+ pSMB->FileName[0] = CIFS_DIR_SEP(cifs_sb);
+ pSMB->FileName[1] = 0;
+ name_len = 2;
}
}
@@ -4018,7 +4195,7 @@ findFirstRetry:
pSMB->SearchAttributes =
cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM |
ATTR_DIRECTORY);
- pSMB->SearchCount = cpu_to_le16(CIFSMaxBufSize/sizeof(FILE_UNIX_INFO));
+ pSMB->SearchCount = cpu_to_le16(msearch ? CIFSMaxBufSize/sizeof(FILE_UNIX_INFO) : 1);
pSMB->SearchFlags = cpu_to_le16(search_flags);
pSMB->InformationLevel = cpu_to_le16(psrch_inf->info_level);
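
The DataOffset arithmetic in cifs_create_reparse_inode() above, spelled
out: SMB1 NT Transact offsets are measured from the start of the SMB
header, which begins right after the 4-byte RFC1001 length field, hence
the subtraction:

    io_req->DataOffset = cpu_to_le32(offsetof(typeof(*io_req), Data) -
                                     sizeof(io_req->hdr.smb_buf_length));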
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 6bf04d9a5491..5eec8957f2a9 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -97,7 +97,7 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
return rc;
}
-static void smb2_query_server_interfaces(struct work_struct *work)
+void smb2_query_server_interfaces(struct work_struct *work)
{
int rc;
int xid;
@@ -116,18 +116,22 @@ static void smb2_query_server_interfaces(struct work_struct *work)
rc = server->ops->query_server_interfaces(xid, tcon, false);
free_xid(xid);
- if (rc) {
- if (rc == -EOPNOTSUPP)
- return;
-
+ if (rc)
cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n",
__func__, rc);
- }
queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
(SMB_INTERFACE_POLL_INTERVAL * HZ));
}
+#define set_need_reco(server) \
+do { \
+ spin_lock(&server->srv_lock); \
+ if (server->tcpStatus != CifsExiting) \
+ server->tcpStatus = CifsNeedReconnect; \
+ spin_unlock(&server->srv_lock); \
+} while (0)
+
/*
* Update the tcpStatus for the server.
* This is used to signal the cifsd thread to call cifs_reconnect
@@ -141,39 +145,45 @@ void
cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server,
bool all_channels)
{
- struct TCP_Server_Info *pserver;
+ struct TCP_Server_Info *nserver;
struct cifs_ses *ses;
+ LIST_HEAD(reco);
int i;
- /* If server is a channel, select the primary channel */
- pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
-
/* if we need to signal just this channel */
if (!all_channels) {
- spin_lock(&server->srv_lock);
- if (server->tcpStatus != CifsExiting)
- server->tcpStatus = CifsNeedReconnect;
- spin_unlock(&server->srv_lock);
+ set_need_reco(server);
return;
}
- spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
- if (cifs_ses_exiting(ses))
- continue;
- spin_lock(&ses->chan_lock);
- for (i = 0; i < ses->chan_count; i++) {
- if (!ses->chans[i].server)
+ if (SERVER_IS_CHAN(server))
+ server = server->primary_server;
+ scoped_guard(spinlock, &cifs_tcp_ses_lock) {
+ set_need_reco(server);
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ spin_lock(&ses->ses_lock);
+ if (ses->ses_status == SES_EXITING) {
+ spin_unlock(&ses->ses_lock);
continue;
-
- spin_lock(&ses->chans[i].server->srv_lock);
- if (ses->chans[i].server->tcpStatus != CifsExiting)
- ses->chans[i].server->tcpStatus = CifsNeedReconnect;
- spin_unlock(&ses->chans[i].server->srv_lock);
+ }
+ spin_lock(&ses->chan_lock);
+ for (i = 1; i < ses->chan_count; i++) {
+ nserver = ses->chans[i].server;
+ if (!nserver)
+ continue;
+ nserver->srv_count++;
+ list_add(&nserver->rlist, &reco);
+ }
+ spin_unlock(&ses->chan_lock);
+ spin_unlock(&ses->ses_lock);
}
- spin_unlock(&ses->chan_lock);
}
- spin_unlock(&cifs_tcp_ses_lock);
+
+ list_for_each_entry_safe(server, nserver, &reco, rlist) {
+ list_del_init(&server->rlist);
+ set_need_reco(server);
+ cifs_put_tcp_session(server, 0);
+ }
}
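
The rewritten function works in two phases: under cifs_tcp_ses_lock it only pins each secondary-channel server (srv_count++) and collects it on a private list; the per-server srv_lock is then taken after the global lock is dropped, so the locks never nest. A simplified kernel-style sketch of that collect-then-act pattern, using stand-in types rather than the real cifs structures:

    #include <linux/list.h>
    #include <linux/spinlock.h>

    struct conn {
        struct list_head global;    /* linkage on the long-lived list */
        struct list_head rlist;     /* scratch linkage, like server->rlist */
        int refcount;
        bool need_reconnect;
    };

    static void mark_all(struct list_head *conns, spinlock_t *lock)
    {
        struct conn *c, *tmp;
        LIST_HEAD(todo);

        /* phase 1: pin and collect while the global lock is held */
        spin_lock(lock);
        list_for_each_entry(c, conns, global) {
            c->refcount++;              /* entry cannot vanish after unlock */
            list_add(&c->rlist, &todo);
        }
        spin_unlock(lock);

        /* phase 2: per-entry work without nesting inside the global lock */
        list_for_each_entry_safe(c, tmp, &todo, rlist) {
            list_del_init(&c->rlist);
            c->need_reconnect = true;   /* stands in for set_need_reco() */
            c->refcount--;              /* stands in for cifs_put_tcp_session() */
        }
    }
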
/*
@@ -377,6 +387,13 @@ static int __cifs_reconnect(struct TCP_Server_Info *server,
if (!cifs_tcp_ses_needs_reconnect(server, 1))
return 0;
+ /*
+ * if the smb session has been marked for reconnect, also reconnect all
+ * its connections so that none of them is left in a bad state.
+ */
+ if (mark_smb_session)
+ cifs_signal_cifsd_for_reconnect(server, mark_smb_session);
+
cifs_mark_tcp_ses_conns_for_reconnect(server, mark_smb_session);
cifs_abort_connection(server);
@@ -385,7 +402,8 @@ static int __cifs_reconnect(struct TCP_Server_Info *server,
try_to_freeze();
cifs_server_lock(server);
- if (!cifs_swn_set_server_dstaddr(server)) {
+ if (!cifs_swn_set_server_dstaddr(server) &&
+ !SERVER_IS_CHAN(server)) {
/* resolve the hostname again to make sure that IP address is up-to-date */
rc = reconn_set_ipaddr_from_hostname(server);
cifs_dbg(FYI, "%s: reconn_set_ipaddr_from_hostname: rc=%d\n", __func__, rc);
@@ -661,12 +679,12 @@ server_unresponsive(struct TCP_Server_Info *server)
/*
* If we're in the process of mounting a share or reconnecting a session
* and the server abruptly shut down (e.g. socket wasn't closed, packet
- * had been ACK'ed but no SMB response), don't wait longer than 20s to
- * negotiate protocol.
+ * had been ACK'ed but no SMB response), don't wait longer than 20s from
+ * when negotiate actually started.
*/
spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsInNegotiate &&
- time_after(jiffies, server->lstrp + 20 * HZ)) {
+ time_after(jiffies, server->neg_start + 20 * HZ)) {
spin_unlock(&server->srv_lock);
cifs_reconnect(server, false);
return true;
@@ -2862,20 +2880,14 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
tcon->max_cached_dirs = ctx->max_cached_dirs;
tcon->nodelete = ctx->nodelete;
tcon->local_lease = ctx->local_lease;
- INIT_LIST_HEAD(&tcon->pending_opens);
tcon->status = TID_GOOD;
- INIT_DELAYED_WORK(&tcon->query_interfaces,
- smb2_query_server_interfaces);
if (ses->server->dialect >= SMB30_PROT_ID &&
(ses->server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
/* schedule query interfaces poll */
queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
(SMB_INTERFACE_POLL_INTERVAL * HZ));
}
-#ifdef CONFIG_CIFS_DFS_UPCALL
- INIT_DELAYED_WORK(&tcon->dfs_cache_work, dfs_cache_refresh);
-#endif
spin_lock(&cifs_tcp_ses_lock);
list_add(&tcon->tcon_list, &ses->tcon_list);
spin_unlock(&cifs_tcp_ses_lock);
@@ -3350,18 +3362,15 @@ generic_ip_connect(struct TCP_Server_Info *server)
struct net *net = cifs_net_ns(server);
struct sock *sk;
- rc = __sock_create(net, sfamily, SOCK_STREAM,
- IPPROTO_TCP, &server->ssocket, 1);
+ rc = sock_create_kern(net, sfamily, SOCK_STREAM,
+ IPPROTO_TCP, &server->ssocket);
if (rc < 0) {
cifs_server_dbg(VFS, "Error %d creating socket\n", rc);
return rc;
}
sk = server->ssocket->sk;
- __netns_tracker_free(net, &sk->ns_tracker, false);
- sk->sk_net_refcnt = 1;
- get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
- sock_inuse_add(net, 1);
+ sk_net_refcnt_upgrade(sk);
/* BB other socket options to set KEEPALIVE, NODELAY? */
cifs_dbg(FYI, "Socket created\n");
@@ -3714,9 +3723,15 @@ int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx)
goto out;
}
- /* if new SMB3.11 POSIX extensions are supported do not remap / and \ */
- if (tcon->posix_extensions)
+ /*
+ * if new SMB3.11 POSIX extensions are supported, do not change anything in the
+ * path (i.e., do not remap / and \ and do not map any special characters)
+ */
+ if (tcon->posix_extensions) {
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS;
+ cifs_sb->mnt_cifs_flags &= ~(CIFS_MOUNT_MAP_SFM_CHR |
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+ }
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
/* tell server which Unix caps we support */
@@ -4189,7 +4204,9 @@ retry:
return 0;
}
+ server->lstrp = jiffies;
server->tcpStatus = CifsInNegotiate;
+ server->neg_start = jiffies;
spin_unlock(&server->srv_lock);
rc = server->ops->negotiate(xid, ses, server);
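
Both timestamps are stamped on entry to CifsInNegotiate: lstrp keeps its usual "last response" meaning, while the new neg_start records when negotiation began, so the 20s watchdog in server_unresponsive() is anchored to the start of negotiation rather than to the last server response. A tiny sketch of the check, assuming the kernel jiffies helpers:

    #include <linux/jiffies.h>

    /* mirrors: time_after(jiffies, server->neg_start + 20 * HZ) */
    static bool negotiate_timed_out(unsigned long neg_start)
    {
        return time_after(jiffies, neg_start + 20 * HZ);
    }
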
diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c
index d1e95632ac54..5223edf6d11a 100644
--- a/fs/smb/client/dir.c
+++ b/fs/smb/client/dir.c
@@ -23,6 +23,7 @@
#include "fs_context.h"
#include "cifs_ioctl.h"
#include "fscache.h"
+#include "cached_dir.h"
static void
renew_parental_timestamps(struct dentry *direntry)
@@ -189,7 +190,9 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned
int disposition;
struct TCP_Server_Info *server = tcon->ses->server;
struct cifs_open_parms oparms;
+ struct cached_fid *parent_cfid = NULL;
int rdwr_for_fscache = 0;
+ __le32 lease_flags = 0;
*oplock = 0;
if (tcon->ses->server->oplocks)
@@ -311,7 +314,28 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned
if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
create_options |= CREATE_OPTION_READONLY;
+
retry_open:
+ if (tcon->cfids && direntry->d_parent && server->dialect >= SMB30_PROT_ID) {
+ parent_cfid = NULL;
+ spin_lock(&tcon->cfids->cfid_list_lock);
+ list_for_each_entry(parent_cfid, &tcon->cfids->entries, entry) {
+ if (parent_cfid->dentry == direntry->d_parent) {
+ cifs_dbg(FYI, "found a parent cached file handle\n");
+ if (parent_cfid->has_lease && parent_cfid->time) {
+ lease_flags
+ |= SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE;
+ memcpy(fid->parent_lease_key,
+ parent_cfid->fid.lease_key,
+ SMB2_LEASE_KEY_SIZE);
+ parent_cfid->dirents.is_valid = false;
+ }
+ break;
+ }
+ }
+ spin_unlock(&tcon->cfids->cfid_list_lock);
+ }
+
oparms = (struct cifs_open_parms) {
.tcon = tcon,
.cifs_sb = cifs_sb,
@@ -320,6 +344,7 @@ retry_open:
.disposition = disposition,
.path = full_path,
.fid = fid,
+ .lease_flags = lease_flags,
.mode = mode,
};
rc = server->ops->open(xid, &oparms, oplock, buf);
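
When a cached handle for the parent directory exists, the open advertises the relationship by setting the parent-lease-key flag and copying the parent's lease key into the fid; the parent's cached dirents are invalidated since the create changes the listing. A sketch of how such a flag and key end up in an SMB3 lease v2 create context (simplified layout; the constant is an assumed stand-in for SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE, and the real fill happens in smb3_create_lease_buf later in this patch):

    #include <stdint.h>
    #include <string.h>

    #define LEASE_KEY_SIZE  16
    #define PARENT_KEY_SET  0x00000004  /* assumed value, for illustration */

    struct lease_ctx_v2_sketch {
        uint8_t  LeaseKey[LEASE_KEY_SIZE];
        uint32_t LeaseState;
        uint32_t LeaseFlags;
        uint8_t  ParentLeaseKey[LEASE_KEY_SIZE];
    };

    static void fill_lease_ctx(struct lease_ctx_v2_sketch *ctx,
                               const uint8_t *key, const uint8_t *parent_key,
                               uint32_t flags)
    {
        memcpy(ctx->LeaseKey, key, LEASE_KEY_SIZE);
        ctx->LeaseFlags = flags;
        if (flags & PARENT_KEY_SET)
            memcpy(ctx->ParentLeaseKey, parent_key, LEASE_KEY_SIZE);
    }
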
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 950aa4f912f5..186e061068be 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -52,6 +52,7 @@ static void cifs_prepare_write(struct netfs_io_subrequest *subreq)
struct netfs_io_stream *stream = &req->rreq.io_streams[subreq->stream_nr];
struct TCP_Server_Info *server;
struct cifsFileInfo *open_file = req->cfile;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(wdata->rreq->inode->i_sb);
size_t wsize = req->rreq.wsize;
int rc;
@@ -63,6 +64,10 @@ static void cifs_prepare_write(struct netfs_io_subrequest *subreq)
server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses);
wdata->server = server;
+ if (cifs_sb->ctx->wsize == 0)
+ cifs_negotiate_wsize(server, cifs_sb->ctx,
+ tlink_tcon(req->cfile->tlink));
+
retry:
if (open_file->invalidHandle) {
rc = cifs_reopen_file(open_file, false);
@@ -130,7 +135,7 @@ fail:
else
trace_netfs_sreq(subreq, netfs_sreq_trace_fail);
add_credits_and_wake_if(wdata->server, &wdata->credits, 0);
- cifs_write_subrequest_terminated(wdata, rc, false);
+ cifs_write_subrequest_terminated(wdata, rc);
goto out;
}
@@ -160,10 +165,9 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq)
server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses);
rdata->server = server;
- if (cifs_sb->ctx->rsize == 0) {
+ if (cifs_sb->ctx->rsize == 0)
cifs_negotiate_rsize(server, cifs_sb->ctx,
tlink_tcon(req->cfile->tlink));
- }
rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize,
&size, &rdata->credits);
@@ -219,7 +223,8 @@ static void cifs_issue_read(struct netfs_io_subrequest *subreq)
goto failed;
}
- if (subreq->rreq->origin != NETFS_DIO_READ)
+ if (subreq->rreq->origin != NETFS_UNBUFFERED_READ &&
+ subreq->rreq->origin != NETFS_DIO_READ)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
@@ -998,15 +1003,18 @@ int cifs_open(struct inode *inode, struct file *file)
rc = cifs_get_readable_path(tcon, full_path, &cfile);
}
if (rc == 0) {
- if (file->f_flags == cfile->f_flags) {
+ unsigned int oflags = file->f_flags & ~(O_CREAT|O_EXCL|O_TRUNC);
+ unsigned int cflags = cfile->f_flags & ~(O_CREAT|O_EXCL|O_TRUNC);
+
+ if (cifs_convert_flags(oflags, 0) == cifs_convert_flags(cflags, 0) &&
+ (oflags & (O_SYNC|O_DIRECT)) == (cflags & (O_SYNC|O_DIRECT))) {
file->private_data = cfile;
spin_lock(&CIFS_I(inode)->deferred_lock);
cifs_del_deferred_close(cfile);
spin_unlock(&CIFS_I(inode)->deferred_lock);
goto use_cache;
- } else {
- _cifsFileInfo_put(cfile, true, false);
}
+ _cifsFileInfo_put(cfile, true, false);
} else {
/* hard link on the deferred close file */
rc = cifs_get_hardlink_path(tcon, inode, file);
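
The handle-reuse test now masks off flags that only matter at creation time: an already-open handle is unaffected by O_CREAT, O_EXCL or O_TRUNC, but O_SYNC and O_DIRECT change I/O semantics and must still match. A standalone illustration of the masking (userspace C; the real code additionally compares cifs_convert_flags() of both values):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdbool.h>

    static bool can_reuse_handle(unsigned int new_flags, unsigned int cached_flags)
    {
        const unsigned int creation_only = O_CREAT | O_EXCL | O_TRUNC;
        const unsigned int must_match = O_SYNC | O_DIRECT;

        new_flags &= ~creation_only;
        cached_flags &= ~creation_only;
        return (new_flags & must_match) == (cached_flags & must_match);
    }
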
@@ -2423,8 +2431,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
return rc;
}
-void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result,
- bool was_async)
+void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result)
{
struct netfs_io_request *wreq = wdata->rreq;
struct netfs_inode *ictx = netfs_inode(wreq->inode);
@@ -2441,7 +2448,7 @@ void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t
netfs_resize_file(ictx, wrend, true);
}
- netfs_write_subrequest_terminated(&wdata->subreq, result, was_async);
+ netfs_write_subrequest_terminated(&wdata->subreq, result);
}
struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
@@ -2992,38 +2999,38 @@ static const struct vm_operations_struct cifs_file_vm_ops = {
.page_mkwrite = cifs_page_mkwrite,
};
-int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
+int cifs_file_strict_mmap_prepare(struct vm_area_desc *desc)
{
int xid, rc = 0;
- struct inode *inode = file_inode(file);
+ struct inode *inode = file_inode(desc->file);
xid = get_xid();
if (!CIFS_CACHE_READ(CIFS_I(inode)))
rc = cifs_zap_mapping(inode);
if (!rc)
- rc = generic_file_mmap(file, vma);
+ rc = generic_file_mmap_prepare(desc);
if (!rc)
- vma->vm_ops = &cifs_file_vm_ops;
+ desc->vm_ops = &cifs_file_vm_ops;
free_xid(xid);
return rc;
}
-int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
+int cifs_file_mmap_prepare(struct vm_area_desc *desc)
{
int rc, xid;
xid = get_xid();
- rc = cifs_revalidate_file(file);
+ rc = cifs_revalidate_file(desc->file);
if (rc)
cifs_dbg(FYI, "Validation prior to mmap failed, error=%d\n",
rc);
if (!rc)
- rc = generic_file_mmap(file, vma);
+ rc = generic_file_mmap_prepare(desc);
if (!rc)
- vma->vm_ops = &cifs_file_vm_ops;
+ desc->vm_ops = &cifs_file_vm_ops;
free_xid(xid);
return rc;
@@ -3081,7 +3088,8 @@ void cifs_oplock_break(struct work_struct *work)
struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
oplock_break);
struct inode *inode = d_inode(cfile->dentry);
- struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct super_block *sb = inode->i_sb;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifsInodeInfo *cinode = CIFS_I(inode);
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
@@ -3091,6 +3099,12 @@ void cifs_oplock_break(struct work_struct *work)
__u64 persistent_fid, volatile_fid;
__u16 net_fid;
+ /*
+ * Hold a reference to the superblock to prevent it and its inodes from
+ * being freed while we are accessing cinode. Otherwise, _cifsFileInfo_put()
+ * may release the last reference to the sb and trigger inode eviction.
+ */
+ cifs_sb_active(sb);
wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS,
TASK_UNINTERRUPTIBLE);
@@ -3163,6 +3177,7 @@ oplock_break_ack:
cifs_put_tlink(tlink);
out:
cifs_done_oplock_break(cinode);
+ cifs_sb_deactive(sb);
}
static int cifs_swap_activate(struct swap_info_struct *sis,
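
The work item pins the superblock for its whole lifetime because dropping the last file reference inside the handler could otherwise tear down the sb, and with it the inode backing cinode, while the handler is still running. A schematic of the pin/unpin pattern with a stand-in counter:

    struct object { int active; };

    /* every exit path must unpin, mirroring the single 'out:' label above */
    static void async_handler(struct object *obj)
    {
        obj->active++;      /* like cifs_sb_active(sb) */
        /* ... work that may drop the last file reference ... */
        obj->active--;      /* like cifs_sb_deactive(sb) */
    }
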
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index a634a34d4086..3f34bb07997b 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -1475,35 +1475,21 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
pr_warn("username too long\n");
goto cifs_parse_mount_err;
}
- ctx->username = kstrdup(param->string, GFP_KERNEL);
- if (ctx->username == NULL) {
- cifs_errorf(fc, "OOM when copying username string\n");
- goto cifs_parse_mount_err;
- }
+ ctx->username = no_free_ptr(param->string);
break;
case Opt_pass:
kfree_sensitive(ctx->password);
ctx->password = NULL;
if (strlen(param->string) == 0)
break;
-
- ctx->password = kstrdup(param->string, GFP_KERNEL);
- if (ctx->password == NULL) {
- cifs_errorf(fc, "OOM when copying password string\n");
- goto cifs_parse_mount_err;
- }
+ ctx->password = no_free_ptr(param->string);
break;
case Opt_pass2:
kfree_sensitive(ctx->password2);
ctx->password2 = NULL;
if (strlen(param->string) == 0)
break;
-
- ctx->password2 = kstrdup(param->string, GFP_KERNEL);
- if (ctx->password2 == NULL) {
- cifs_errorf(fc, "OOM when copying password2 string\n");
- goto cifs_parse_mount_err;
- }
+ ctx->password2 = no_free_ptr(param->string);
break;
case Opt_ip:
if (strlen(param->string) == 0) {
@@ -1526,11 +1512,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
}
kfree(ctx->domainname);
- ctx->domainname = kstrdup(param->string, GFP_KERNEL);
- if (ctx->domainname == NULL) {
- cifs_errorf(fc, "OOM when copying domainname string\n");
- goto cifs_parse_mount_err;
- }
+ ctx->domainname = no_free_ptr(param->string);
cifs_dbg(FYI, "Domain name set\n");
break;
case Opt_srcaddr:
@@ -1550,11 +1532,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
if (strncasecmp(param->string, "default", 7) != 0) {
kfree(ctx->iocharset);
- ctx->iocharset = kstrdup(param->string, GFP_KERNEL);
- if (ctx->iocharset == NULL) {
- cifs_errorf(fc, "OOM when copying iocharset string\n");
- goto cifs_parse_mount_err;
- }
+ ctx->iocharset = no_free_ptr(param->string);
}
/* if iocharset not set then load_nls_default
* is used by caller
@@ -1824,10 +1802,14 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
cifs_errorf(fc, "symlinkroot mount options must be absolute path\n");
goto cifs_parse_mount_err;
}
- kfree(ctx->symlinkroot);
- ctx->symlinkroot = kstrdup(param->string, GFP_KERNEL);
- if (!ctx->symlinkroot)
+ if (strnlen(param->string, PATH_MAX) == PATH_MAX) {
+ cifs_errorf(fc, "symlinkroot path too long (max path length: %u)\n",
+ PATH_MAX - 1);
goto cifs_parse_mount_err;
+ }
+ kfree(ctx->symlinkroot);
+ ctx->symlinkroot = param->string;
+ param->string = NULL;
break;
}
/* case Opt_ignore: - is ignored as expected ... */
@@ -1837,13 +1819,6 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
goto cifs_parse_mount_err;
}
- /*
- * By default resolve all native absolute symlinks relative to "/mnt/".
- * Same default has drvfs driver running in WSL for resolving SMB shares.
- */
- if (!ctx->symlinkroot)
- ctx->symlinkroot = kstrdup("/mnt/", GFP_KERNEL);
-
return 0;
cifs_parse_mount_err:
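
All of these option handlers now steal the string that the fs_context layer has already allocated instead of duplicating it, which removes the kstrdup() OOM error paths; no_free_ptr() nulls the source so the generic option cleanup will not free it. A plain-C stand-in for that ownership transfer:

    #include <stddef.h>

    static char *steal_string(char **src)
    {
        char *s = *src;

        *src = NULL;    /* generic option cleanup now sees nothing to free */
        return s;
    }
    /* usage sketch: ctx->username = steal_string(&param->string); */
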
diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c
index 56439da4f119..0a9935ce05a5 100644
--- a/fs/smb/client/ioctl.c
+++ b/fs/smb/client/ioctl.c
@@ -506,7 +506,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
le16_to_cpu(tcon->ses->server->cipher_type);
pkey_inf.Suid = tcon->ses->Suid;
memcpy(pkey_inf.auth_key, tcon->ses->auth_key.response,
- 16 /* SMB2_NTLMV2_SESSKEY_SIZE */);
+ SMB2_NTLMV2_SESSKEY_SIZE);
memcpy(pkey_inf.smb3decryptionkey,
tcon->ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE);
memcpy(pkey_inf.smb3encryptionkey,
diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c
index 769752ad2c5c..2ecd705e9e8c 100644
--- a/fs/smb/client/link.c
+++ b/fs/smb/client/link.c
@@ -19,6 +19,7 @@
#include "smb2proto.h"
#include "cifs_ioctl.h"
#include "fs_context.h"
+#include "reparse.h"
/*
* M-F Symlink Functions - Begin
@@ -570,7 +571,6 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode,
int rc = -EOPNOTSUPP;
unsigned int xid;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
- struct TCP_Server_Info *server;
struct tcon_link *tlink;
struct cifs_tcon *pTcon;
const char *full_path;
@@ -593,7 +593,6 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode,
goto symlink_exit;
}
pTcon = tlink_tcon(tlink);
- server = cifs_pick_channel(pTcon->ses);
full_path = build_path_from_dentry(direntry, page);
if (IS_ERR(full_path)) {
@@ -643,13 +642,9 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode,
case CIFS_SYMLINK_TYPE_NATIVE:
case CIFS_SYMLINK_TYPE_NFS:
case CIFS_SYMLINK_TYPE_WSL:
- if (server->ops->create_reparse_symlink &&
- (le32_to_cpu(pTcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS)) {
- rc = server->ops->create_reparse_symlink(xid, inode,
- direntry,
- pTcon,
- full_path,
- symname);
+ if (le32_to_cpu(pTcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS) {
+ rc = create_reparse_symlink(xid, inode, direntry, pTcon,
+ full_path, symname);
goto symlink_exit;
}
break;
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index 7b6ed9b23e71..da23cc12a52c 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -151,6 +151,12 @@ tcon_info_alloc(bool dir_leases_enabled, enum smb3_tcon_ref_trace trace)
#ifdef CONFIG_CIFS_DFS_UPCALL
INIT_LIST_HEAD(&ret_buf->dfs_ses_list);
#endif
+ INIT_LIST_HEAD(&ret_buf->pending_opens);
+ INIT_DELAYED_WORK(&ret_buf->query_interfaces,
+ smb2_query_server_interfaces);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ INIT_DELAYED_WORK(&ret_buf->dfs_cache_work, dfs_cache_refresh);
+#endif
return ret_buf;
}
@@ -326,6 +332,14 @@ check_smb_hdr(struct smb_hdr *smb)
if (smb->Command == SMB_COM_LOCKING_ANDX)
return 0;
+ /*
+ * Windows NT server returns an error response (e.g. STATUS_DELETE_PENDING,
+ * STATUS_OBJECT_NAME_NOT_FOUND, ERRDOS/ERRbadfile or any other) for some
+ * TRANS2 requests without the RESPONSE flag set in the header.
+ */
+ if (smb->Command == SMB_COM_TRANSACTION2 && smb->Status.CifsError != 0)
+ return 0;
+
cifs_dbg(VFS, "Server sent request, not response. mid=%u\n",
get_mid(smb));
return 1;
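
The quirk check accepts a TRANS2 packet carrying a non-zero error status as a response even when the server omitted the response flag. A standalone sketch, with hypothetical constants standing in for SMB_COM_TRANSACTION2 and the header response flag:

    #include <stdbool.h>
    #include <stdint.h>

    #define CMD_TRANS2    0x32  /* assumed value, for illustration */
    #define FLG_RESPONSE  0x80  /* assumed value, for illustration */

    static bool looks_like_response(uint8_t cmd, uint8_t flags, uint32_t status)
    {
        if (flags & FLG_RESPONSE)
            return true;
        /* Windows NT quirk: an error status implies a response anyway */
        return cmd == CMD_TRANS2 && status != 0;
    }
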
diff --git a/fs/smb/client/namespace.c b/fs/smb/client/namespace.c
index 778daf11f1db..52a520349cb7 100644
--- a/fs/smb/client/namespace.c
+++ b/fs/smb/client/namespace.c
@@ -146,6 +146,9 @@ static char *automount_fullpath(struct dentry *dentry, void *page)
}
spin_unlock(&tcon->tc_lock);
+ if (unlikely(!page))
+ return ERR_PTR(-ENOMEM);
+
s = dentry_path_raw(dentry, page, PATH_MAX);
if (IS_ERR(s))
return s;
diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c
index f9f11cbf89be..4e5460206397 100644
--- a/fs/smb/client/readdir.c
+++ b/fs/smb/client/readdir.c
@@ -264,7 +264,7 @@ cifs_posix_to_fattr(struct cifs_fattr *fattr, struct smb2_posix_info *info,
/* The Mode field in the response can now include the file type as well */
fattr->cf_mode = wire_mode_to_posix(le32_to_cpu(info->Mode),
fattr->cf_cifsattrs & ATTR_DIRECTORY);
- fattr->cf_dtype = S_DT(le32_to_cpu(info->Mode));
+ fattr->cf_dtype = S_DT(fattr->cf_mode);
switch (fattr->cf_mode & S_IFMT) {
case S_IFLNK:
@@ -851,9 +851,9 @@ static bool emit_cached_dirents(struct cached_dirents *cde,
}
static void update_cached_dirents_count(struct cached_dirents *cde,
- struct dir_context *ctx)
+ struct file *file)
{
- if (cde->ctx != ctx)
+ if (cde->file != file)
return;
if (cde->is_valid || cde->is_failed)
return;
@@ -862,9 +862,9 @@ static void update_cached_dirents_count(struct cached_dirents *cde,
}
static void finished_cached_dirents_count(struct cached_dirents *cde,
- struct dir_context *ctx)
+ struct dir_context *ctx, struct file *file)
{
- if (cde->ctx != ctx)
+ if (cde->file != file)
return;
if (cde->is_valid || cde->is_failed)
return;
@@ -877,11 +877,12 @@ static void finished_cached_dirents_count(struct cached_dirents *cde,
static void add_cached_dirent(struct cached_dirents *cde,
struct dir_context *ctx,
const char *name, int namelen,
- struct cifs_fattr *fattr)
+ struct cifs_fattr *fattr,
+ struct file *file)
{
struct cached_dirent *de;
- if (cde->ctx != ctx)
+ if (cde->file != file)
return;
if (cde->is_valid || cde->is_failed)
return;
@@ -911,7 +912,8 @@ static void add_cached_dirent(struct cached_dirents *cde,
static bool cifs_dir_emit(struct dir_context *ctx,
const char *name, int namelen,
struct cifs_fattr *fattr,
- struct cached_fid *cfid)
+ struct cached_fid *cfid,
+ struct file *file)
{
bool rc;
ino_t ino = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
@@ -923,7 +925,7 @@ static bool cifs_dir_emit(struct dir_context *ctx,
if (cfid) {
mutex_lock(&cfid->dirents.de_mutex);
add_cached_dirent(&cfid->dirents, ctx, name, namelen,
- fattr);
+ fattr, file);
mutex_unlock(&cfid->dirents.de_mutex);
}
@@ -1023,7 +1025,7 @@ static int cifs_filldir(char *find_entry, struct file *file,
cifs_prime_dcache(file_dentry(file), &name, &fattr);
return !cifs_dir_emit(ctx, name.name, name.len,
- &fattr, cfid);
+ &fattr, cfid, file);
}
@@ -1074,8 +1076,8 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
* we need to initialize scanning and storing the
* directory content.
*/
- if (ctx->pos == 0 && cfid->dirents.ctx == NULL) {
- cfid->dirents.ctx = ctx;
+ if (ctx->pos == 0 && cfid->dirents.file == NULL) {
+ cfid->dirents.file = file;
cfid->dirents.pos = 2;
}
/*
@@ -1143,7 +1145,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
} else {
if (cfid) {
mutex_lock(&cfid->dirents.de_mutex);
- finished_cached_dirents_count(&cfid->dirents, ctx);
+ finished_cached_dirents_count(&cfid->dirents, ctx, file);
mutex_unlock(&cfid->dirents.de_mutex);
}
cifs_dbg(FYI, "Could not find entry\n");
@@ -1184,7 +1186,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
ctx->pos++;
if (cfid) {
mutex_lock(&cfid->dirents.de_mutex);
- update_cached_dirents_count(&cfid->dirents, ctx);
+ update_cached_dirents_count(&cfid->dirents, file);
mutex_unlock(&cfid->dirents.de_mutex);
}
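
The in-progress directory cache is now owned by the struct file instead of the dir_context, presumably because the context is a short-lived per-call object whose address can be reused across unrelated readdir calls, while the file identifies one open directory stream. A reduced sketch of the ownership test (simplified fields):

    #include <stdbool.h>

    struct file;                /* opaque, as in the kernel */

    struct cached_dirents_sketch {
        bool is_valid, is_failed;
        struct file *file;      /* owner of the in-progress scan */
    };

    static bool owns_scan(const struct cached_dirents_sketch *cde,
                          const struct file *file)
    {
        return cde->file == file && !cde->is_valid && !cde->is_failed;
    }
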
diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c
index bb25e77c5540..33c1d970747c 100644
--- a/fs/smb/client/reparse.c
+++ b/fs/smb/client/reparse.c
@@ -34,7 +34,7 @@ static int detect_directory_symlink_target(struct cifs_sb_info *cifs_sb,
const char *symname,
bool *directory);
-int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
+int create_reparse_symlink(const unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
const char *full_path, const char *symname)
{
@@ -57,6 +57,7 @@ static int create_native_symlink(const unsigned int xid, struct inode *inode,
struct reparse_symlink_data_buffer *buf = NULL;
struct cifs_open_info_data data = {};
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ const char *symroot = cifs_sb->ctx->symlinkroot;
struct inode *new;
struct kvec iov;
__le16 *path = NULL;
@@ -82,7 +83,8 @@ static int create_native_symlink(const unsigned int xid, struct inode *inode,
.symlink_target = symlink_target,
};
- if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && symname[0] == '/') {
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) &&
+ symroot && symname[0] == '/') {
/*
* This is a request to create an absolute symlink on the server
* which does not support POSIX paths, and expects symlink in
@@ -92,7 +94,7 @@ static int create_native_symlink(const unsigned int xid, struct inode *inode,
* ensure compatibility of this symlink stored in absolute form
* on the SMB server.
*/
- if (!strstarts(symname, cifs_sb->ctx->symlinkroot)) {
+ if (!strstarts(symname, symroot)) {
/*
* If the absolute Linux symlink target path is not
* inside "symlinkroot" location then there is no way
@@ -101,12 +103,12 @@ static int create_native_symlink(const unsigned int xid, struct inode *inode,
cifs_dbg(VFS,
"absolute symlink '%s' cannot be converted to NT format "
"because it is outside of symlinkroot='%s'\n",
- symname, cifs_sb->ctx->symlinkroot);
+ symname, symroot);
rc = -EINVAL;
goto out;
}
- len = strlen(cifs_sb->ctx->symlinkroot);
- if (cifs_sb->ctx->symlinkroot[len-1] != '/')
+ len = strlen(symroot);
+ if (symroot[len - 1] != '/')
len++;
if (symname[len] >= 'a' && symname[len] <= 'z' &&
(symname[len+1] == '/' || symname[len+1] == '\0')) {
@@ -225,7 +227,8 @@ static int create_native_symlink(const unsigned int xid, struct inode *inode,
iov.iov_base = buf;
iov.iov_len = len;
- new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
+ new = tcon->ses->server->ops->create_reparse_inode(
+ &data, inode->i_sb, xid,
tcon, full_path, directory,
&iov, NULL);
if (!IS_ERR(new))
@@ -397,7 +400,8 @@ static int create_native_socket(const unsigned int xid, struct inode *inode,
struct inode *new;
int rc = 0;
- new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
+ new = tcon->ses->server->ops->create_reparse_inode(
+ &data, inode->i_sb, xid,
tcon, full_path, false, &iov, NULL);
if (!IS_ERR(new))
d_instantiate(dentry, new);
@@ -490,7 +494,8 @@ static int mknod_nfs(unsigned int xid, struct inode *inode,
.symlink_target = kstrdup(symname, GFP_KERNEL),
};
- new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
+ new = tcon->ses->server->ops->create_reparse_inode(
+ &data, inode->i_sb, xid,
tcon, full_path, false, &iov, NULL);
if (!IS_ERR(new))
d_instantiate(dentry, new);
@@ -683,7 +688,8 @@ static int mknod_wsl(unsigned int xid, struct inode *inode,
memcpy(data.wsl.eas, &cc->ea, len);
data.wsl.eas_len = len;
- new = smb2_get_reparse_inode(&data, inode->i_sb,
+ new = tcon->ses->server->ops->create_reparse_inode(
+ &data, inode->i_sb,
xid, tcon, full_path, false,
&reparse_iov, &xattr_iov);
if (!IS_ERR(new))
@@ -696,7 +702,7 @@ static int mknod_wsl(unsigned int xid, struct inode *inode,
return rc;
}
-int smb2_mknod_reparse(unsigned int xid, struct inode *inode,
+int mknod_reparse(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
const char *full_path, umode_t mode, dev_t dev)
{
@@ -782,6 +788,7 @@ int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len,
const char *full_path,
struct cifs_sb_info *cifs_sb)
{
+ const char *symroot = cifs_sb->ctx->symlinkroot;
char sep = CIFS_DIR_SEP(cifs_sb);
char *linux_target = NULL;
char *smb_target = NULL;
@@ -815,7 +822,8 @@ int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len,
goto out;
}
- if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && !relative) {
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) &&
+ symroot && !relative) {
/*
* This is an absolute symlink from the server which does not
* support POSIX paths, so the symlink is in NT-style path.
@@ -875,15 +883,8 @@ globalroot:
abs_path += sizeof("\\DosDevices\\")-1;
else if (strstarts(abs_path, "\\GLOBAL??\\"))
abs_path += sizeof("\\GLOBAL??\\")-1;
- else {
- /* Unhandled absolute symlink, points outside of DOS/Win32 */
- cifs_dbg(VFS,
- "absolute symlink '%s' cannot be converted from NT format "
- "because points to unknown target\n",
- smb_target);
- rc = -EIO;
- goto out;
- }
+ else
+ goto out_unhandled_target;
/* Sometimes path separator after \?? is double backslash */
if (abs_path[0] == '\\')
@@ -910,25 +911,19 @@ globalroot:
abs_path++;
abs_path[0] = drive_letter;
} else {
- /* Unhandled absolute symlink. Report an error. */
- cifs_dbg(VFS,
- "absolute symlink '%s' cannot be converted from NT format "
- "because points to unknown target\n",
- smb_target);
- rc = -EIO;
- goto out;
+ goto out_unhandled_target;
}
abs_path_len = strlen(abs_path)+1;
- symlinkroot_len = strlen(cifs_sb->ctx->symlinkroot);
- if (cifs_sb->ctx->symlinkroot[symlinkroot_len-1] == '/')
+ symlinkroot_len = strlen(symroot);
+ if (symroot[symlinkroot_len - 1] == '/')
symlinkroot_len--;
linux_target = kmalloc(symlinkroot_len + 1 + abs_path_len, GFP_KERNEL);
if (!linux_target) {
rc = -ENOMEM;
goto out;
}
- memcpy(linux_target, cifs_sb->ctx->symlinkroot, symlinkroot_len);
+ memcpy(linux_target, symroot, symlinkroot_len);
linux_target[symlinkroot_len] = '/';
memcpy(linux_target + symlinkroot_len + 1, abs_path, abs_path_len);
} else if (smb_target[0] == sep && relative) {
@@ -966,6 +961,7 @@ globalroot:
* These paths have same format as Linux symlinks, so no
* conversion is needed.
*/
+out_unhandled_target:
linux_target = smb_target;
smb_target = NULL;
}
@@ -1172,7 +1168,6 @@ out:
if (!have_xattr_dev && (tag == IO_REPARSE_TAG_LX_CHR || tag == IO_REPARSE_TAG_LX_BLK))
return false;
- fattr->cf_dtype = S_DT(fattr->cf_mode);
return true;
}
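
A worked example of the conversion above, with a hypothetical symlinkroot=/mnt: the NT target \??\C:\dir\file is unwrapped to a drive-letter form, the drive letter becomes a leading path component, and the pieces are joined under the root to give /mnt/c/dir/file. A compact userspace sketch of the final join step, matching the symlinkroot_len adjustment in the hunk:

    #include <stdlib.h>
    #include <string.h>

    /* join "/mnt" + "c/dir/file" -> "/mnt/c/dir/file"; trailing '/' trimmed */
    static char *join_symroot(const char *symroot, const char *abs_path)
    {
        size_t rlen = strlen(symroot);
        size_t plen = strlen(abs_path) + 1; /* include NUL */
        char *out;

        if (rlen && symroot[rlen - 1] == '/')
            rlen--;                         /* avoid a double slash */
        out = malloc(rlen + 1 + plen);
        if (!out)
            return NULL;
        memcpy(out, symroot, rlen);
        out[rlen] = '/';
        memcpy(out + rlen + 1, abs_path, plen);
        return out;
    }
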
diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h
index 08de853b36a8..66269c10beba 100644
--- a/fs/smb/client/reparse.h
+++ b/fs/smb/client/reparse.h
@@ -129,10 +129,10 @@ static inline bool cifs_open_data_reparse(struct cifs_open_info_data *data)
bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
struct cifs_fattr *fattr,
struct cifs_open_info_data *data);
-int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
+int create_reparse_symlink(const unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
const char *full_path, const char *symname);
-int smb2_mknod_reparse(unsigned int xid, struct inode *inode,
+int mknod_reparse(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
const char *full_path, umode_t mode, dev_t dev);
struct reparse_data_buffer *smb2_get_reparse_point_buffer(const struct kvec *rsp_iov, u32 *len);
diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c
index b3fa9ee26912..0a8c2fcc9ded 100644
--- a/fs/smb/client/sess.c
+++ b/fs/smb/client/sess.c
@@ -332,6 +332,7 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
struct cifs_server_iface *old_iface = NULL;
struct cifs_server_iface *last_iface = NULL;
struct sockaddr_storage ss;
+ int retry = 0;
spin_lock(&ses->chan_lock);
chan_index = cifs_ses_get_chan_index(ses, server);
@@ -360,6 +361,7 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
return;
}
+try_again:
last_iface = list_last_entry(&ses->iface_list, struct cifs_server_iface,
iface_head);
iface_min_speed = last_iface->speed;
@@ -397,6 +399,13 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
}
if (list_entry_is_head(iface, &ses->iface_list, iface_head)) {
+ list_for_each_entry(iface, &ses->iface_list, iface_head)
+ iface->weight_fulfilled = 0;
+
+ /* see if it can be satisfied in second attempt */
+ if (!retry++)
+ goto try_again;
+
iface = NULL;
cifs_dbg(FYI, "unable to find a suitable iface\n");
}
@@ -445,6 +454,10 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
ses->chans[chan_index].iface = iface;
spin_unlock(&ses->chan_lock);
+
+ spin_lock(&server->srv_lock);
+ memcpy(&server->dstaddr, &iface->sockaddr, sizeof(server->dstaddr));
+ spin_unlock(&server->srv_lock);
}
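
If the first scan finds every interface's weight already fulfilled, the counters are reset and the scan is retried once before giving up. A reduced model of that loop:

    struct iface_sketch { int weight, fulfilled; };

    static struct iface_sketch *pick_iface(struct iface_sketch *tbl, int n)
    {
        int retry = 0;
        int i;

    again:
        for (i = 0; i < n; i++) {
            if (tbl[i].fulfilled < tbl[i].weight) {
                tbl[i].fulfilled++;
                return &tbl[i];
            }
        }
        if (!retry++) {
            for (i = 0; i < n; i++)
                tbl[i].fulfilled = 0;   /* start a fresh round */
            goto again;
        }
        return NULL;    /* still no suitable interface */
    }
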
static int
@@ -494,8 +507,7 @@ cifs_ses_add_channel(struct cifs_ses *ses,
ctx->domainauto = ses->domainAuto;
ctx->domainname = ses->domainName;
- /* no hostname for extra channels */
- ctx->server_hostname = "";
+ ctx->server_hostname = ses->server->hostname;
ctx->username = ses->user_name;
ctx->password = ses->password;
@@ -628,6 +640,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses,
USHRT_MAX));
pSMB->req.MaxMpxCount = cpu_to_le16(server->maxReq);
pSMB->req.VcNumber = cpu_to_le16(1);
+ pSMB->req.SessionKey = server->session_key_id;
/* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
@@ -1684,22 +1697,22 @@ _sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data)
pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
capabilities = cifs_ssetup_hdr(ses, server, pSMB);
- if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
- cifs_dbg(VFS, "NTLMSSP requires Unicode support\n");
- return -ENOSYS;
- }
-
pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
capabilities |= CAP_EXTENDED_SECURITY;
pSMB->req.Capabilities |= cpu_to_le32(capabilities);
bcc_ptr = sess_data->iov[2].iov_base;
- /* unicode strings must be word aligned */
- if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) {
- *bcc_ptr = 0;
- bcc_ptr++;
+
+ if (pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) {
+ /* unicode strings must be word aligned */
+ if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) {
+ *bcc_ptr = 0;
+ bcc_ptr++;
+ }
+ unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp);
+ } else {
+ ascii_oslm_strings(&bcc_ptr, sess_data->nls_cp);
}
- unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp);
sess_data->iov[2].iov_len = (long) bcc_ptr -
(long) sess_data->iov[2].iov_base;
diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c
index b27a182629ec..e364b6515af3 100644
--- a/fs/smb/client/smb1ops.c
+++ b/fs/smb/client/smb1ops.c
@@ -16,6 +16,7 @@
#include "fs_context.h"
#include "nterr.h"
#include "smberr.h"
+#include "reparse.h"
/*
* An NT cancel request header looks just like the original request except:
@@ -1263,17 +1264,26 @@ cifs_make_node(unsigned int xid, struct inode *inode,
if (rc == 0)
d_instantiate(dentry, newinode);
return rc;
+ } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
+ /*
+ * Check if mounted with the 'sfu' mount parm.
+ * SFU emulation should work with all servers
+ * and was used by default in earlier versions of Windows.
+ */
+ return cifs_sfu_make_node(xid, inode, dentry, tcon,
+ full_path, mode, dev);
+ } else if (le32_to_cpu(tcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS) {
+ /*
+ * mknod via reparse points requires server support for
+ * storing reparse points, which is available since
+ * Windows 2000, but was not widely used until the Windows
+ * NFS server shipped with Windows Server 2012.
+ */
+ return mknod_reparse(xid, inode, dentry, tcon,
+ full_path, mode, dev);
+ } else {
+ return -EOPNOTSUPP;
}
- /*
- * Check if mounted with mount parm 'sfu' mount parm.
- * SFU emulation should work with all servers, but only
- * supports block and char device, socket & fifo,
- * and was used by default in earlier versions of Windows
- */
- if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
- return -EPERM;
- return cifs_sfu_make_node(xid, inode, dentry, tcon,
- full_path, mode, dev);
}
static bool
@@ -1370,6 +1380,7 @@ struct smb_version_operations smb1_operations = {
.create_hardlink = CIFSCreateHardLink,
.query_symlink = cifs_query_symlink,
.get_reparse_point_buffer = cifs_get_reparse_point_buffer,
+ .create_reparse_inode = cifs_create_reparse_inode,
.open = cifs_open_file,
.set_fid = cifs_set_fid,
.close = cifs_close_file,
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index 2a3e46b8e15a..69d251726c02 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -1058,10 +1058,11 @@ int smb2_query_path_info(const unsigned int xid,
* Skip SMB2_OP_GET_REPARSE if symlink already parsed in create
* response.
*/
- if (data->reparse.tag != IO_REPARSE_TAG_SYMLINK)
+ if (data->reparse.tag != IO_REPARSE_TAG_SYMLINK) {
cmds[num_cmds++] = SMB2_OP_GET_REPARSE;
- if (!tcon->posix_extensions)
- cmds[num_cmds++] = SMB2_OP_QUERY_WSL_EA;
+ if (!tcon->posix_extensions)
+ cmds[num_cmds++] = SMB2_OP_QUERY_WSL_EA;
+ }
oparms = CIFS_OPARMS(cifs_sb, tcon, full_path,
FILE_READ_ATTRIBUTES |
@@ -1320,7 +1321,7 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
return rc;
}
-struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
+struct inode *smb2_create_reparse_inode(struct cifs_open_info_data *data,
struct super_block *sb,
const unsigned int xid,
struct cifs_tcon *tcon,
@@ -1346,7 +1347,8 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
* empty object on the server.
*/
if (!(le32_to_cpu(tcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS))
- return ERR_PTR(-EOPNOTSUPP);
+ if (!tcon->posix_extensions)
+ return ERR_PTR(-EOPNOTSUPP);
oparms = CIFS_OPARMS(cifs_sb, tcon, full_path,
SYNCHRONIZE | DELETE |
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 2fe8eeb98535..1b4a31894f43 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -504,6 +504,9 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
wsize = min_t(unsigned int, wsize, server->max_write);
#ifdef CONFIG_CIFS_SMB_DIRECT
if (server->rdma) {
+ struct smbdirect_socket_parameters *sp =
+ &server->smbd_conn->socket.parameters;
+
if (server->sign)
/*
* Account for SMB2 data transfer packet header and
@@ -511,12 +514,12 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
*/
wsize = min_t(unsigned int,
wsize,
- server->smbd_conn->max_fragmented_send_size -
+ sp->max_fragmented_send_size -
SMB2_READWRITE_PDU_HEADER_SIZE -
sizeof(struct smb2_transform_hdr));
else
wsize = min_t(unsigned int,
- wsize, server->smbd_conn->max_readwrite_size);
+ wsize, sp->max_read_write_size);
}
#endif
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
@@ -552,6 +555,9 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
rsize = min_t(unsigned int, rsize, server->max_read);
#ifdef CONFIG_CIFS_SMB_DIRECT
if (server->rdma) {
+ struct smbdirect_socket_parameters *sp =
+ &server->smbd_conn->socket.parameters;
+
if (server->sign)
/*
* Account for SMB2 data transfer packet header and
@@ -559,12 +565,12 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
*/
rsize = min_t(unsigned int,
rsize,
- server->smbd_conn->max_fragmented_recv_size -
+ sp->max_fragmented_recv_size -
SMB2_READWRITE_PDU_HEADER_SIZE -
sizeof(struct smb2_transform_hdr));
else
rsize = min_t(unsigned int,
- rsize, server->smbd_conn->max_readwrite_size);
+ rsize, sp->max_read_write_size);
}
#endif
@@ -4069,7 +4075,7 @@ map_oplock_to_lease(u8 oplock)
}
static char *
-smb2_create_lease_buf(u8 *lease_key, u8 oplock)
+smb2_create_lease_buf(u8 *lease_key, u8 oplock, u8 *parent_lease_key, __le32 flags)
{
struct create_lease *buf;
@@ -4095,7 +4101,7 @@ smb2_create_lease_buf(u8 *lease_key, u8 oplock)
}
static char *
-smb3_create_lease_buf(u8 *lease_key, u8 oplock)
+smb3_create_lease_buf(u8 *lease_key, u8 oplock, u8 *parent_lease_key, __le32 flags)
{
struct create_lease_v2 *buf;
@@ -4105,6 +4111,9 @@ smb3_create_lease_buf(u8 *lease_key, u8 oplock)
memcpy(&buf->lcontext.LeaseKey, lease_key, SMB2_LEASE_KEY_SIZE);
buf->lcontext.LeaseState = map_oplock_to_lease(oplock);
+ buf->lcontext.LeaseFlags = flags;
+ if (flags & SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE)
+ memcpy(&buf->lcontext.ParentLeaseKey, parent_lease_key, SMB2_LEASE_KEY_SIZE);
buf->ccontext.DataOffset = cpu_to_le16(offsetof
(struct create_lease_v2, lcontext));
@@ -4307,6 +4316,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
u8 key[SMB3_ENC_DEC_KEY_SIZE];
struct aead_request *req;
u8 *iv;
+ DECLARE_CRYPTO_WAIT(wait);
unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
void *creq;
size_t sensitive_size;
@@ -4357,7 +4367,11 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
aead_request_set_crypt(req, sg, sg, crypt_len, iv);
aead_request_set_ad(req, assoc_data_len);
- rc = enc ? crypto_aead_encrypt(req) : crypto_aead_decrypt(req);
+ aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &wait);
+
+ rc = crypto_wait_req(enc ? crypto_aead_encrypt(req)
+ : crypto_aead_decrypt(req), &wait);
if (!rc && enc)
memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE);
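
Registering a completion callback and wrapping the encrypt/decrypt call in crypto_wait_req() is the standard kernel idiom for AEAD transforms whose backend may complete asynchronously (returning -EINPROGRESS or -EBUSY); without it, an async driver's result would be read before it exists. The shape of the idiom, shown in isolation with error handling trimmed:

    /* kernel crypto API idiom; 'req' is a prepared struct aead_request */
    DECLARE_CRYPTO_WAIT(wait);

    aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
                              crypto_req_done, &wait);
    /* crypto_wait_req() sleeps until the async completion fires */
    rc = crypto_wait_req(enc ? crypto_aead_encrypt(req)
                             : crypto_aead_decrypt(req), &wait);
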
@@ -5246,8 +5260,9 @@ static int smb2_make_node(unsigned int xid, struct inode *inode,
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
rc = cifs_sfu_make_node(xid, inode, dentry, tcon,
full_path, mode, dev);
- } else if (le32_to_cpu(tcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS) {
- rc = smb2_mknod_reparse(xid, inode, dentry, tcon,
+ } else if ((le32_to_cpu(tcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS)
+ || (tcon->posix_extensions)) {
+ rc = mknod_reparse(xid, inode, dentry, tcon,
full_path, mode, dev);
}
return rc;
@@ -5306,7 +5321,7 @@ struct smb_version_operations smb20_operations = {
.get_reparse_point_buffer = smb2_get_reparse_point_buffer,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
- .create_reparse_symlink = smb2_create_reparse_symlink,
+ .create_reparse_inode = smb2_create_reparse_inode,
.open = smb2_open_file,
.set_fid = smb2_set_fid,
.close = smb2_close_file,
@@ -5409,7 +5424,7 @@ struct smb_version_operations smb21_operations = {
.get_reparse_point_buffer = smb2_get_reparse_point_buffer,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
- .create_reparse_symlink = smb2_create_reparse_symlink,
+ .create_reparse_inode = smb2_create_reparse_inode,
.open = smb2_open_file,
.set_fid = smb2_set_fid,
.close = smb2_close_file,
@@ -5516,7 +5531,7 @@ struct smb_version_operations smb30_operations = {
.get_reparse_point_buffer = smb2_get_reparse_point_buffer,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
- .create_reparse_symlink = smb2_create_reparse_symlink,
+ .create_reparse_inode = smb2_create_reparse_inode,
.open = smb2_open_file,
.set_fid = smb2_set_fid,
.close = smb2_close_file,
@@ -5632,7 +5647,7 @@ struct smb_version_operations smb311_operations = {
.get_reparse_point_buffer = smb2_get_reparse_point_buffer,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
- .create_reparse_symlink = smb2_create_reparse_symlink,
+ .create_reparse_inode = smb2_create_reparse_inode,
.open = smb2_open_file,
.set_fid = smb2_set_fid,
.close = smb2_close_file,
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 4e28632b5fd6..2df93a75e3b8 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -36,6 +36,7 @@
#include "smb2glob.h"
#include "cifspdu.h"
#include "cifs_spnego.h"
+#include "../common/smbdirect/smbdirect.h"
#include "smbdirect.h"
#include "trace.h"
#ifdef CONFIG_CIFS_DFS_UPCALL
@@ -411,14 +412,23 @@ skip_sess_setup:
if (!rc &&
(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL) &&
server->ops->query_server_interfaces) {
- mutex_unlock(&ses->session_mutex);
-
/*
- * query server network interfaces, in case they change
+ * query server network interfaces, in case they have changed.
+ * Also mark the session as pending this update while the query
+ * is in progress; the flag is checked to avoid calling
+ * smb2_reconnect recursively.
*/
+ ses->flags |= CIFS_SES_FLAGS_PENDING_QUERY_INTERFACES;
xid = get_xid();
rc = server->ops->query_server_interfaces(xid, tcon, false);
free_xid(xid);
+ ses->flags &= ~CIFS_SES_FLAGS_PENDING_QUERY_INTERFACES;
+
+ if (!tcon->ipc && !tcon->dummy)
+ queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
+ (SMB_INTERFACE_POLL_INTERVAL * HZ));
+
+ mutex_unlock(&ses->session_mutex);
if (rc == -EOPNOTSUPP && ses->chan_count > 1) {
/*
@@ -438,11 +448,8 @@ skip_sess_setup:
if (ses->chan_max > ses->chan_count &&
ses->iface_count &&
!SERVER_IS_CHAN(server)) {
- if (ses->chan_count == 1) {
+ if (ses->chan_count == 1)
cifs_server_dbg(VFS, "supports multichannel now\n");
- queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
- (SMB_INTERFACE_POLL_INTERVAL * HZ));
- }
cifs_try_adding_channels(ses);
}
@@ -560,11 +567,18 @@ static int smb2_ioctl_req_init(u32 opcode, struct cifs_tcon *tcon,
struct TCP_Server_Info *server,
void **request_buf, unsigned int *total_len)
{
- /* Skip reconnect only for FSCTL_VALIDATE_NEGOTIATE_INFO IOCTLs */
- if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) {
+ /*
+ * Skip reconnect in one of the following cases:
+ * 1. For FSCTL_VALIDATE_NEGOTIATE_INFO IOCTLs
+ * 2. For FSCTL_QUERY_NETWORK_INTERFACE_INFO IOCTL when called from
+ *    smb2_reconnect (indicated by the CIFS_SES_FLAGS_PENDING_QUERY_INTERFACES ses flag)
+ */
+ if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO ||
+ (opcode == FSCTL_QUERY_NETWORK_INTERFACE_INFO &&
+ (tcon->ses->flags & CIFS_SES_FLAGS_PENDING_QUERY_INTERFACES)))
return __smb2_plain_req_init(SMB2_IOCTL, tcon, server,
request_buf, total_len);
- }
+
return smb2_plain_req_init(SMB2_IOCTL, tcon, server,
request_buf, total_len);
}
@@ -2392,11 +2406,16 @@ static int
add_lease_context(struct TCP_Server_Info *server,
struct smb2_create_req *req,
struct kvec *iov,
- unsigned int *num_iovec, u8 *lease_key, __u8 *oplock)
+ unsigned int *num_iovec,
+ u8 *lease_key,
+ __u8 *oplock,
+ u8 *parent_lease_key,
+ __le32 flags)
{
unsigned int num = *num_iovec;
- iov[num].iov_base = server->ops->create_lease_buf(lease_key, *oplock);
+ iov[num].iov_base = server->ops->create_lease_buf(lease_key, *oplock,
+ parent_lease_key, flags);
if (iov[num].iov_base == NULL)
return -ENOMEM;
iov[num].iov_len = server->vals->create_lease_size;
@@ -3069,7 +3088,9 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
req->RequestedOplockLevel = *oplock; /* no srv lease support */
else {
rc = add_lease_context(server, req, iov, &n_iov,
- oparms->fid->lease_key, oplock);
+ oparms->fid->lease_key, oplock,
+ oparms->fid->parent_lease_key,
+ oparms->lease_flags);
if (rc)
return rc;
}
@@ -4208,10 +4229,8 @@ void smb2_reconnect_server(struct work_struct *work)
}
goto done;
}
-
tcon->status = TID_GOOD;
- tcon->retry = false;
- tcon->need_reconnect = false;
+ tcon->dummy = true;
/* now reconnect sessions for necessary channels */
list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) {
@@ -4442,10 +4461,10 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
#ifdef CONFIG_CIFS_SMB_DIRECT
/*
* If we want to do a RDMA write, fill in and append
- * smbd_buffer_descriptor_v1 to the end of read request
+ * smbdirect_buffer_descriptor_v1 to the end of read request
*/
if (rdata && smb3_use_rdma_offload(io_parms)) {
- struct smbd_buffer_descriptor_v1 *v1;
+ struct smbdirect_buffer_descriptor_v1 *v1;
bool need_invalidate = server->dialect == SMB30_PROT_ID;
rdata->mr = smbd_register_mr(server->smbd_conn, &rdata->subreq.io_iter,
@@ -4459,8 +4478,8 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
req->ReadChannelInfoOffset =
cpu_to_le16(offsetof(struct smb2_read_req, Buffer));
req->ReadChannelInfoLength =
- cpu_to_le16(sizeof(struct smbd_buffer_descriptor_v1));
- v1 = (struct smbd_buffer_descriptor_v1 *) &req->Buffer[0];
+ cpu_to_le16(sizeof(struct smbdirect_buffer_descriptor_v1));
+ v1 = (struct smbdirect_buffer_descriptor_v1 *) &req->Buffer[0];
v1->offset = cpu_to_le64(rdata->mr->mr->iova);
v1->token = cpu_to_le32(rdata->mr->mr->rkey);
v1->length = cpu_to_le32(rdata->mr->mr->length);
@@ -4546,7 +4565,11 @@ smb2_readv_callback(struct mid_q_entry *mid)
cifs_stats_bytes_read(tcon, rdata->got_bytes);
break;
case MID_REQUEST_SUBMITTED:
+ trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_req_submitted);
+ goto do_retry;
case MID_RETRY_NEEDED:
+ trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_retry_needed);
+do_retry:
__set_bit(NETFS_SREQ_NEED_RETRY, &rdata->subreq.flags);
rdata->result = -EAGAIN;
if (server->sign && rdata->got_bytes)
@@ -4557,11 +4580,15 @@ smb2_readv_callback(struct mid_q_entry *mid)
cifs_stats_bytes_read(tcon, rdata->got_bytes);
break;
case MID_RESPONSE_MALFORMED:
+ trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_malformed);
credits.value = le16_to_cpu(shdr->CreditRequest);
credits.instance = server->reconnect_instance;
- fallthrough;
+ rdata->result = -EIO;
+ break;
default:
+ trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_unknown);
rdata->result = -EIO;
+ break;
}
#ifdef CONFIG_CIFS_SMB_DIRECT
/*
@@ -4814,11 +4841,14 @@ smb2_writev_callback(struct mid_q_entry *mid)
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
+ trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_progress);
credits.value = le16_to_cpu(rsp->hdr.CreditRequest);
credits.instance = server->reconnect_instance;
result = smb2_check_receive(mid, server, 0);
- if (result != 0)
+ if (result != 0) {
+ trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_bad);
break;
+ }
written = le32_to_cpu(rsp->DataLength);
/*
@@ -4840,14 +4870,23 @@ smb2_writev_callback(struct mid_q_entry *mid)
}
break;
case MID_REQUEST_SUBMITTED:
+ trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_req_submitted);
+ __set_bit(NETFS_SREQ_NEED_RETRY, &wdata->subreq.flags);
+ result = -EAGAIN;
+ break;
case MID_RETRY_NEEDED:
+ trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_retry_needed);
+ __set_bit(NETFS_SREQ_NEED_RETRY, &wdata->subreq.flags);
result = -EAGAIN;
break;
case MID_RESPONSE_MALFORMED:
+ trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_malformed);
credits.value = le16_to_cpu(rsp->hdr.CreditRequest);
credits.instance = server->reconnect_instance;
- fallthrough;
+ result = -EIO;
+ break;
default:
+ trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_unknown);
result = -EIO;
break;
}
@@ -4887,8 +4926,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
server->credits, server->in_flight,
0, cifs_trace_rw_credits_write_response_clear);
wdata->credits.value = 0;
- trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_progress);
- cifs_write_subrequest_terminated(wdata, result ?: written, true);
+ cifs_write_subrequest_terminated(wdata, result ?: written);
release_mid(mid);
trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, 0,
server->credits, server->in_flight,
@@ -4968,10 +5006,10 @@ smb2_async_writev(struct cifs_io_subrequest *wdata)
#ifdef CONFIG_CIFS_SMB_DIRECT
/*
* If we want to do a server RDMA read, fill in and append
- * smbd_buffer_descriptor_v1 to the end of write request
+ * smbdirect_buffer_descriptor_v1 to the end of write request
*/
if (smb3_use_rdma_offload(io_parms)) {
- struct smbd_buffer_descriptor_v1 *v1;
+ struct smbdirect_buffer_descriptor_v1 *v1;
bool need_invalidate = server->dialect == SMB30_PROT_ID;
wdata->mr = smbd_register_mr(server->smbd_conn, &wdata->subreq.io_iter,
@@ -4990,8 +5028,8 @@ smb2_async_writev(struct cifs_io_subrequest *wdata)
req->WriteChannelInfoOffset =
cpu_to_le16(offsetof(struct smb2_write_req, Buffer));
req->WriteChannelInfoLength =
- cpu_to_le16(sizeof(struct smbd_buffer_descriptor_v1));
- v1 = (struct smbd_buffer_descriptor_v1 *) &req->Buffer[0];
+ cpu_to_le16(sizeof(struct smbdirect_buffer_descriptor_v1));
+ v1 = (struct smbdirect_buffer_descriptor_v1 *) &req->Buffer[0];
v1->offset = cpu_to_le64(wdata->mr->mr->iova);
v1->token = cpu_to_le32(wdata->mr->mr->rkey);
v1->length = cpu_to_le32(wdata->mr->mr->length);
@@ -5061,7 +5099,7 @@ out:
-(int)wdata->credits.value,
cifs_trace_rw_credits_write_response_clear);
add_credits_and_wake_if(wdata->server, &wdata->credits, 0);
- cifs_write_subrequest_terminated(wdata, rc, true);
+ cifs_write_subrequest_terminated(wdata, rc);
}
}
@@ -5917,71 +5955,6 @@ posix_qfsinf_exit:
}
int
-SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid, struct kstatfs *fsdata)
-{
- struct smb_rqst rqst;
- struct smb2_query_info_rsp *rsp = NULL;
- struct kvec iov;
- struct kvec rsp_iov;
- int rc = 0;
- int resp_buftype;
- struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server;
- struct smb2_fs_full_size_info *info = NULL;
- int flags = 0;
- int retries = 0, cur_sleep = 1;
-
-replay_again:
- /* reinitialize for possible replay */
- flags = 0;
- server = cifs_pick_channel(ses);
-
- rc = build_qfs_info_req(&iov, tcon, server,
- FS_FULL_SIZE_INFORMATION,
- sizeof(struct smb2_fs_full_size_info),
- persistent_fid, volatile_fid);
- if (rc)
- return rc;
-
- if (smb3_encryption_required(tcon))
- flags |= CIFS_TRANSFORM_REQ;
-
- memset(&rqst, 0, sizeof(struct smb_rqst));
- rqst.rq_iov = &iov;
- rqst.rq_nvec = 1;
-
- if (retries)
- smb2_set_replay(server, &rqst);
-
- rc = cifs_send_recv(xid, ses, server,
- &rqst, &resp_buftype, flags, &rsp_iov);
- free_qfs_info_req(&iov);
- if (rc) {
- cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
- goto qfsinf_exit;
- }
- rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base;
-
- info = (struct smb2_fs_full_size_info *)(
- le16_to_cpu(rsp->OutputBufferOffset) + (char *)rsp);
- rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset),
- le32_to_cpu(rsp->OutputBufferLength), &rsp_iov,
- sizeof(struct smb2_fs_full_size_info));
- if (!rc)
- smb2_copy_fs_info_to_kstatfs(info, fsdata);
-
-qfsinf_exit:
- free_rsp_buf(resp_buftype, rsp_iov.iov_base);
-
- if (is_replayable_error(rc) &&
- smb2_should_replay(tcon, &retries, &cur_sleep))
- goto replay_again;
-
- return rc;
-}
-
-int
SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, int level)
{
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index 4662c7e2d259..6e805ece6a7b 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -54,7 +54,7 @@ extern int smb3_handle_read_data(struct TCP_Server_Info *server,
extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *path,
__u32 *reparse_tag);
-struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
+struct inode *smb2_create_reparse_inode(struct cifs_open_info_data *data,
struct super_block *sb,
const unsigned int xid,
struct cifs_tcon *tcon,
@@ -259,9 +259,6 @@ extern int smb2_handle_cancelled_close(struct cifs_tcon *tcon,
__u64 volatile_fid);
extern int smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server);
void smb2_cancelled_close_fid(struct work_struct *work);
-extern int SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_file_id, u64 volatile_file_id,
- struct kstatfs *FSData);
extern int SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id,
struct kstatfs *FSData);
@@ -317,9 +314,6 @@ int smb311_posix_query_path_info(const unsigned int xid,
int posix_info_parse(const void *beg, const void *end,
struct smb2_posix_info_parsed *out);
int posix_info_sid_size(const void *beg, const void *end);
-int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
- struct dentry *dentry, struct cifs_tcon *tcon,
- const char *full_path, const char *symname);
int smb2_make_nfs_node(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
const char *full_path, umode_t mode, dev_t dev);
diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c
index b0b7254661e9..754e94a0e07f 100644
--- a/fs/smb/client/smbdirect.c
+++ b/fs/smb/client/smbdirect.c
@@ -7,6 +7,7 @@
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/folio_queue.h>
+#include "../common/smbdirect/smbdirect_pdu.h"
#include "smbdirect.h"
#include "cifs_debug.h"
#include "cifsproto.h"
@@ -50,9 +51,6 @@ struct smb_extract_to_rdma {
static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len,
struct smb_extract_to_rdma *rdma);
-/* SMBD version number */
-#define SMBD_V1 0x0100
-
/* Port numbers for SMBD transport */
#define SMB_PORT 445
#define SMBD_PORT 5445
@@ -165,10 +163,11 @@ static void smbd_disconnect_rdma_work(struct work_struct *work)
{
struct smbd_connection *info =
container_of(work, struct smbd_connection, disconnect_work);
+ struct smbdirect_socket *sc = &info->socket;
- if (info->transport_status == SMBD_CONNECTED) {
- info->transport_status = SMBD_DISCONNECTING;
- rdma_disconnect(info->id);
+ if (sc->status == SMBDIRECT_SOCKET_CONNECTED) {
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTING;
+ rdma_disconnect(sc->rdma.cm_id);
}
}
@@ -182,6 +181,7 @@ static int smbd_conn_upcall(
struct rdma_cm_id *id, struct rdma_cm_event *event)
{
struct smbd_connection *info = id->context;
+ struct smbdirect_socket *sc = &info->socket;
log_rdma_event(INFO, "event=%d status=%d\n",
event->event, event->status);
@@ -205,7 +205,7 @@ static int smbd_conn_upcall(
case RDMA_CM_EVENT_ESTABLISHED:
log_rdma_event(INFO, "connected event=%d\n", event->event);
- info->transport_status = SMBD_CONNECTED;
+ sc->status = SMBDIRECT_SOCKET_CONNECTED;
wake_up_interruptible(&info->conn_wait);
break;
@@ -213,20 +213,20 @@ static int smbd_conn_upcall(
case RDMA_CM_EVENT_UNREACHABLE:
case RDMA_CM_EVENT_REJECTED:
log_rdma_event(INFO, "connecting failed event=%d\n", event->event);
- info->transport_status = SMBD_DISCONNECTED;
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
wake_up_interruptible(&info->conn_wait);
break;
case RDMA_CM_EVENT_DEVICE_REMOVAL:
case RDMA_CM_EVENT_DISCONNECTED:
/* This happens when we fail the negotiation */
- if (info->transport_status == SMBD_NEGOTIATE_FAILED) {
- info->transport_status = SMBD_DISCONNECTED;
+ if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_FAILED) {
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
wake_up(&info->conn_wait);
break;
}
- info->transport_status = SMBD_DISCONNECTED;
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
wake_up_interruptible(&info->disconn_wait);
wake_up_interruptible(&info->wait_reassembly_queue);
wake_up_interruptible_all(&info->wait_send_queue);
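
The CM event handler above is the only driver of the socket state machine; its transitions reduce to this summary (a recap of the hunks above, not new behavior):

	/*
	 * smbdirect socket status transitions driven by RDMA CM events:
	 *
	 *   CONNECTING       --ESTABLISHED------------->  CONNECTED
	 *   CONNECTING       --UNREACHABLE/REJECTED---->  DISCONNECTED
	 *   NEGOTIATE_FAILED --DEVICE_REMOVAL/
	 *                      DISCONNECTED------------->  DISCONNECTED
	 *   CONNECTED        --DEVICE_REMOVAL/
	 *                      DISCONNECTED------------->  DISCONNECTED
	 */
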
@@ -275,6 +275,8 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
int i;
struct smbd_request *request =
container_of(wc->wr_cqe, struct smbd_request, cqe);
+ struct smbd_connection *info = request->info;
+ struct smbdirect_socket *sc = &info->socket;
log_rdma_send(INFO, "smbd_request 0x%p completed wc->status=%d\n",
request, wc->status);
@@ -286,7 +288,7 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
}
for (i = 0; i < request->num_sge; i++)
- ib_dma_unmap_single(request->info->id->device,
+ ib_dma_unmap_single(sc->ib.dev,
request->sge[i].addr,
request->sge[i].length,
DMA_TO_DEVICE);
@@ -299,7 +301,7 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
mempool_free(request, request->info->request_mempool);
}
-static void dump_smbd_negotiate_resp(struct smbd_negotiate_resp *resp)
+static void dump_smbdirect_negotiate_resp(struct smbdirect_negotiate_resp *resp)
{
log_rdma_event(INFO, "resp message min_version %u max_version %u negotiated_version %u credits_requested %u credits_granted %u status %u max_readwrite_size %u preferred_send_size %u max_receive_size %u max_fragmented_size %u\n",
resp->min_version, resp->max_version,
@@ -318,15 +320,17 @@ static bool process_negotiation_response(
struct smbd_response *response, int packet_length)
{
struct smbd_connection *info = response->info;
- struct smbd_negotiate_resp *packet = smbd_response_payload(response);
+ struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_negotiate_resp *packet = smbd_response_payload(response);
- if (packet_length < sizeof(struct smbd_negotiate_resp)) {
+ if (packet_length < sizeof(struct smbdirect_negotiate_resp)) {
log_rdma_event(ERR,
"error: packet_length=%d\n", packet_length);
return false;
}
- if (le16_to_cpu(packet->negotiated_version) != SMBD_V1) {
+ if (le16_to_cpu(packet->negotiated_version) != SMBDIRECT_V1) {
log_rdma_event(ERR, "error: negotiated_version=%x\n",
le16_to_cpu(packet->negotiated_version));
return false;
@@ -347,20 +351,20 @@ static bool process_negotiation_response(
atomic_set(&info->receive_credits, 0);
- if (le32_to_cpu(packet->preferred_send_size) > info->max_receive_size) {
+ if (le32_to_cpu(packet->preferred_send_size) > sp->max_recv_size) {
log_rdma_event(ERR, "error: preferred_send_size=%d\n",
le32_to_cpu(packet->preferred_send_size));
return false;
}
- info->max_receive_size = le32_to_cpu(packet->preferred_send_size);
+ sp->max_recv_size = le32_to_cpu(packet->preferred_send_size);
if (le32_to_cpu(packet->max_receive_size) < SMBD_MIN_RECEIVE_SIZE) {
log_rdma_event(ERR, "error: max_receive_size=%d\n",
le32_to_cpu(packet->max_receive_size));
return false;
}
- info->max_send_size = min_t(int, info->max_send_size,
- le32_to_cpu(packet->max_receive_size));
+ sp->max_send_size = min_t(u32, sp->max_send_size,
+ le32_to_cpu(packet->max_receive_size));
if (le32_to_cpu(packet->max_fragmented_size) <
SMBD_MIN_FRAGMENTED_SIZE) {
@@ -368,18 +372,18 @@ static bool process_negotiation_response(
le32_to_cpu(packet->max_fragmented_size));
return false;
}
- info->max_fragmented_send_size =
+ sp->max_fragmented_send_size =
le32_to_cpu(packet->max_fragmented_size);
info->rdma_readwrite_threshold =
- rdma_readwrite_threshold > info->max_fragmented_send_size ?
- info->max_fragmented_send_size :
+ rdma_readwrite_threshold > sp->max_fragmented_send_size ?
+ sp->max_fragmented_send_size :
rdma_readwrite_threshold;
- info->max_readwrite_size = min_t(u32,
+ sp->max_read_write_size = min_t(u32,
le32_to_cpu(packet->max_readwrite_size),
info->max_frmr_depth * PAGE_SIZE);
- info->max_frmr_depth = info->max_readwrite_size / PAGE_SIZE;
+ info->max_frmr_depth = sp->max_read_write_size / PAGE_SIZE;
return true;
}
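
Net effect of the validation above: each locally configured limit is clamped against what the peer advertised. Stripped of error handling, the negotiated values are (recap of this hunk, with sp, packet and info as in the function):

	sp->max_recv_size            = le32_to_cpu(packet->preferred_send_size);
	sp->max_send_size            = min_t(u32, sp->max_send_size,
					     le32_to_cpu(packet->max_receive_size));
	sp->max_fragmented_send_size = le32_to_cpu(packet->max_fragmented_size);
	/* read/write size is also bounded by what the local FRMR can map */
	sp->max_read_write_size      = min_t(u32,
					     le32_to_cpu(packet->max_readwrite_size),
					     info->max_frmr_depth * PAGE_SIZE);
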
@@ -393,8 +397,9 @@ static void smbd_post_send_credits(struct work_struct *work)
struct smbd_connection *info =
container_of(work, struct smbd_connection,
post_send_credits_work);
+ struct smbdirect_socket *sc = &info->socket;
- if (info->transport_status != SMBD_CONNECTED) {
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
wake_up(&info->wait_receive_queues);
return;
}
@@ -448,7 +453,7 @@ static void smbd_post_send_credits(struct work_struct *work)
/* Called from softirq, when recv is done */
static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
- struct smbd_data_transfer *data_transfer;
+ struct smbdirect_data_transfer *data_transfer;
struct smbd_response *response =
container_of(wc->wr_cqe, struct smbd_response, cqe);
struct smbd_connection *info = response->info;
@@ -474,7 +479,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
switch (response->type) {
/* SMBD negotiation response */
case SMBD_NEGOTIATE_RESP:
- dump_smbd_negotiate_resp(smbd_response_payload(response));
+ dump_smbdirect_negotiate_resp(smbd_response_payload(response));
info->full_packet_received = true;
info->negotiate_done =
process_negotiation_response(response, wc->byte_len);
@@ -531,7 +536,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
/* Send a KEEP_ALIVE response right away if requested */
info->keep_alive_requested = KEEP_ALIVE_NONE;
if (le16_to_cpu(data_transfer->flags) &
- SMB_DIRECT_RESPONSE_REQUESTED) {
+ SMBDIRECT_FLAG_RESPONSE_REQUESTED) {
info->keep_alive_requested = KEEP_ALIVE_PENDING;
}
@@ -635,32 +640,34 @@ static int smbd_ia_open(
struct smbd_connection *info,
struct sockaddr *dstaddr, int port)
{
+ struct smbdirect_socket *sc = &info->socket;
int rc;
- info->id = smbd_create_id(info, dstaddr, port);
- if (IS_ERR(info->id)) {
- rc = PTR_ERR(info->id);
+ sc->rdma.cm_id = smbd_create_id(info, dstaddr, port);
+ if (IS_ERR(sc->rdma.cm_id)) {
+ rc = PTR_ERR(sc->rdma.cm_id);
goto out1;
}
+ sc->ib.dev = sc->rdma.cm_id->device;
- if (!frwr_is_supported(&info->id->device->attrs)) {
+ if (!frwr_is_supported(&sc->ib.dev->attrs)) {
log_rdma_event(ERR, "Fast Registration Work Requests (FRWR) is not supported\n");
log_rdma_event(ERR, "Device capability flags = %llx max_fast_reg_page_list_len = %u\n",
- info->id->device->attrs.device_cap_flags,
- info->id->device->attrs.max_fast_reg_page_list_len);
+ sc->ib.dev->attrs.device_cap_flags,
+ sc->ib.dev->attrs.max_fast_reg_page_list_len);
rc = -EPROTONOSUPPORT;
goto out2;
}
info->max_frmr_depth = min_t(int,
smbd_max_frmr_depth,
- info->id->device->attrs.max_fast_reg_page_list_len);
+ sc->ib.dev->attrs.max_fast_reg_page_list_len);
info->mr_type = IB_MR_TYPE_MEM_REG;
- if (info->id->device->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
+ if (sc->ib.dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
info->mr_type = IB_MR_TYPE_SG_GAPS;
- info->pd = ib_alloc_pd(info->id->device, 0);
- if (IS_ERR(info->pd)) {
- rc = PTR_ERR(info->pd);
+ sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0);
+ if (IS_ERR(sc->ib.pd)) {
+ rc = PTR_ERR(sc->ib.pd);
log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc);
goto out2;
}
@@ -668,8 +675,8 @@ static int smbd_ia_open(
return 0;
out2:
- rdma_destroy_id(info->id);
- info->id = NULL;
+ rdma_destroy_id(sc->rdma.cm_id);
+ sc->rdma.cm_id = NULL;
out1:
return rc;
@@ -683,10 +690,12 @@ out1:
*/
static int smbd_post_send_negotiate_req(struct smbd_connection *info)
{
+ struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
struct ib_send_wr send_wr;
int rc = -ENOMEM;
struct smbd_request *request;
- struct smbd_negotiate_req *packet;
+ struct smbdirect_negotiate_req *packet;
request = mempool_alloc(info->request_mempool, GFP_KERNEL);
if (!request)
@@ -695,29 +704,29 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info)
request->info = info;
packet = smbd_request_payload(request);
- packet->min_version = cpu_to_le16(SMBD_V1);
- packet->max_version = cpu_to_le16(SMBD_V1);
+ packet->min_version = cpu_to_le16(SMBDIRECT_V1);
+ packet->max_version = cpu_to_le16(SMBDIRECT_V1);
packet->reserved = 0;
- packet->credits_requested = cpu_to_le16(info->send_credit_target);
- packet->preferred_send_size = cpu_to_le32(info->max_send_size);
- packet->max_receive_size = cpu_to_le32(info->max_receive_size);
+ packet->credits_requested = cpu_to_le16(sp->send_credit_target);
+ packet->preferred_send_size = cpu_to_le32(sp->max_send_size);
+ packet->max_receive_size = cpu_to_le32(sp->max_recv_size);
packet->max_fragmented_size =
- cpu_to_le32(info->max_fragmented_recv_size);
+ cpu_to_le32(sp->max_fragmented_recv_size);
request->num_sge = 1;
request->sge[0].addr = ib_dma_map_single(
- info->id->device, (void *)packet,
+ sc->ib.dev, (void *)packet,
sizeof(*packet), DMA_TO_DEVICE);
- if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) {
+ if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) {
rc = -EIO;
goto dma_mapping_failed;
}
request->sge[0].length = sizeof(*packet);
- request->sge[0].lkey = info->pd->local_dma_lkey;
+ request->sge[0].lkey = sc->ib.pd->local_dma_lkey;
ib_dma_sync_single_for_device(
- info->id->device, request->sge[0].addr,
+ sc->ib.dev, request->sge[0].addr,
request->sge[0].length, DMA_TO_DEVICE);
request->cqe.done = send_done;
@@ -734,14 +743,14 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info)
request->sge[0].length, request->sge[0].lkey);
atomic_inc(&info->send_pending);
- rc = ib_post_send(info->id->qp, &send_wr, NULL);
+ rc = ib_post_send(sc->ib.qp, &send_wr, NULL);
if (!rc)
return 0;
/* if we reach here, post send failed */
log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
atomic_dec(&info->send_pending);
- ib_dma_unmap_single(info->id->device, request->sge[0].addr,
+ ib_dma_unmap_single(sc->ib.dev, request->sge[0].addr,
request->sge[0].length, DMA_TO_DEVICE);
smbd_disconnect_rdma_connection(info);
@@ -774,10 +783,10 @@ static int manage_credits_prior_sending(struct smbd_connection *info)
/*
* Check if we need to send a KEEP_ALIVE message
 * The idle connection timer triggers a KEEP_ALIVE message when it expires
- * SMB_DIRECT_RESPONSE_REQUESTED is set in the message flag to have peer send
+ * SMBDIRECT_FLAG_RESPONSE_REQUESTED is set in the message flag to have peer send
* back a response.
* return value:
- * 1 if SMB_DIRECT_RESPONSE_REQUESTED needs to be set
+ * 1 if SMBDIRECT_FLAG_RESPONSE_REQUESTED needs to be set
* 0: otherwise
*/
static int manage_keep_alive_before_sending(struct smbd_connection *info)
@@ -793,6 +802,8 @@ static int manage_keep_alive_before_sending(struct smbd_connection *info)
static int smbd_post_send(struct smbd_connection *info,
struct smbd_request *request)
{
+ struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
struct ib_send_wr send_wr;
int rc, i;
@@ -801,7 +812,7 @@ static int smbd_post_send(struct smbd_connection *info,
"rdma_request sge[%d] addr=0x%llx length=%u\n",
i, request->sge[i].addr, request->sge[i].length);
ib_dma_sync_single_for_device(
- info->id->device,
+ sc->ib.dev,
request->sge[i].addr,
request->sge[i].length,
DMA_TO_DEVICE);
@@ -816,7 +827,7 @@ static int smbd_post_send(struct smbd_connection *info,
send_wr.opcode = IB_WR_SEND;
send_wr.send_flags = IB_SEND_SIGNALED;
- rc = ib_post_send(info->id->qp, &send_wr, NULL);
+ rc = ib_post_send(sc->ib.qp, &send_wr, NULL);
if (rc) {
log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
smbd_disconnect_rdma_connection(info);
@@ -824,7 +835,7 @@ static int smbd_post_send(struct smbd_connection *info,
} else
/* Reset timer for idle connection after packet is sent */
mod_delayed_work(info->workqueue, &info->idle_timer_work,
- info->keep_alive_interval*HZ);
+ msecs_to_jiffies(sp->keepalive_interval_msec));
return rc;
}
@@ -833,22 +844,24 @@ static int smbd_post_send_iter(struct smbd_connection *info,
struct iov_iter *iter,
int *_remaining_data_length)
{
+ struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
int i, rc;
int header_length;
int data_length;
struct smbd_request *request;
- struct smbd_data_transfer *packet;
+ struct smbdirect_data_transfer *packet;
int new_credits = 0;
wait_credit:
/* Wait for send credits. A SMBD packet needs one credit */
rc = wait_event_interruptible(info->wait_send_queue,
atomic_read(&info->send_credits) > 0 ||
- info->transport_status != SMBD_CONNECTED);
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
if (rc)
goto err_wait_credit;
- if (info->transport_status != SMBD_CONNECTED) {
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
log_outgoing(ERR, "disconnected not sending on wait_credit\n");
rc = -EAGAIN;
goto err_wait_credit;
@@ -860,17 +873,17 @@ wait_credit:
wait_send_queue:
wait_event(info->wait_post_send,
- atomic_read(&info->send_pending) < info->send_credit_target ||
- info->transport_status != SMBD_CONNECTED);
+ atomic_read(&info->send_pending) < sp->send_credit_target ||
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
- if (info->transport_status != SMBD_CONNECTED) {
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
log_outgoing(ERR, "disconnected not sending on wait_send_queue\n");
rc = -EAGAIN;
goto err_wait_send_queue;
}
if (unlikely(atomic_inc_return(&info->send_pending) >
- info->send_credit_target)) {
+ sp->send_credit_target)) {
atomic_dec(&info->send_pending);
goto wait_send_queue;
}
@@ -890,12 +903,14 @@ wait_send_queue:
.nr_sge = 1,
.max_sge = SMBDIRECT_MAX_SEND_SGE,
.sge = request->sge,
- .device = info->id->device,
- .local_dma_lkey = info->pd->local_dma_lkey,
+ .device = sc->ib.dev,
+ .local_dma_lkey = sc->ib.pd->local_dma_lkey,
.direction = DMA_TO_DEVICE,
};
+ size_t payload_len = umin(*_remaining_data_length,
+ sp->max_send_size - sizeof(*packet));
- rc = smb_extract_iter_to_rdma(iter, *_remaining_data_length,
+ rc = smb_extract_iter_to_rdma(iter, payload_len,
&extract);
if (rc < 0)
goto err_dma;
@@ -909,7 +924,7 @@ wait_send_queue:
/* Fill in the packet header */
packet = smbd_request_payload(request);
- packet->credits_requested = cpu_to_le16(info->send_credit_target);
+ packet->credits_requested = cpu_to_le16(sp->send_credit_target);
new_credits = manage_credits_prior_sending(info);
atomic_add(new_credits, &info->receive_credits);
@@ -919,7 +934,7 @@ wait_send_queue:
packet->flags = 0;
if (manage_keep_alive_before_sending(info))
- packet->flags |= cpu_to_le16(SMB_DIRECT_RESPONSE_REQUESTED);
+ packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED);
packet->reserved = 0;
if (!data_length)
@@ -938,23 +953,23 @@ wait_send_queue:
le32_to_cpu(packet->remaining_data_length));
/* Map the packet to DMA */
- header_length = sizeof(struct smbd_data_transfer);
+ header_length = sizeof(struct smbdirect_data_transfer);
/* If this is a packet without payload, don't send padding */
if (!data_length)
- header_length = offsetof(struct smbd_data_transfer, padding);
+ header_length = offsetof(struct smbdirect_data_transfer, padding);
- request->sge[0].addr = ib_dma_map_single(info->id->device,
+ request->sge[0].addr = ib_dma_map_single(sc->ib.dev,
(void *)packet,
header_length,
DMA_TO_DEVICE);
- if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) {
+ if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) {
rc = -EIO;
request->sge[0].addr = 0;
goto err_dma;
}
request->sge[0].length = header_length;
- request->sge[0].lkey = info->pd->local_dma_lkey;
+ request->sge[0].lkey = sc->ib.pd->local_dma_lkey;
rc = smbd_post_send(info, request);
if (!rc)
@@ -963,7 +978,7 @@ wait_send_queue:
err_dma:
for (i = 0; i < request->num_sge; i++)
if (request->sge[i].addr)
- ib_dma_unmap_single(info->id->device,
+ ib_dma_unmap_single(sc->ib.dev,
request->sge[i].addr,
request->sge[i].length,
DMA_TO_DEVICE);
@@ -1000,6 +1015,27 @@ static int smbd_post_send_empty(struct smbd_connection *info)
return smbd_post_send_iter(info, NULL, &remaining_data_length);
}
+static int smbd_post_send_full_iter(struct smbd_connection *info,
+ struct iov_iter *iter,
+ int *_remaining_data_length)
+{
+ int rc = 0;
+
+ /*
+ * smbd_post_send_iter() respects the
+ * negotiated max_send_size, so we need to
+ * loop until the full iter is posted
+ */
+
+ while (iov_iter_count(iter) > 0) {
+ rc = smbd_post_send_iter(info, iter, _remaining_data_length);
+ if (rc < 0)
+ break;
+ }
+
+ return rc;
+}
+
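
smbd_post_send_iter() now caps each post at the negotiated max_send_size, so any caller with a potentially larger iterator goes through this looping wrapper; a minimal usage sketch, mirroring the smbd_send() hunk further below ("done" is a hypothetical error label):

	iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen);
	rc = smbd_post_send_full_iter(info, &iter, &remaining_data_length);
	if (rc < 0)
		goto done;	/* hypothetical label; smbd_send() breaks out */
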
/*
* Post a receive request to the transport
* The remote peer can only send data when a receive request is posted
@@ -1008,17 +1044,19 @@ static int smbd_post_send_empty(struct smbd_connection *info)
static int smbd_post_recv(
struct smbd_connection *info, struct smbd_response *response)
{
+ struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
struct ib_recv_wr recv_wr;
int rc = -EIO;
response->sge.addr = ib_dma_map_single(
- info->id->device, response->packet,
- info->max_receive_size, DMA_FROM_DEVICE);
- if (ib_dma_mapping_error(info->id->device, response->sge.addr))
+ sc->ib.dev, response->packet,
+ sp->max_recv_size, DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(sc->ib.dev, response->sge.addr))
return rc;
- response->sge.length = info->max_receive_size;
- response->sge.lkey = info->pd->local_dma_lkey;
+ response->sge.length = sp->max_recv_size;
+ response->sge.lkey = sc->ib.pd->local_dma_lkey;
response->cqe.done = recv_done;
@@ -1027,9 +1065,9 @@ static int smbd_post_recv(
recv_wr.sg_list = &response->sge;
recv_wr.num_sge = 1;
- rc = ib_post_recv(info->id->qp, &recv_wr, NULL);
+ rc = ib_post_recv(sc->ib.qp, &recv_wr, NULL);
if (rc) {
- ib_dma_unmap_single(info->id->device, response->sge.addr,
+ ib_dma_unmap_single(sc->ib.dev, response->sge.addr,
response->sge.length, DMA_FROM_DEVICE);
smbd_disconnect_rdma_connection(info);
log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc);
@@ -1187,9 +1225,10 @@ static struct smbd_response *get_receive_buffer(struct smbd_connection *info)
static void put_receive_buffer(
struct smbd_connection *info, struct smbd_response *response)
{
+ struct smbdirect_socket *sc = &info->socket;
unsigned long flags;
- ib_dma_unmap_single(info->id->device, response->sge.addr,
+ ib_dma_unmap_single(sc->ib.dev, response->sge.addr,
response->sge.length, DMA_FROM_DEVICE);
spin_lock_irqsave(&info->receive_queue_lock, flags);
@@ -1264,6 +1303,8 @@ static void idle_connection_timer(struct work_struct *work)
struct smbd_connection *info = container_of(
work, struct smbd_connection,
idle_timer_work.work);
+ struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
if (info->keep_alive_requested != KEEP_ALIVE_NONE) {
log_keep_alive(ERR,
@@ -1278,7 +1319,7 @@ static void idle_connection_timer(struct work_struct *work)
/* Setup the next idle timeout work */
queue_delayed_work(info->workqueue, &info->idle_timer_work,
- info->keep_alive_interval*HZ);
+ msecs_to_jiffies(sp->keepalive_interval_msec));
}
/*
@@ -1289,6 +1330,8 @@ static void idle_connection_timer(struct work_struct *work)
void smbd_destroy(struct TCP_Server_Info *server)
{
struct smbd_connection *info = server->smbd_conn;
+ struct smbdirect_socket *sc;
+ struct smbdirect_socket_parameters *sp;
struct smbd_response *response;
unsigned long flags;
@@ -1296,19 +1339,22 @@ void smbd_destroy(struct TCP_Server_Info *server)
log_rdma_event(INFO, "rdma session already destroyed\n");
return;
}
+ sc = &info->socket;
+ sp = &sc->parameters;
log_rdma_event(INFO, "destroying rdma session\n");
- if (info->transport_status != SMBD_DISCONNECTED) {
- rdma_disconnect(server->smbd_conn->id);
+ if (sc->status != SMBDIRECT_SOCKET_DISCONNECTED) {
+ rdma_disconnect(sc->rdma.cm_id);
log_rdma_event(INFO, "wait for transport being disconnected\n");
wait_event_interruptible(
info->disconn_wait,
- info->transport_status == SMBD_DISCONNECTED);
+ sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
}
log_rdma_event(INFO, "destroying qp\n");
- ib_drain_qp(info->id->qp);
- rdma_destroy_qp(info->id);
+ ib_drain_qp(sc->ib.qp);
+ rdma_destroy_qp(sc->rdma.cm_id);
+ sc->ib.qp = NULL;
log_rdma_event(INFO, "cancelling idle timer\n");
cancel_delayed_work_sync(&info->idle_timer_work);
@@ -1336,7 +1382,7 @@ void smbd_destroy(struct TCP_Server_Info *server)
log_rdma_event(INFO, "free receive buffers\n");
wait_event(info->wait_receive_queues,
info->count_receive_queue + info->count_empty_packet_queue
- == info->receive_credit_max);
+ == sp->recv_credit_max);
destroy_receive_buffers(info);
/*
@@ -1355,10 +1401,10 @@ void smbd_destroy(struct TCP_Server_Info *server)
}
destroy_mr_list(info);
- ib_free_cq(info->send_cq);
- ib_free_cq(info->recv_cq);
- ib_dealloc_pd(info->pd);
- rdma_destroy_id(info->id);
+ ib_free_cq(sc->ib.send_cq);
+ ib_free_cq(sc->ib.recv_cq);
+ ib_dealloc_pd(sc->ib.pd);
+ rdma_destroy_id(sc->rdma.cm_id);
/* free mempools */
mempool_destroy(info->request_mempool);
@@ -1367,7 +1413,7 @@ void smbd_destroy(struct TCP_Server_Info *server)
mempool_destroy(info->response_mempool);
kmem_cache_destroy(info->response_cache);
- info->transport_status = SMBD_DESTROYED;
+ sc->status = SMBDIRECT_SOCKET_DESTROYED;
destroy_workqueue(info->workqueue);
log_rdma_event(INFO, "rdma session destroyed\n");
@@ -1392,7 +1438,7 @@ int smbd_reconnect(struct TCP_Server_Info *server)
* This is possible if transport is disconnected and we haven't received
* notification from RDMA, but upper layer has detected timeout
*/
- if (server->smbd_conn->transport_status == SMBD_CONNECTED) {
+ if (server->smbd_conn->socket.status == SMBDIRECT_SOCKET_CONNECTED) {
log_rdma_event(INFO, "disconnecting transport\n");
smbd_destroy(server);
}
@@ -1424,37 +1470,47 @@ static void destroy_caches_and_workqueue(struct smbd_connection *info)
#define MAX_NAME_LEN 80
static int allocate_caches_and_workqueue(struct smbd_connection *info)
{
+ struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
char name[MAX_NAME_LEN];
int rc;
+ if (WARN_ON_ONCE(sp->max_recv_size < sizeof(struct smbdirect_data_transfer)))
+ return -ENOMEM;
+
scnprintf(name, MAX_NAME_LEN, "smbd_request_%p", info);
info->request_cache =
kmem_cache_create(
name,
sizeof(struct smbd_request) +
- sizeof(struct smbd_data_transfer),
+ sizeof(struct smbdirect_data_transfer),
0, SLAB_HWCACHE_ALIGN, NULL);
if (!info->request_cache)
return -ENOMEM;
info->request_mempool =
- mempool_create(info->send_credit_target, mempool_alloc_slab,
+ mempool_create(sp->send_credit_target, mempool_alloc_slab,
mempool_free_slab, info->request_cache);
if (!info->request_mempool)
goto out1;
scnprintf(name, MAX_NAME_LEN, "smbd_response_%p", info);
+
+ struct kmem_cache_args response_args = {
+ .align = __alignof__(struct smbd_response),
+ .useroffset = (offsetof(struct smbd_response, packet) +
+ sizeof(struct smbdirect_data_transfer)),
+ .usersize = sp->max_recv_size - sizeof(struct smbdirect_data_transfer),
+ };
info->response_cache =
- kmem_cache_create(
- name,
- sizeof(struct smbd_response) +
- info->max_receive_size,
- 0, SLAB_HWCACHE_ALIGN, NULL);
+ kmem_cache_create(name,
+ sizeof(struct smbd_response) + sp->max_recv_size,
+ &response_args, SLAB_HWCACHE_ALIGN);
if (!info->response_cache)
goto out2;
info->response_mempool =
- mempool_create(info->receive_credit_max, mempool_alloc_slab,
+ mempool_create(sp->recv_credit_max, mempool_alloc_slab,
mempool_free_slab, info->response_cache);
if (!info->response_mempool)
goto out3;
@@ -1464,7 +1520,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info)
if (!info->workqueue)
goto out4;
- rc = allocate_receive_buffers(info, info->receive_credit_max);
+ rc = allocate_receive_buffers(info, sp->recv_credit_max);
if (rc) {
log_rdma_event(ERR, "failed to allocate receive buffers\n");
goto out5;
@@ -1491,6 +1547,8 @@ static struct smbd_connection *_smbd_get_connection(
{
int rc;
struct smbd_connection *info;
+ struct smbdirect_socket *sc;
+ struct smbdirect_socket_parameters *sp;
struct rdma_conn_param conn_param;
struct ib_qp_init_attr qp_attr;
struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr;
@@ -1500,101 +1558,102 @@ static struct smbd_connection *_smbd_get_connection(
info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL);
if (!info)
return NULL;
+ sc = &info->socket;
+ sp = &sc->parameters;
- info->transport_status = SMBD_CONNECTING;
+ sc->status = SMBDIRECT_SOCKET_CONNECTING;
rc = smbd_ia_open(info, dstaddr, port);
if (rc) {
log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc);
goto create_id_failed;
}
- if (smbd_send_credit_target > info->id->device->attrs.max_cqe ||
- smbd_send_credit_target > info->id->device->attrs.max_qp_wr) {
+ if (smbd_send_credit_target > sc->ib.dev->attrs.max_cqe ||
+ smbd_send_credit_target > sc->ib.dev->attrs.max_qp_wr) {
log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
smbd_send_credit_target,
- info->id->device->attrs.max_cqe,
- info->id->device->attrs.max_qp_wr);
+ sc->ib.dev->attrs.max_cqe,
+ sc->ib.dev->attrs.max_qp_wr);
goto config_failed;
}
- if (smbd_receive_credit_max > info->id->device->attrs.max_cqe ||
- smbd_receive_credit_max > info->id->device->attrs.max_qp_wr) {
+ if (smbd_receive_credit_max > sc->ib.dev->attrs.max_cqe ||
+ smbd_receive_credit_max > sc->ib.dev->attrs.max_qp_wr) {
log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
smbd_receive_credit_max,
- info->id->device->attrs.max_cqe,
- info->id->device->attrs.max_qp_wr);
+ sc->ib.dev->attrs.max_cqe,
+ sc->ib.dev->attrs.max_qp_wr);
goto config_failed;
}
- info->receive_credit_max = smbd_receive_credit_max;
- info->send_credit_target = smbd_send_credit_target;
- info->max_send_size = smbd_max_send_size;
- info->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
- info->max_receive_size = smbd_max_receive_size;
- info->keep_alive_interval = smbd_keep_alive_interval;
+ sp->recv_credit_max = smbd_receive_credit_max;
+ sp->send_credit_target = smbd_send_credit_target;
+ sp->max_send_size = smbd_max_send_size;
+ sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
+ sp->max_recv_size = smbd_max_receive_size;
+ sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000;
- if (info->id->device->attrs.max_send_sge < SMBDIRECT_MAX_SEND_SGE ||
- info->id->device->attrs.max_recv_sge < SMBDIRECT_MAX_RECV_SGE) {
+ if (sc->ib.dev->attrs.max_send_sge < SMBDIRECT_MAX_SEND_SGE ||
+ sc->ib.dev->attrs.max_recv_sge < SMBDIRECT_MAX_RECV_SGE) {
log_rdma_event(ERR,
"device %.*s max_send_sge/max_recv_sge = %d/%d too small\n",
IB_DEVICE_NAME_MAX,
- info->id->device->name,
- info->id->device->attrs.max_send_sge,
- info->id->device->attrs.max_recv_sge);
+ sc->ib.dev->name,
+ sc->ib.dev->attrs.max_send_sge,
+ sc->ib.dev->attrs.max_recv_sge);
goto config_failed;
}
- info->send_cq = NULL;
- info->recv_cq = NULL;
- info->send_cq =
- ib_alloc_cq_any(info->id->device, info,
- info->send_credit_target, IB_POLL_SOFTIRQ);
- if (IS_ERR(info->send_cq)) {
- info->send_cq = NULL;
+ sc->ib.send_cq =
+ ib_alloc_cq_any(sc->ib.dev, info,
+ sp->send_credit_target, IB_POLL_SOFTIRQ);
+ if (IS_ERR(sc->ib.send_cq)) {
+ sc->ib.send_cq = NULL;
goto alloc_cq_failed;
}
- info->recv_cq =
- ib_alloc_cq_any(info->id->device, info,
- info->receive_credit_max, IB_POLL_SOFTIRQ);
- if (IS_ERR(info->recv_cq)) {
- info->recv_cq = NULL;
+ sc->ib.recv_cq =
+ ib_alloc_cq_any(sc->ib.dev, info,
+ sp->recv_credit_max, IB_POLL_SOFTIRQ);
+ if (IS_ERR(sc->ib.recv_cq)) {
+ sc->ib.recv_cq = NULL;
goto alloc_cq_failed;
}
memset(&qp_attr, 0, sizeof(qp_attr));
qp_attr.event_handler = smbd_qp_async_error_upcall;
qp_attr.qp_context = info;
- qp_attr.cap.max_send_wr = info->send_credit_target;
- qp_attr.cap.max_recv_wr = info->receive_credit_max;
+ qp_attr.cap.max_send_wr = sp->send_credit_target;
+ qp_attr.cap.max_recv_wr = sp->recv_credit_max;
qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SEND_SGE;
qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_RECV_SGE;
qp_attr.cap.max_inline_data = 0;
qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
qp_attr.qp_type = IB_QPT_RC;
- qp_attr.send_cq = info->send_cq;
- qp_attr.recv_cq = info->recv_cq;
+ qp_attr.send_cq = sc->ib.send_cq;
+ qp_attr.recv_cq = sc->ib.recv_cq;
qp_attr.port_num = ~0;
- rc = rdma_create_qp(info->id, info->pd, &qp_attr);
+ rc = rdma_create_qp(sc->rdma.cm_id, sc->ib.pd, &qp_attr);
if (rc) {
log_rdma_event(ERR, "rdma_create_qp failed %i\n", rc);
goto create_qp_failed;
}
+ sc->ib.qp = sc->rdma.cm_id->qp;
memset(&conn_param, 0, sizeof(conn_param));
conn_param.initiator_depth = 0;
conn_param.responder_resources =
- min(info->id->device->attrs.max_qp_rd_atom,
+ min(sc->ib.dev->attrs.max_qp_rd_atom,
SMBD_CM_RESPONDER_RESOURCES);
info->responder_resources = conn_param.responder_resources;
log_rdma_mr(INFO, "responder_resources=%d\n",
info->responder_resources);
/* Need to send IRD/ORD in private data for iWARP */
- info->id->device->ops.get_port_immutable(
- info->id->device, info->id->port_num, &port_immutable);
+ sc->ib.dev->ops.get_port_immutable(
+ sc->ib.dev, sc->rdma.cm_id->port_num, &port_immutable);
if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
ird_ord_hdr[0] = info->responder_resources;
ird_ord_hdr[1] = 1;
@@ -1615,16 +1674,16 @@ static struct smbd_connection *_smbd_get_connection(
init_waitqueue_head(&info->conn_wait);
init_waitqueue_head(&info->disconn_wait);
init_waitqueue_head(&info->wait_reassembly_queue);
- rc = rdma_connect(info->id, &conn_param);
+ rc = rdma_connect(sc->rdma.cm_id, &conn_param);
if (rc) {
log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc);
goto rdma_connect_failed;
}
wait_event_interruptible(
- info->conn_wait, info->transport_status != SMBD_CONNECTING);
+ info->conn_wait, sc->status != SMBDIRECT_SOCKET_CONNECTING);
- if (info->transport_status != SMBD_CONNECTED) {
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
log_rdma_event(ERR, "rdma_connect failed port=%d\n", port);
goto rdma_connect_failed;
}
@@ -1640,7 +1699,7 @@ static struct smbd_connection *_smbd_get_connection(
init_waitqueue_head(&info->wait_send_queue);
INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer);
queue_delayed_work(info->workqueue, &info->idle_timer_work,
- info->keep_alive_interval*HZ);
+ msecs_to_jiffies(sp->keepalive_interval_msec));
init_waitqueue_head(&info->wait_send_pending);
atomic_set(&info->send_pending, 0);
@@ -1675,26 +1734,26 @@ allocate_mr_failed:
negotiation_failed:
cancel_delayed_work_sync(&info->idle_timer_work);
destroy_caches_and_workqueue(info);
- info->transport_status = SMBD_NEGOTIATE_FAILED;
+ sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
init_waitqueue_head(&info->conn_wait);
- rdma_disconnect(info->id);
+ rdma_disconnect(sc->rdma.cm_id);
wait_event(info->conn_wait,
- info->transport_status == SMBD_DISCONNECTED);
+ sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
allocate_cache_failed:
rdma_connect_failed:
- rdma_destroy_qp(info->id);
+ rdma_destroy_qp(sc->rdma.cm_id);
create_qp_failed:
alloc_cq_failed:
- if (info->send_cq)
- ib_free_cq(info->send_cq);
- if (info->recv_cq)
- ib_free_cq(info->recv_cq);
+ if (sc->ib.send_cq)
+ ib_free_cq(sc->ib.send_cq);
+ if (sc->ib.recv_cq)
+ ib_free_cq(sc->ib.recv_cq);
config_failed:
- ib_dealloc_pd(info->pd);
- rdma_destroy_id(info->id);
+ ib_dealloc_pd(sc->ib.pd);
+ rdma_destroy_id(sc->rdma.cm_id);
create_id_failed:
kfree(info);
@@ -1719,34 +1778,39 @@ try_again:
}
/*
- * Receive data from receive reassembly queue
+ * Receive data from the transport's receive reassembly queue
* All the incoming data packets are placed in reassembly queue
- * buf: the buffer to read data into
+ * iter: the buffer to read data into
* size: the length of data to read
* return value: actual data read
- * Note: this implementation copies the data from reassebmly queue to receive
+ *
+ * Note: this implementation copies the data from reassembly queue to receive
* buffers used by upper layer. This is not the optimal code path. A better way
* to do it is to not have upper layer allocate its receive buffers but rather
* borrow the buffer from reassembly queue, and return it after data is
* consumed. But this will require more changes to upper layer code, and also
 * need to consider packet boundaries while they are still being reassembled.
*/
-static int smbd_recv_buf(struct smbd_connection *info, char *buf,
- unsigned int size)
+int smbd_recv(struct smbd_connection *info, struct msghdr *msg)
{
+ struct smbdirect_socket *sc = &info->socket;
struct smbd_response *response;
- struct smbd_data_transfer *data_transfer;
+ struct smbdirect_data_transfer *data_transfer;
+ size_t size = iov_iter_count(&msg->msg_iter);
int to_copy, to_read, data_read, offset;
u32 data_length, remaining_data_length, data_offset;
int rc;
+ if (WARN_ON_ONCE(iov_iter_rw(&msg->msg_iter) == WRITE))
+ return -EINVAL; /* It's a bug in upper layer to get there */
+
again:
/*
* No need to hold the reassembly queue lock all the time as we are
* the only one reading from the front of the queue. The transport
* may add more entries to the back of the queue at the same time
*/
- log_read(INFO, "size=%d info->reassembly_data_length=%d\n", size,
+ log_read(INFO, "size=%zd info->reassembly_data_length=%d\n", size,
info->reassembly_data_length);
if (info->reassembly_data_length >= size) {
int queue_length;
@@ -1784,7 +1848,10 @@ again:
if (response->first_segment && size == 4) {
unsigned int rfc1002_len =
data_length + remaining_data_length;
- *((__be32 *)buf) = cpu_to_be32(rfc1002_len);
+ __be32 rfc1002_hdr = cpu_to_be32(rfc1002_len);
+ if (copy_to_iter(&rfc1002_hdr, sizeof(rfc1002_hdr),
+ &msg->msg_iter) != sizeof(rfc1002_hdr))
+ return -EFAULT;
data_read = 4;
response->first_segment = false;
log_read(INFO, "returning rfc1002 length %d\n",
@@ -1793,10 +1860,9 @@ again:
}
to_copy = min_t(int, data_length - offset, to_read);
- memcpy(
- buf + data_read,
- (char *)data_transfer + data_offset + offset,
- to_copy);
+ if (copy_to_iter((char *)data_transfer + data_offset + offset,
+ to_copy, &msg->msg_iter) != to_copy)
+ return -EFAULT;
/* move on to the next buffer? */
if (to_copy == data_length - offset) {
@@ -1848,12 +1914,12 @@ read_rfc1002_done:
rc = wait_event_interruptible(
info->wait_reassembly_queue,
info->reassembly_data_length >= size ||
- info->transport_status != SMBD_CONNECTED);
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
/* Don't return any data if interrupted */
if (rc)
return rc;
- if (info->transport_status != SMBD_CONNECTED) {
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
log_read(ERR, "disconnected\n");
return -ECONNABORTED;
}
@@ -1862,89 +1928,6 @@ read_rfc1002_done:
}
/*
- * Receive a page from receive reassembly queue
- * page: the page to read data into
- * to_read: the length of data to read
- * return value: actual data read
- */
-static int smbd_recv_page(struct smbd_connection *info,
- struct page *page, unsigned int page_offset,
- unsigned int to_read)
-{
- int ret;
- char *to_address;
- void *page_address;
-
- /* make sure we have the page ready for read */
- ret = wait_event_interruptible(
- info->wait_reassembly_queue,
- info->reassembly_data_length >= to_read ||
- info->transport_status != SMBD_CONNECTED);
- if (ret)
- return ret;
-
- /* now we can read from reassembly queue and not sleep */
- page_address = kmap_atomic(page);
- to_address = (char *) page_address + page_offset;
-
- log_read(INFO, "reading from page=%p address=%p to_read=%d\n",
- page, to_address, to_read);
-
- ret = smbd_recv_buf(info, to_address, to_read);
- kunmap_atomic(page_address);
-
- return ret;
-}
-
-/*
- * Receive data from transport
- * msg: a msghdr point to the buffer, can be ITER_KVEC or ITER_BVEC
- * return: total bytes read, or 0. SMB Direct will not do partial read.
- */
-int smbd_recv(struct smbd_connection *info, struct msghdr *msg)
-{
- char *buf;
- struct page *page;
- unsigned int to_read, page_offset;
- int rc;
-
- if (iov_iter_rw(&msg->msg_iter) == WRITE) {
- /* It's a bug in upper layer to get there */
- cifs_dbg(VFS, "Invalid msg iter dir %u\n",
- iov_iter_rw(&msg->msg_iter));
- rc = -EINVAL;
- goto out;
- }
-
- switch (iov_iter_type(&msg->msg_iter)) {
- case ITER_KVEC:
- buf = msg->msg_iter.kvec->iov_base;
- to_read = msg->msg_iter.kvec->iov_len;
- rc = smbd_recv_buf(info, buf, to_read);
- break;
-
- case ITER_BVEC:
- page = msg->msg_iter.bvec->bv_page;
- page_offset = msg->msg_iter.bvec->bv_offset;
- to_read = msg->msg_iter.bvec->bv_len;
- rc = smbd_recv_page(info, page, page_offset, to_read);
- break;
-
- default:
- /* It's a bug in upper layer to get there */
- cifs_dbg(VFS, "Invalid msg type %d\n",
- iov_iter_type(&msg->msg_iter));
- rc = -EINVAL;
- }
-
-out:
- /* SMBDirect will read it all or nothing */
- if (rc > 0)
- msg->msg_iter.count = 0;
- return rc;
-}
-
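
With the page and buffer paths folded into one, a reader now describes its destination with an iov_iter rather than choosing between smbd_recv_buf() and smbd_recv_page(); a sketch of a kvec-backed caller reading the 4-byte RFC1002 length:

	struct msghdr msg = {};
	struct kvec vec = { .iov_base = buf, .iov_len = 4 };

	iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, 4);
	rc = smbd_recv(server->smbd_conn, &msg);	/* copies via copy_to_iter() */
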
-/*
* Send data to transport
* Each rqst is transported as a SMBDirect payload
* rqst: the data to write
@@ -1954,12 +1937,14 @@ int smbd_send(struct TCP_Server_Info *server,
int num_rqst, struct smb_rqst *rqst_array)
{
struct smbd_connection *info = server->smbd_conn;
+ struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
struct smb_rqst *rqst;
struct iov_iter iter;
unsigned int remaining_data_length, klen;
int rc, i, rqst_idx;
- if (info->transport_status != SMBD_CONNECTED)
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
return -EAGAIN;
/*
@@ -1971,10 +1956,10 @@ int smbd_send(struct TCP_Server_Info *server,
for (i = 0; i < num_rqst; i++)
remaining_data_length += smb_rqst_len(server, &rqst_array[i]);
- if (unlikely(remaining_data_length > info->max_fragmented_send_size)) {
+ if (unlikely(remaining_data_length > sp->max_fragmented_send_size)) {
/* assertion: payload never exceeds negotiated maximum */
log_write(ERR, "payload size %d > max size %d\n",
- remaining_data_length, info->max_fragmented_send_size);
+ remaining_data_length, sp->max_fragmented_send_size);
return -EINVAL;
}
@@ -2000,14 +1985,14 @@ int smbd_send(struct TCP_Server_Info *server,
klen += rqst->rq_iov[i].iov_len;
iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen);
- rc = smbd_post_send_iter(info, &iter, &remaining_data_length);
+ rc = smbd_post_send_full_iter(info, &iter, &remaining_data_length);
if (rc < 0)
break;
if (iov_iter_count(&rqst->rq_iter) > 0) {
/* And then the data pages if there are any */
- rc = smbd_post_send_iter(info, &rqst->rq_iter,
- &remaining_data_length);
+ rc = smbd_post_send_full_iter(info, &rqst->rq_iter,
+ &remaining_data_length);
if (rc < 0)
break;
}
@@ -2053,6 +2038,7 @@ static void smbd_mr_recovery_work(struct work_struct *work)
{
struct smbd_connection *info =
container_of(work, struct smbd_connection, mr_recovery_work);
+ struct smbdirect_socket *sc = &info->socket;
struct smbd_mr *smbdirect_mr;
int rc;
@@ -2070,7 +2056,7 @@ static void smbd_mr_recovery_work(struct work_struct *work)
}
smbdirect_mr->mr = ib_alloc_mr(
- info->pd, info->mr_type,
+ sc->ib.pd, info->mr_type,
info->max_frmr_depth);
if (IS_ERR(smbdirect_mr->mr)) {
log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n",
@@ -2099,12 +2085,13 @@ static void smbd_mr_recovery_work(struct work_struct *work)
static void destroy_mr_list(struct smbd_connection *info)
{
+ struct smbdirect_socket *sc = &info->socket;
struct smbd_mr *mr, *tmp;
cancel_work_sync(&info->mr_recovery_work);
list_for_each_entry_safe(mr, tmp, &info->mr_list, list) {
if (mr->state == MR_INVALIDATED)
- ib_dma_unmap_sg(info->id->device, mr->sgt.sgl,
+ ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl,
mr->sgt.nents, mr->dir);
ib_dereg_mr(mr->mr);
kfree(mr->sgt.sgl);
@@ -2121,6 +2108,7 @@ static void destroy_mr_list(struct smbd_connection *info)
*/
static int allocate_mr_list(struct smbd_connection *info)
{
+ struct smbdirect_socket *sc = &info->socket;
int i;
struct smbd_mr *smbdirect_mr, *tmp;
@@ -2136,7 +2124,7 @@ static int allocate_mr_list(struct smbd_connection *info)
smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL);
if (!smbdirect_mr)
goto cleanup_entries;
- smbdirect_mr->mr = ib_alloc_mr(info->pd, info->mr_type,
+ smbdirect_mr->mr = ib_alloc_mr(sc->ib.pd, info->mr_type,
info->max_frmr_depth);
if (IS_ERR(smbdirect_mr->mr)) {
log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n",
@@ -2181,20 +2169,20 @@ cleanup_entries:
*/
static struct smbd_mr *get_mr(struct smbd_connection *info)
{
+ struct smbdirect_socket *sc = &info->socket;
struct smbd_mr *ret;
int rc;
again:
rc = wait_event_interruptible(info->wait_mr,
atomic_read(&info->mr_ready_count) ||
- info->transport_status != SMBD_CONNECTED);
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
if (rc) {
log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc);
return NULL;
}
- if (info->transport_status != SMBD_CONNECTED) {
- log_rdma_mr(ERR, "info->transport_status=%x\n",
- info->transport_status);
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
+ log_rdma_mr(ERR, "sc->status=%x\n", sc->status);
return NULL;
}
@@ -2247,6 +2235,7 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info,
struct iov_iter *iter,
bool writing, bool need_invalidate)
{
+ struct smbdirect_socket *sc = &info->socket;
struct smbd_mr *smbdirect_mr;
int rc, num_pages;
enum dma_data_direction dir;
@@ -2276,7 +2265,7 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info,
num_pages, iov_iter_count(iter), info->max_frmr_depth);
smbd_iter_to_mr(info, iter, &smbdirect_mr->sgt, info->max_frmr_depth);
- rc = ib_dma_map_sg(info->id->device, smbdirect_mr->sgt.sgl,
+ rc = ib_dma_map_sg(sc->ib.dev, smbdirect_mr->sgt.sgl,
smbdirect_mr->sgt.nents, dir);
if (!rc) {
log_rdma_mr(ERR, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n",
@@ -2312,7 +2301,7 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info,
* on IB_WR_REG_MR. Hardware enforces a barrier and order of execution
* on the next ib_post_send when we actually send I/O to remote peer
*/
- rc = ib_post_send(info->id->qp, &reg_wr->wr, NULL);
+ rc = ib_post_send(sc->ib.qp, &reg_wr->wr, NULL);
if (!rc)
return smbdirect_mr;
@@ -2321,7 +2310,7 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info,
/* If all failed, attempt to recover this MR by setting it MR_ERROR*/
map_mr_error:
- ib_dma_unmap_sg(info->id->device, smbdirect_mr->sgt.sgl,
+ ib_dma_unmap_sg(sc->ib.dev, smbdirect_mr->sgt.sgl,
smbdirect_mr->sgt.nents, smbdirect_mr->dir);
dma_map_error:
@@ -2359,6 +2348,7 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr)
{
struct ib_send_wr *wr;
struct smbd_connection *info = smbdirect_mr->conn;
+ struct smbdirect_socket *sc = &info->socket;
int rc = 0;
if (smbdirect_mr->need_invalidate) {
@@ -2372,7 +2362,7 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr)
wr->send_flags = IB_SEND_SIGNALED;
init_completion(&smbdirect_mr->invalidate_done);
- rc = ib_post_send(info->id->qp, wr, NULL);
+ rc = ib_post_send(sc->ib.qp, wr, NULL);
if (rc) {
log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc);
smbd_disconnect_rdma_connection(info);
@@ -2389,7 +2379,7 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr)
if (smbdirect_mr->state == MR_INVALIDATED) {
ib_dma_unmap_sg(
- info->id->device, smbdirect_mr->sgt.sgl,
+ sc->ib.dev, smbdirect_mr->sgt.sgl,
smbdirect_mr->sgt.nents,
smbdirect_mr->dir);
smbdirect_mr->state = MR_READY;
@@ -2552,13 +2542,14 @@ static ssize_t smb_extract_folioq_to_rdma(struct iov_iter *iter,
size_t fsize = folioq_folio_size(folioq, slot);
if (offset < fsize) {
- size_t part = umin(maxsize - ret, fsize - offset);
+ size_t part = umin(maxsize, fsize - offset);
if (!smb_set_sge(rdma, folio_page(folio, 0), offset, part))
return -EIO;
offset += part;
ret += part;
+ maxsize -= part;
}
if (offset >= fsize) {
@@ -2573,7 +2564,7 @@ static ssize_t smb_extract_folioq_to_rdma(struct iov_iter *iter,
slot = 0;
}
}
- } while (rdma->nr_sge < rdma->max_sge || maxsize > 0);
+ } while (rdma->nr_sge < rdma->max_sge && maxsize > 0);
iter->folioq = folioq;
iter->folioq_slot = slot;
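
The two-character condition fix above matters: with '||' the loop could keep iterating after the byte budget hit zero, and without the 'maxsize -= part' decrement the per-iteration cap never shrank, so too much data could be attached to one work request. A minimal self-contained analogue of the corrected bound:

	/* Consume a byte budget across slots, stopping on either limit. */
	static size_t fill_slots(size_t maxsize, unsigned int nr_sge,
				 unsigned int max_sge, size_t slot_size)
	{
		size_t ret = 0;

		do {
			size_t part = umin(maxsize, slot_size);

			nr_sge++;
			ret += part;
			maxsize -= part;	/* the previously missing decrement */
		} while (nr_sge < max_sge && maxsize > 0);	/* '&&', not '||' */

		return ret;
	}
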
diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h
index c08e3665150d..75b3f491c3ad 100644
--- a/fs/smb/client/smbdirect.h
+++ b/fs/smb/client/smbdirect.h
@@ -15,6 +15,9 @@
#include <rdma/rdma_cm.h>
#include <linux/mempool.h>
+#include "../common/smbdirect/smbdirect.h"
+#include "../common/smbdirect/smbdirect_socket.h"
+
extern int rdma_readwrite_threshold;
extern int smbd_max_frmr_depth;
extern int smbd_keep_alive_interval;
@@ -50,14 +53,8 @@ enum smbd_connection_status {
* 5. mempools for allocating packets
*/
struct smbd_connection {
- enum smbd_connection_status transport_status;
-
- /* RDMA related */
- struct rdma_cm_id *id;
- struct ib_qp_init_attr qp_attr;
- struct ib_pd *pd;
- struct ib_cq *send_cq, *recv_cq;
- struct ib_device_attr dev_attr;
+ struct smbdirect_socket socket;
+
int ri_rc;
struct completion ri_done;
wait_queue_head_t conn_wait;
@@ -72,15 +69,7 @@ struct smbd_connection {
spinlock_t lock_new_credits_offered;
int new_credits_offered;
- /* Connection parameters defined in [MS-SMBD] 3.1.1.1 */
- int receive_credit_max;
- int send_credit_target;
- int max_send_size;
- int max_fragmented_recv_size;
- int max_fragmented_send_size;
- int max_receive_size;
- int keep_alive_interval;
- int max_readwrite_size;
+ /* dynamic connection parameters defined in [MS-SMBD] 3.1.1.1 */
enum keep_alive_status keep_alive_requested;
int protocol;
atomic_t send_credits;
@@ -177,54 +166,6 @@ enum smbd_message_type {
SMBD_TRANSFER_DATA,
};
-#define SMB_DIRECT_RESPONSE_REQUESTED 0x0001
-
-/* SMBD negotiation request packet [MS-SMBD] 2.2.1 */
-struct smbd_negotiate_req {
- __le16 min_version;
- __le16 max_version;
- __le16 reserved;
- __le16 credits_requested;
- __le32 preferred_send_size;
- __le32 max_receive_size;
- __le32 max_fragmented_size;
-} __packed;
-
-/* SMBD negotiation response packet [MS-SMBD] 2.2.2 */
-struct smbd_negotiate_resp {
- __le16 min_version;
- __le16 max_version;
- __le16 negotiated_version;
- __le16 reserved;
- __le16 credits_requested;
- __le16 credits_granted;
- __le32 status;
- __le32 max_readwrite_size;
- __le32 preferred_send_size;
- __le32 max_receive_size;
- __le32 max_fragmented_size;
-} __packed;
-
-/* SMBD data transfer packet with payload [MS-SMBD] 2.2.3 */
-struct smbd_data_transfer {
- __le16 credits_requested;
- __le16 credits_granted;
- __le16 flags;
- __le16 reserved;
- __le32 remaining_data_length;
- __le32 data_offset;
- __le32 data_length;
- __le32 padding;
- __u8 buffer[];
-} __packed;
-
-/* The packet fields for a registered RDMA buffer */
-struct smbd_buffer_descriptor_v1 {
- __le64 offset;
- __le32 token;
- __le32 length;
-} __packed;
-
/* Maximum number of SGEs used by smbdirect.c in any send work request */
#define SMBDIRECT_MAX_SEND_SGE 6
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index 52bcb55d9952..93e5b2bb9f28 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -140,7 +140,7 @@ DECLARE_EVENT_CLASS(smb3_rw_err_class,
__entry->len = len;
__entry->rc = rc;
),
- TP_printk("\tR=%08x[%x] xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d",
+ TP_printk("R=%08x[%x] xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d",
__entry->rreq_debug_id, __entry->rreq_debug_index,
__entry->xid, __entry->sesid, __entry->tid, __entry->fid,
__entry->offset, __entry->len, __entry->rc)
@@ -190,7 +190,7 @@ DECLARE_EVENT_CLASS(smb3_other_err_class,
__entry->len = len;
__entry->rc = rc;
),
- TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d",
+ TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d",
__entry->xid, __entry->sesid, __entry->tid, __entry->fid,
__entry->offset, __entry->len, __entry->rc)
)
@@ -247,7 +247,7 @@ DECLARE_EVENT_CLASS(smb3_copy_range_err_class,
__entry->len = len;
__entry->rc = rc;
),
- TP_printk("\txid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x rc=%d",
+ TP_printk("xid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x rc=%d",
__entry->xid, __entry->sesid, __entry->tid, __entry->target_fid,
__entry->src_offset, __entry->target_fid, __entry->target_offset, __entry->len, __entry->rc)
)
@@ -298,7 +298,7 @@ DECLARE_EVENT_CLASS(smb3_copy_range_done_class,
__entry->target_offset = target_offset;
__entry->len = len;
),
- TP_printk("\txid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x",
+ TP_printk("xid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x",
__entry->xid, __entry->sesid, __entry->tid, __entry->target_fid,
__entry->src_offset, __entry->target_fid, __entry->target_offset, __entry->len)
)
@@ -482,7 +482,7 @@ DECLARE_EVENT_CLASS(smb3_fd_class,
__entry->tid = tid;
__entry->sesid = sesid;
),
- TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx",
+ TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx",
__entry->xid, __entry->sesid, __entry->tid, __entry->fid)
)
@@ -521,7 +521,7 @@ DECLARE_EVENT_CLASS(smb3_fd_err_class,
__entry->sesid = sesid;
__entry->rc = rc;
),
- TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx rc=%d",
+ TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx rc=%d",
__entry->xid, __entry->sesid, __entry->tid, __entry->fid,
__entry->rc)
)
@@ -794,7 +794,7 @@ DECLARE_EVENT_CLASS(smb3_cmd_err_class,
__entry->status = status;
__entry->rc = rc;
),
- TP_printk("\tsid=0x%llx tid=0x%x cmd=%u mid=%llu status=0x%x rc=%d",
+ TP_printk("sid=0x%llx tid=0x%x cmd=%u mid=%llu status=0x%x rc=%d",
__entry->sesid, __entry->tid, __entry->cmd, __entry->mid,
__entry->status, __entry->rc)
)
@@ -829,7 +829,7 @@ DECLARE_EVENT_CLASS(smb3_cmd_done_class,
__entry->cmd = cmd;
__entry->mid = mid;
),
- TP_printk("\tsid=0x%llx tid=0x%x cmd=%u mid=%llu",
+ TP_printk("sid=0x%llx tid=0x%x cmd=%u mid=%llu",
__entry->sesid, __entry->tid,
__entry->cmd, __entry->mid)
)
@@ -867,7 +867,7 @@ DECLARE_EVENT_CLASS(smb3_mid_class,
__entry->when_sent = when_sent;
__entry->when_received = when_received;
),
- TP_printk("\tcmd=%u mid=%llu pid=%u, when_sent=%lu when_rcv=%lu",
+ TP_printk("cmd=%u mid=%llu pid=%u, when_sent=%lu when_rcv=%lu",
__entry->cmd, __entry->mid, __entry->pid, __entry->when_sent,
__entry->when_received)
)
@@ -898,7 +898,7 @@ DECLARE_EVENT_CLASS(smb3_exit_err_class,
__assign_str(func_name);
__entry->rc = rc;
),
- TP_printk("\t%s: xid=%u rc=%d",
+ TP_printk("%s: xid=%u rc=%d",
__get_str(func_name), __entry->xid, __entry->rc)
)
@@ -924,7 +924,7 @@ DECLARE_EVENT_CLASS(smb3_sync_err_class,
__entry->ino = ino;
__entry->rc = rc;
),
- TP_printk("\tino=%lu rc=%d",
+ TP_printk("ino=%lu rc=%d",
__entry->ino, __entry->rc)
)
@@ -950,7 +950,7 @@ DECLARE_EVENT_CLASS(smb3_enter_exit_class,
__entry->xid = xid;
__assign_str(func_name);
),
- TP_printk("\t%s: xid=%u",
+ TP_printk("%s: xid=%u",
__get_str(func_name), __entry->xid)
)
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index 266af17aa7d9..191783f553ce 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -1018,14 +1018,16 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses)
uint index = 0;
unsigned int min_in_flight = UINT_MAX, max_in_flight = 0;
struct TCP_Server_Info *server = NULL;
- int i;
+ int i, start, cur;
if (!ses)
return NULL;
spin_lock(&ses->chan_lock);
+ start = atomic_inc_return(&ses->chan_seq);
for (i = 0; i < ses->chan_count; i++) {
- server = ses->chans[i].server;
+ cur = (start + i) % ses->chan_count;
+ server = ses->chans[cur].server;
if (!server || server->terminate)
continue;
@@ -1042,17 +1044,15 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses)
*/
if (server->in_flight < min_in_flight) {
min_in_flight = server->in_flight;
- index = i;
+ index = cur;
}
if (server->in_flight > max_in_flight)
max_in_flight = server->in_flight;
}
/* if all channels are equally loaded, fall back to round-robin */
- if (min_in_flight == max_in_flight) {
- index = (uint)atomic_inc_return(&ses->chan_seq);
- index %= ses->chan_count;
- }
+ if (min_in_flight == max_in_flight)
+ index = (uint)start % ses->chan_count;
server = ses->chans[index].server;
spin_unlock(&ses->chan_lock);
diff --git a/fs/smb/common/smbdirect/smbdirect.h b/fs/smb/common/smbdirect/smbdirect.h
new file mode 100644
index 000000000000..b9a385344ff3
--- /dev/null
+++ b/fs/smb/common/smbdirect/smbdirect.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2017, Microsoft Corporation.
+ * Copyright (C) 2018, LG Electronics.
+ */
+
+#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__
+#define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__
+
+/* SMB-DIRECT buffer descriptor V1 structure [MS-SMBD] 2.2.3.1 */
+struct smbdirect_buffer_descriptor_v1 {
+ __le64 offset;
+ __le32 token;
+ __le32 length;
+} __packed;
+
+/*
+ * Connection parameters mostly from [MS-SMBD] 3.1.1.1
+ *
+ * These are setup and negotiated at the beginning of a
+ * connection and remain constant unless explicitly changed.
+ *
+ * Some values are important for the upper layer.
+ */
+struct smbdirect_socket_parameters {
+ __u16 recv_credit_max;
+ __u16 send_credit_target;
+ __u32 max_send_size;
+ __u32 max_fragmented_send_size;
+ __u32 max_recv_size;
+ __u32 max_fragmented_recv_size;
+ __u32 max_read_write_size;
+ __u32 keepalive_interval_msec;
+ __u32 keepalive_timeout_msec;
+} __packed;
+
+#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__ */
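
Transports are expected to fill this structure once at connection setup and treat it as read-mostly afterwards; the client init in smbdirect.c above reduces to this seeding from module parameters (recap of an earlier hunk, not new code):

	sp->recv_credit_max          = smbd_receive_credit_max;
	sp->send_credit_target       = smbd_send_credit_target;
	sp->max_send_size            = smbd_max_send_size;
	sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
	sp->max_recv_size            = smbd_max_receive_size;
	sp->keepalive_interval_msec  = smbd_keep_alive_interval * 1000;
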
diff --git a/fs/smb/common/smbdirect/smbdirect_pdu.h b/fs/smb/common/smbdirect/smbdirect_pdu.h
new file mode 100644
index 000000000000..ae9fdb05ce23
--- /dev/null
+++ b/fs/smb/common/smbdirect/smbdirect_pdu.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2017 Stefan Metzmacher
+ */
+
+#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PDU_H__
+#define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PDU_H__
+
+#define SMBDIRECT_V1 0x0100
+
+/* SMBD negotiation request packet [MS-SMBD] 2.2.1 */
+struct smbdirect_negotiate_req {
+ __le16 min_version;
+ __le16 max_version;
+ __le16 reserved;
+ __le16 credits_requested;
+ __le32 preferred_send_size;
+ __le32 max_receive_size;
+ __le32 max_fragmented_size;
+} __packed;
+
+/* SMBD negotiation response packet [MS-SMBD] 2.2.2 */
+struct smbdirect_negotiate_resp {
+ __le16 min_version;
+ __le16 max_version;
+ __le16 negotiated_version;
+ __le16 reserved;
+ __le16 credits_requested;
+ __le16 credits_granted;
+ __le32 status;
+ __le32 max_readwrite_size;
+ __le32 preferred_send_size;
+ __le32 max_receive_size;
+ __le32 max_fragmented_size;
+} __packed;
+
+#define SMBDIRECT_DATA_MIN_HDR_SIZE 0x14
+#define SMBDIRECT_DATA_OFFSET 0x18
+
+#define SMBDIRECT_FLAG_RESPONSE_REQUESTED 0x0001
+
+/* SMBD data transfer packet with payload [MS-SMBD] 2.2.3 */
+struct smbdirect_data_transfer {
+ __le16 credits_requested;
+ __le16 credits_granted;
+ __le16 flags;
+ __le16 reserved;
+ __le32 remaining_data_length;
+ __le32 data_offset;
+ __le32 data_length;
+ __le32 padding;
+ __u8 buffer[];
+} __packed;
+
+#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PDU_H__ */
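
Because padding is the last fixed field before the flexible buffer, a sender can trim it from payload-free packets, exactly as smbdirect.c does when mapping the header for DMA (recap of the smbd_post_send_iter() hunk):

	size_t header_length = sizeof(struct smbdirect_data_transfer);

	/* a packet without payload omits the trailing padding word */
	if (!data_length)
		header_length = offsetof(struct smbdirect_data_transfer, padding);
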
diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h
new file mode 100644
index 000000000000..e5b15cc44a7b
--- /dev/null
+++ b/fs/smb/common/smbdirect/smbdirect_socket.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2025 Stefan Metzmacher
+ */
+
+#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__
+#define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__
+
+enum smbdirect_socket_status {
+ SMBDIRECT_SOCKET_CREATED,
+ SMBDIRECT_SOCKET_CONNECTING,
+ SMBDIRECT_SOCKET_CONNECTED,
+ SMBDIRECT_SOCKET_NEGOTIATE_FAILED,
+ SMBDIRECT_SOCKET_DISCONNECTING,
+ SMBDIRECT_SOCKET_DISCONNECTED,
+ SMBDIRECT_SOCKET_DESTROYED
+};
+
+struct smbdirect_socket {
+ enum smbdirect_socket_status status;
+
+ /* RDMA related */
+ struct {
+ struct rdma_cm_id *cm_id;
+ } rdma;
+
+ /* IB verbs related */
+ struct {
+ struct ib_pd *pd;
+ struct ib_cq *send_cq;
+ struct ib_cq *recv_cq;
+
+ /*
+ * shortcuts for rdma.cm_id->{qp,device};
+ */
+ struct ib_qp *qp;
+ struct ib_device *dev;
+ } ib;
+
+ struct smbdirect_socket_parameters parameters;
+};
+
+#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__ */
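The enum orders the socket lifecycle from creation to teardown; a hedged one-liner showing how callers might gate new work on it (helper name assumed):

/* Hypothetical guard: only post new work on an established socket. */
static bool smbdirect_socket_ready(const struct smbdirect_socket *sc)
{
	return sc->status == SMBDIRECT_SOCKET_CONNECTED;
}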
diff --git a/fs/smb/server/Kconfig b/fs/smb/server/Kconfig
index cf70e96ad4de..4a23a5e7e8fe 100644
--- a/fs/smb/server/Kconfig
+++ b/fs/smb/server/Kconfig
@@ -11,6 +11,7 @@ config SMB_SERVER
select CRYPTO_HMAC
select CRYPTO_ECB
select CRYPTO_LIB_DES
+ select CRYPTO_LIB_SHA256
select CRYPTO_SHA256
select CRYPTO_CMAC
select CRYPTO_SHA512
diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c
index b3d121052408..d99871c21451 100644
--- a/fs/smb/server/auth.c
+++ b/fs/smb/server/auth.c
@@ -979,40 +979,6 @@ out:
return rc;
}
-int ksmbd_gen_sd_hash(struct ksmbd_conn *conn, char *sd_buf, int len,
- __u8 *pi_hash)
-{
- int rc;
- struct ksmbd_crypto_ctx *ctx = NULL;
-
- ctx = ksmbd_crypto_ctx_find_sha256();
- if (!ctx) {
- ksmbd_debug(AUTH, "could not alloc sha256\n");
- return -ENOMEM;
- }
-
- rc = crypto_shash_init(CRYPTO_SHA256(ctx));
- if (rc) {
- ksmbd_debug(AUTH, "could not init shashn");
- goto out;
- }
-
- rc = crypto_shash_update(CRYPTO_SHA256(ctx), sd_buf, len);
- if (rc) {
- ksmbd_debug(AUTH, "could not update with n\n");
- goto out;
- }
-
- rc = crypto_shash_final(CRYPTO_SHA256(ctx), pi_hash);
- if (rc) {
- ksmbd_debug(AUTH, "Could not generate hash err : %d\n", rc);
- goto out;
- }
-out:
- ksmbd_release_crypto_ctx(ctx);
- return rc;
-}
-
static int ksmbd_get_encryption_key(struct ksmbd_work *work, __u64 ses_id,
int enc, u8 *key)
{
diff --git a/fs/smb/server/auth.h b/fs/smb/server/auth.h
index 362b6159a6cf..6879a1bd1b91 100644
--- a/fs/smb/server/auth.h
+++ b/fs/smb/server/auth.h
@@ -66,6 +66,4 @@ int ksmbd_gen_smb311_encryptionkey(struct ksmbd_conn *conn,
struct ksmbd_session *sess);
int ksmbd_gen_preauth_integrity_hash(struct ksmbd_conn *conn, char *buf,
__u8 *pi_hash);
-int ksmbd_gen_sd_hash(struct ksmbd_conn *conn, char *sd_buf, int len,
- __u8 *pi_hash);
#endif
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index 83764c230e9d..3f04a2977ba8 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -40,7 +40,7 @@ void ksmbd_conn_free(struct ksmbd_conn *conn)
kvfree(conn->request_buf);
kfree(conn->preauth_info);
if (atomic_dec_and_test(&conn->refcnt)) {
- ksmbd_free_transport(conn->transport);
+ conn->transport->ops->free_transport(conn->transport);
kfree(conn);
}
}
diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h
index 14620e147dda..dd3e0e3f7bf0 100644
--- a/fs/smb/server/connection.h
+++ b/fs/smb/server/connection.h
@@ -108,6 +108,7 @@ struct ksmbd_conn {
__le16 signing_algorithm;
bool binding;
atomic_t refcnt;
+ bool is_aapl;
};
struct ksmbd_conn_ops {
@@ -132,6 +133,7 @@ struct ksmbd_transport_ops {
void *buf, unsigned int len,
struct smb2_buffer_desc_v1 *desc,
unsigned int desc_len);
+ void (*free_transport)(struct ksmbd_transport *kt);
};
struct ksmbd_transport {
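With the new ->free_transport() hook, each transport frees its own containing object instead of the core calling a TCP-specific function; a hedged sketch of the pattern for a hypothetical transport (all foo_* names invented):

#include <linux/slab.h>

struct foo_transport {
	struct ksmbd_transport transport;
	/* transport-private state ... */
};

static void foo_free_transport(struct ksmbd_transport *kt)
{
	/* Recover the container and free it as a whole. */
	kfree(container_of(kt, struct foo_transport, transport));
}

static const struct ksmbd_transport_ops foo_transport_ops = {
	.free_transport	= foo_free_transport,
};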
diff --git a/fs/smb/server/crypto_ctx.c b/fs/smb/server/crypto_ctx.c
index ce733dc9a4a3..80bd68c8635e 100644
--- a/fs/smb/server/crypto_ctx.c
+++ b/fs/smb/server/crypto_ctx.c
@@ -75,9 +75,6 @@ static struct shash_desc *alloc_shash_desc(int id)
case CRYPTO_SHASH_CMACAES:
tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
break;
- case CRYPTO_SHASH_SHA256:
- tfm = crypto_alloc_shash("sha256", 0, 0);
- break;
case CRYPTO_SHASH_SHA512:
tfm = crypto_alloc_shash("sha512", 0, 0);
break;
@@ -198,11 +195,6 @@ struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_cmacaes(void)
return ____crypto_shash_ctx_find(CRYPTO_SHASH_CMACAES);
}
-struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha256(void)
-{
- return ____crypto_shash_ctx_find(CRYPTO_SHASH_SHA256);
-}
-
struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha512(void)
{
return ____crypto_shash_ctx_find(CRYPTO_SHASH_SHA512);
diff --git a/fs/smb/server/crypto_ctx.h b/fs/smb/server/crypto_ctx.h
index 4a367c62f653..ac64801d52d3 100644
--- a/fs/smb/server/crypto_ctx.h
+++ b/fs/smb/server/crypto_ctx.h
@@ -13,7 +13,6 @@ enum {
CRYPTO_SHASH_HMACMD5 = 0,
CRYPTO_SHASH_HMACSHA256,
CRYPTO_SHASH_CMACAES,
- CRYPTO_SHASH_SHA256,
CRYPTO_SHASH_SHA512,
CRYPTO_SHASH_MAX,
};
@@ -39,14 +38,12 @@ struct ksmbd_crypto_ctx {
#define CRYPTO_HMACMD5(c) ((c)->desc[CRYPTO_SHASH_HMACMD5])
#define CRYPTO_HMACSHA256(c) ((c)->desc[CRYPTO_SHASH_HMACSHA256])
#define CRYPTO_CMACAES(c) ((c)->desc[CRYPTO_SHASH_CMACAES])
-#define CRYPTO_SHA256(c) ((c)->desc[CRYPTO_SHASH_SHA256])
#define CRYPTO_SHA512(c) ((c)->desc[CRYPTO_SHASH_SHA512])
#define CRYPTO_HMACMD5_TFM(c) ((c)->desc[CRYPTO_SHASH_HMACMD5]->tfm)
#define CRYPTO_HMACSHA256_TFM(c)\
((c)->desc[CRYPTO_SHASH_HMACSHA256]->tfm)
#define CRYPTO_CMACAES_TFM(c) ((c)->desc[CRYPTO_SHASH_CMACAES]->tfm)
-#define CRYPTO_SHA256_TFM(c) ((c)->desc[CRYPTO_SHASH_SHA256]->tfm)
#define CRYPTO_SHA512_TFM(c) ((c)->desc[CRYPTO_SHASH_SHA512]->tfm)
#define CRYPTO_GCM(c) ((c)->ccmaes[CRYPTO_AEAD_AES_GCM])
@@ -57,7 +54,6 @@ struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_hmacmd5(void);
struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_hmacsha256(void);
struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_cmacaes(void);
struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha512(void);
-struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha256(void);
struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_gcm(void);
struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_ccm(void);
void ksmbd_crypto_destroy(void);
diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c
index ab533c602987..8c9c49c3a0a4 100644
--- a/fs/smb/server/server.c
+++ b/fs/smb/server/server.c
@@ -631,6 +631,5 @@ MODULE_SOFTDEP("pre: sha512");
MODULE_SOFTDEP("pre: aead2");
MODULE_SOFTDEP("pre: ccm");
MODULE_SOFTDEP("pre: gcm");
-MODULE_SOFTDEP("pre: crc32");
module_init(ksmbd_server_init)
module_exit(ksmbd_server_exit)
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 8d414239b3fe..0d92ce49aed7 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -1594,7 +1594,7 @@ static int krb5_authenticate(struct ksmbd_work *work,
struct ksmbd_conn *conn = work->conn;
struct ksmbd_session *sess = work->sess;
char *in_blob, *out_blob;
- struct channel *chann = NULL;
+ struct channel *chann = NULL, *old;
u64 prev_sess_id;
int in_len, out_len;
int retval;
@@ -1607,24 +1607,38 @@ static int krb5_authenticate(struct ksmbd_work *work,
out_len = work->response_sz -
(le16_to_cpu(rsp->SecurityBufferOffset) + 4);
- /* Check previous session */
- prev_sess_id = le64_to_cpu(req->PreviousSessionId);
- if (prev_sess_id && prev_sess_id != sess->id)
- destroy_previous_session(conn, sess->user, prev_sess_id);
-
retval = ksmbd_krb5_authenticate(sess, in_blob, in_len,
out_blob, &out_len);
if (retval) {
ksmbd_debug(SMB, "krb5 authentication failed\n");
return -EINVAL;
}
+
+ /* Check previous session */
+ prev_sess_id = le64_to_cpu(req->PreviousSessionId);
+ if (prev_sess_id && prev_sess_id != sess->id)
+ destroy_previous_session(conn, sess->user, prev_sess_id);
+
rsp->SecurityBufferLength = cpu_to_le16(out_len);
- if ((conn->sign || server_conf.enforced_signing) ||
+ /*
+ * If the session state is SMB2_SESSION_VALID, we can assume
+ * this is a reauthentication. The user/password has already
+ * been verified, so return here.
+ */
+ if (sess->state == SMB2_SESSION_VALID) {
+ if (conn->binding)
+ goto binding_session;
+ return 0;
+ }
+
+ if ((rsp->SessionFlags != SMB2_SESSION_FLAG_IS_GUEST_LE &&
+ (conn->sign || server_conf.enforced_signing)) ||
(req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED))
sess->sign = true;
- if (smb3_encryption_negotiated(conn)) {
+ if (smb3_encryption_negotiated(conn) &&
+ !(req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) {
retval = conn->ops->generate_encryptionkey(conn, sess);
if (retval) {
ksmbd_debug(SMB,
@@ -1637,6 +1651,7 @@ static int krb5_authenticate(struct ksmbd_work *work,
sess->sign = false;
}
+binding_session:
if (conn->dialect >= SMB30_PROT_ID) {
chann = lookup_chann_list(sess, conn);
if (!chann) {
@@ -1645,7 +1660,12 @@ static int krb5_authenticate(struct ksmbd_work *work,
return -ENOMEM;
chann->conn = conn;
- xa_store(&sess->ksmbd_chann_list, (long)conn, chann, KSMBD_DEFAULT_GFP);
+ old = xa_store(&sess->ksmbd_chann_list, (long)conn,
+ chann, KSMBD_DEFAULT_GFP);
+ if (xa_is_err(old)) {
+ kfree(chann);
+ return xa_err(old);
+ }
}
}
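xa_store() returns the displaced entry, or an error encoded as a pointer when the store itself failed; the fix above checks for that instead of leaking chann. A minimal generic sketch of the pattern:

#include <linux/slab.h>
#include <linux/xarray.h>

/* Sketch: store an entry, freeing it if the xarray could not take it. */
static int store_or_free(struct xarray *xa, unsigned long index, void *entry)
{
	void *old = xa_store(xa, index, entry, GFP_KERNEL);

	if (xa_is_err(old)) {
		kfree(entry);
		return xa_err(old);
	}
	return 0;
}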
@@ -1832,8 +1852,6 @@ int smb2_sess_setup(struct ksmbd_work *work)
ksmbd_conn_set_good(conn);
sess->state = SMB2_SESSION_VALID;
}
- kfree(sess->Preauth_HashValue);
- sess->Preauth_HashValue = NULL;
} else if (conn->preferred_auth_mech == KSMBD_AUTH_NTLMSSP) {
if (negblob->MessageType == NtLmNegotiate) {
rc = ntlm_negotiate(work, negblob, negblob_len, rsp);
@@ -1860,8 +1878,6 @@ int smb2_sess_setup(struct ksmbd_work *work)
kfree(preauth_sess);
}
}
- kfree(sess->Preauth_HashValue);
- sess->Preauth_HashValue = NULL;
} else {
pr_info_ratelimited("Unknown NTLMSSP message type : 0x%x\n",
le32_to_cpu(negblob->MessageType));
@@ -2580,7 +2596,7 @@ static void smb2_update_xattrs(struct ksmbd_tree_connect *tcon,
}
}
-static int smb2_creat(struct ksmbd_work *work, struct path *parent_path,
+static int smb2_creat(struct ksmbd_work *work,
struct path *path, char *name, int open_flags,
umode_t posix_mode, bool is_dir)
{
@@ -2609,7 +2625,7 @@ static int smb2_creat(struct ksmbd_work *work, struct path *parent_path,
return rc;
}
- rc = ksmbd_vfs_kern_path_locked(work, name, 0, parent_path, path, 0);
+ rc = ksmbd_vfs_kern_path(work, name, 0, path, 0);
if (rc) {
pr_err("cannot get linux path (%s), err = %d\n",
name, rc);
@@ -2859,7 +2875,7 @@ int smb2_open(struct ksmbd_work *work)
struct ksmbd_tree_connect *tcon = work->tcon;
struct smb2_create_req *req;
struct smb2_create_rsp *rsp;
- struct path path, parent_path;
+ struct path path;
struct ksmbd_share_config *share = tcon->share_conf;
struct ksmbd_file *fp = NULL;
struct file *filp = NULL;
@@ -2874,7 +2890,7 @@ int smb2_open(struct ksmbd_work *work)
int req_op_level = 0, open_flags = 0, may_flags = 0, file_info = 0;
int rc = 0;
int contxt_cnt = 0, query_disk_id = 0;
- int maximal_access_ctxt = 0, posix_ctxt = 0;
+ bool maximal_access_ctxt = false, posix_ctxt = false;
int s_type = 0;
int next_off = 0;
char *name = NULL;
@@ -2903,6 +2919,27 @@ int smb2_open(struct ksmbd_work *work)
return create_smb2_pipe(work);
}
+ if (req->CreateContextsOffset && tcon->posix_extensions) {
+ context = smb2_find_context_vals(req, SMB2_CREATE_TAG_POSIX, 16);
+ if (IS_ERR(context)) {
+ rc = PTR_ERR(context);
+ goto err_out2;
+ } else if (context) {
+ struct create_posix *posix = (struct create_posix *)context;
+
+ if (le16_to_cpu(context->DataOffset) +
+ le32_to_cpu(context->DataLength) <
+ sizeof(struct create_posix) - 4) {
+ rc = -EINVAL;
+ goto err_out2;
+ }
+ ksmbd_debug(SMB, "get posix context\n");
+
+ posix_mode = le32_to_cpu(posix->Mode);
+ posix_ctxt = true;
+ }
+ }
+
if (req->NameLength) {
name = smb2_get_name((char *)req + le16_to_cpu(req->NameOffset),
le16_to_cpu(req->NameLength),
@@ -2925,9 +2962,11 @@ int smb2_open(struct ksmbd_work *work)
goto err_out2;
}
- rc = ksmbd_validate_filename(name);
- if (rc < 0)
- goto err_out2;
+ if (!posix_ctxt) {
+ rc = ksmbd_validate_filename(name);
+ if (rc < 0)
+ goto err_out2;
+ }
if (ksmbd_share_veto_filename(share, name)) {
rc = -ENOENT;
@@ -3085,28 +3124,6 @@ int smb2_open(struct ksmbd_work *work)
rc = -EBADF;
goto err_out2;
}
-
- if (tcon->posix_extensions) {
- context = smb2_find_context_vals(req,
- SMB2_CREATE_TAG_POSIX, 16);
- if (IS_ERR(context)) {
- rc = PTR_ERR(context);
- goto err_out2;
- } else if (context) {
- struct create_posix *posix =
- (struct create_posix *)context;
- if (le16_to_cpu(context->DataOffset) +
- le32_to_cpu(context->DataLength) <
- sizeof(struct create_posix) - 4) {
- rc = -EINVAL;
- goto err_out2;
- }
- ksmbd_debug(SMB, "get posix context\n");
-
- posix_mode = le32_to_cpu(posix->Mode);
- posix_ctxt = 1;
- }
- }
}
if (ksmbd_override_fsids(work)) {
@@ -3114,8 +3131,8 @@ int smb2_open(struct ksmbd_work *work)
goto err_out2;
}
- rc = ksmbd_vfs_kern_path_locked(work, name, LOOKUP_NO_SYMLINKS,
- &parent_path, &path, 1);
+ rc = ksmbd_vfs_kern_path(work, name, LOOKUP_NO_SYMLINKS,
+ &path, 1);
if (!rc) {
file_present = true;
@@ -3236,7 +3253,7 @@ int smb2_open(struct ksmbd_work *work)
/*create file if not present */
if (!file_present) {
- rc = smb2_creat(work, &parent_path, &path, name, open_flags,
+ rc = smb2_creat(work, &path, name, open_flags,
posix_mode,
req->CreateOptions & FILE_DIRECTORY_FILE_LE);
if (rc) {
@@ -3441,7 +3458,7 @@ int smb2_open(struct ksmbd_work *work)
}
if (file_present || created)
- ksmbd_vfs_kern_path_unlock(&parent_path, &path);
+ path_put(&path);
if (!S_ISDIR(file_inode(filp)->i_mode) && open_flags & O_TRUNC &&
!fp->attrib_only && !stream_name) {
@@ -3539,6 +3556,15 @@ int smb2_open(struct ksmbd_work *work)
ksmbd_debug(SMB, "get query on disk id context\n");
query_disk_id = 1;
}
+
+ if (!conn->is_aapl) {
+ context = smb2_find_context_vals(req, SMB2_CREATE_AAPL, 4);
+ if (IS_ERR(context)) {
+ rc = PTR_ERR(context);
+ goto err_out1;
+ } else if (context)
+ conn->is_aapl = true;
+ }
}
rc = ksmbd_vfs_getattr(&path, &stat);
@@ -3713,7 +3739,7 @@ reconnected_fp:
err_out:
if (rc && (file_present || created))
- ksmbd_vfs_kern_path_unlock(&parent_path, &path);
+ path_put(&path);
err_out1:
ksmbd_revert_fsids(work);
@@ -3978,7 +4004,10 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level,
if (dinfo->EaSize)
dinfo->ExtFileAttributes = FILE_ATTRIBUTE_REPARSE_POINT_LE;
dinfo->Reserved = 0;
- dinfo->UniqueId = cpu_to_le64(ksmbd_kstat->kstat->ino);
+ if (conn->is_aapl)
+ dinfo->UniqueId = 0;
+ else
+ dinfo->UniqueId = cpu_to_le64(ksmbd_kstat->kstat->ino);
if (d_info->hide_dot_file && d_info->name[0] == '.')
dinfo->ExtFileAttributes |= FILE_ATTRIBUTE_HIDDEN_LE;
memcpy(dinfo->FileName, conv_name, conv_len);
@@ -3995,7 +4024,10 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level,
smb2_get_reparse_tag_special_file(ksmbd_kstat->kstat->mode);
if (fibdinfo->EaSize)
fibdinfo->ExtFileAttributes = FILE_ATTRIBUTE_REPARSE_POINT_LE;
- fibdinfo->UniqueId = cpu_to_le64(ksmbd_kstat->kstat->ino);
+ if (conn->is_aapl)
+ fibdinfo->UniqueId = 0;
+ else
+ fibdinfo->UniqueId = cpu_to_le64(ksmbd_kstat->kstat->ino);
fibdinfo->ShortNameLength = 0;
fibdinfo->Reserved = 0;
fibdinfo->Reserved2 = cpu_to_le16(0);
@@ -4091,20 +4123,6 @@ struct smb2_query_dir_private {
int info_level;
};
-static void lock_dir(struct ksmbd_file *dir_fp)
-{
- struct dentry *dir = dir_fp->filp->f_path.dentry;
-
- inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
-}
-
-static void unlock_dir(struct ksmbd_file *dir_fp)
-{
- struct dentry *dir = dir_fp->filp->f_path.dentry;
-
- inode_unlock(d_inode(dir));
-}
-
static int process_query_dir_entries(struct smb2_query_dir_private *priv)
{
struct mnt_idmap *idmap = file_mnt_idmap(priv->dir_fp->filp);
@@ -4119,12 +4137,10 @@ static int process_query_dir_entries(struct smb2_query_dir_private *priv)
if (dentry_name(priv->d_info, priv->info_level))
return -EINVAL;
- lock_dir(priv->dir_fp);
- dent = lookup_one(idmap,
- &QSTR_LEN(priv->d_info->name,
- priv->d_info->name_len),
- priv->dir_fp->filp->f_path.dentry);
- unlock_dir(priv->dir_fp);
+ dent = lookup_one_unlocked(idmap,
+ &QSTR_LEN(priv->d_info->name,
+ priv->d_info->name_len),
+ priv->dir_fp->filp->f_path.dentry);
if (IS_ERR(dent)) {
ksmbd_debug(SMB, "Cannot lookup `%s' [%ld]\n",
@@ -4855,8 +4871,13 @@ static int get_file_standard_info(struct smb2_query_info_rsp *rsp,
sinfo = (struct smb2_file_standard_info *)rsp->Buffer;
delete_pending = ksmbd_inode_pending_delete(fp);
- sinfo->AllocationSize = cpu_to_le64(stat.blocks << 9);
- sinfo->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
+ if (!ksmbd_stream_fd(fp)) {
+ sinfo->AllocationSize = cpu_to_le64(stat.blocks << 9);
+ sinfo->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
+ } else {
+ sinfo->AllocationSize = cpu_to_le64(fp->stream.size);
+ sinfo->EndOfFile = cpu_to_le64(fp->stream.size);
+ }
sinfo->NumberOfLinks = cpu_to_le32(get_nlink(&stat) - delete_pending);
sinfo->DeletePending = delete_pending;
sinfo->Directory = S_ISDIR(stat.mode) ? 1 : 0;
@@ -4919,9 +4940,14 @@ static int get_file_all_info(struct ksmbd_work *work,
file_info->ChangeTime = cpu_to_le64(time);
file_info->Attributes = fp->f_ci->m_fattr;
file_info->Pad1 = 0;
- file_info->AllocationSize =
- cpu_to_le64(stat.blocks << 9);
- file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
+ if (!ksmbd_stream_fd(fp)) {
+ file_info->AllocationSize =
+ cpu_to_le64(stat.blocks << 9);
+ file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
+ } else {
+ file_info->AllocationSize = cpu_to_le64(fp->stream.size);
+ file_info->EndOfFile = cpu_to_le64(fp->stream.size);
+ }
file_info->NumberOfLinks =
cpu_to_le32(get_nlink(&stat) - delete_pending);
file_info->DeletePending = delete_pending;
@@ -4930,7 +4956,10 @@ static int get_file_all_info(struct ksmbd_work *work,
file_info->IndexNumber = cpu_to_le64(stat.ino);
file_info->EASize = 0;
file_info->AccessFlags = fp->daccess;
- file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos);
+ if (!ksmbd_stream_fd(fp))
+ file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos);
+ else
+ file_info->CurrentByteOffset = cpu_to_le64(fp->stream.pos);
file_info->Mode = fp->coption;
file_info->AlignmentRequirement = 0;
conv_len = smbConvertToUTF16((__le16 *)file_info->FileName, filename,
@@ -5118,8 +5147,13 @@ static int get_file_network_open_info(struct smb2_query_info_rsp *rsp,
time = ksmbd_UnixTimeToNT(stat.ctime);
file_info->ChangeTime = cpu_to_le64(time);
file_info->Attributes = fp->f_ci->m_fattr;
- file_info->AllocationSize = cpu_to_le64(stat.blocks << 9);
- file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
+ if (!ksmbd_stream_fd(fp)) {
+ file_info->AllocationSize = cpu_to_le64(stat.blocks << 9);
+ file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
+ } else {
+ file_info->AllocationSize = cpu_to_le64(fp->stream.size);
+ file_info->EndOfFile = cpu_to_le64(fp->stream.size);
+ }
file_info->Reserved = cpu_to_le32(0);
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_ntwrk_info));
@@ -5142,7 +5176,11 @@ static void get_file_position_info(struct smb2_query_info_rsp *rsp,
struct smb2_file_pos_info *file_info;
file_info = (struct smb2_file_pos_info *)rsp->Buffer;
- file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos);
+ if (!ksmbd_stream_fd(fp))
+ file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos);
+ else
+ file_info->CurrentByteOffset = cpu_to_le64(fp->stream.pos);
+
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_pos_info));
}
@@ -5231,8 +5269,13 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp,
file_info->ChangeTime = cpu_to_le64(time);
file_info->DosAttributes = fp->f_ci->m_fattr;
file_info->Inode = cpu_to_le64(stat.ino);
- file_info->EndOfFile = cpu_to_le64(stat.size);
- file_info->AllocationSize = cpu_to_le64(stat.blocks << 9);
+ if (!ksmbd_stream_fd(fp)) {
+ file_info->EndOfFile = cpu_to_le64(stat.size);
+ file_info->AllocationSize = cpu_to_le64(stat.blocks << 9);
+ } else {
+ file_info->EndOfFile = cpu_to_le64(fp->stream.size);
+ file_info->AllocationSize = cpu_to_le64(fp->stream.size);
+ }
file_info->HardLinks = cpu_to_le32(stat.nlink);
file_info->Mode = cpu_to_le32(stat.mode & 0777);
switch (stat.mode & S_IFMT) {
@@ -6008,8 +6051,7 @@ static int smb2_create_link(struct ksmbd_work *work,
struct nls_table *local_nls)
{
char *link_name = NULL, *target_name = NULL, *pathname = NULL;
- struct path path, parent_path;
- bool file_present = false;
+ struct path path;
int rc;
if (buf_len < (u64)sizeof(struct smb2_file_link_info) +
@@ -6038,15 +6080,12 @@ static int smb2_create_link(struct ksmbd_work *work,
ksmbd_debug(SMB, "target name is %s\n", target_name);
rc = ksmbd_vfs_kern_path_locked(work, link_name, LOOKUP_NO_SYMLINKS,
- &parent_path, &path, 0);
+ &path, 0);
if (rc) {
if (rc != -ENOENT)
goto out;
- } else
- file_present = true;
-
- if (file_info->ReplaceIfExists) {
- if (file_present) {
+ } else {
+ if (file_info->ReplaceIfExists) {
rc = ksmbd_vfs_remove_file(work, &path);
if (rc) {
rc = -EINVAL;
@@ -6054,21 +6093,17 @@ static int smb2_create_link(struct ksmbd_work *work,
link_name);
goto out;
}
- }
- } else {
- if (file_present) {
+ } else {
rc = -EEXIST;
ksmbd_debug(SMB, "link already exists\n");
goto out;
}
+ ksmbd_vfs_kern_path_unlock(&path);
}
-
rc = ksmbd_vfs_link(work, target_name, link_name);
if (rc)
rc = -EINVAL;
out:
- if (file_present)
- ksmbd_vfs_kern_path_unlock(&parent_path, &path);
if (!IS_ERR(link_name))
kfree(link_name);
@@ -6174,6 +6209,9 @@ static int set_file_allocation_info(struct ksmbd_work *work,
if (!(fp->daccess & FILE_WRITE_DATA_LE))
return -EACCES;
+ if (ksmbd_stream_fd(fp))
+ return 0;
+
rc = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
AT_STATX_SYNC_AS_STAT);
if (rc)
@@ -6232,7 +6270,8 @@ static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp,
* truncate of some filesystem like FAT32 fill zero data in
* truncated range.
*/
- if (inode->i_sb->s_magic != MSDOS_SUPER_MAGIC) {
+ if (inode->i_sb->s_magic != MSDOS_SUPER_MAGIC &&
+ !ksmbd_stream_fd(fp)) {
ksmbd_debug(SMB, "truncated to newsize %lld\n", newsize);
rc = ksmbd_vfs_truncate(work, fp, newsize);
if (rc) {
@@ -6305,7 +6344,13 @@ static int set_file_position_info(struct ksmbd_file *fp,
return -EINVAL;
}
- fp->filp->f_pos = current_byte_offset;
+ if (!ksmbd_stream_fd(fp)) {
+ fp->filp->f_pos = current_byte_offset;
+ } else {
+ if (current_byte_offset > XATTR_SIZE_MAX)
+ current_byte_offset = XATTR_SIZE_MAX;
+ fp->stream.pos = current_byte_offset;
+ }
return 0;
}
@@ -7793,7 +7838,7 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn,
if (!ksmbd_find_netdev_name_iface_list(netdev->name))
continue;
- flags = dev_get_flags(netdev);
+ flags = netif_get_flags(netdev);
if (!(flags & IFF_RUNNING))
continue;
ipv6_retry:
@@ -8519,11 +8564,6 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work)
goto err_out;
}
- opinfo->op_state = OPLOCK_STATE_NONE;
- wake_up_interruptible_all(&opinfo->oplock_q);
- opinfo_put(opinfo);
- ksmbd_fd_put(work, fp);
-
rsp->StructureSize = cpu_to_le16(24);
rsp->OplockLevel = rsp_oplevel;
rsp->Reserved = 0;
@@ -8531,16 +8571,15 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work)
rsp->VolatileFid = volatile_id;
rsp->PersistentFid = persistent_id;
ret = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_oplock_break));
- if (!ret)
- return;
-
+ if (ret) {
err_out:
+ smb2_set_err_rsp(work);
+ }
+
opinfo->op_state = OPLOCK_STATE_NONE;
wake_up_interruptible_all(&opinfo->oplock_q);
-
opinfo_put(opinfo);
ksmbd_fd_put(work, fp);
- smb2_set_err_rsp(work);
}
static int check_lease_state(struct lease *lease, __le32 req_state)
@@ -8670,11 +8709,6 @@ static void smb21_lease_break_ack(struct ksmbd_work *work)
}
lease_state = lease->state;
- opinfo->op_state = OPLOCK_STATE_NONE;
- wake_up_interruptible_all(&opinfo->oplock_q);
- atomic_dec(&opinfo->breaking_cnt);
- wake_up_interruptible_all(&opinfo->oplock_brk);
- opinfo_put(opinfo);
rsp->StructureSize = cpu_to_le16(36);
rsp->Reserved = 0;
@@ -8683,16 +8717,16 @@ static void smb21_lease_break_ack(struct ksmbd_work *work)
rsp->LeaseState = lease_state;
rsp->LeaseDuration = 0;
ret = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_lease_ack));
- if (!ret)
- return;
-
+ if (ret) {
err_out:
+ smb2_set_err_rsp(work);
+ }
+
+ opinfo->op_state = OPLOCK_STATE_NONE;
wake_up_interruptible_all(&opinfo->oplock_q);
atomic_dec(&opinfo->breaking_cnt);
wake_up_interruptible_all(&opinfo->oplock_brk);
-
opinfo_put(opinfo);
- smb2_set_err_rsp(work);
}
/**
diff --git a/fs/smb/server/smb2pdu.h b/fs/smb/server/smb2pdu.h
index 17a0b18a8406..16ae8a10490b 100644
--- a/fs/smb/server/smb2pdu.h
+++ b/fs/smb/server/smb2pdu.h
@@ -63,6 +63,9 @@ struct preauth_integrity_info {
#define SMB2_SESSION_TIMEOUT (10 * HZ)
+/* Apple Defined Contexts */
+#define SMB2_CREATE_AAPL "AAPL"
+
struct create_durable_req_v2 {
struct create_context_hdr ccontext;
__u8 Name[8];
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index 4998df04ab95..c6cbe0d56e32 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -159,7 +159,8 @@ struct smb_direct_transport {
};
#define KSMBD_TRANS(t) ((struct ksmbd_transport *)&((t)->transport))
-
+#define SMBD_TRANS(t) ((struct smb_direct_transport *)container_of(t, \
+ struct smb_direct_transport, transport))
enum {
SMB_DIRECT_MSG_NEGOTIATE_REQ = 0,
SMB_DIRECT_MSG_DATA_TRANSFER
@@ -410,6 +411,11 @@ err:
return NULL;
}
+static void smb_direct_free_transport(struct ksmbd_transport *kt)
+{
+ kfree(SMBD_TRANS(kt));
+}
+
static void free_transport(struct smb_direct_transport *t)
{
struct smb_direct_recvmsg *recvmsg;
@@ -427,7 +433,8 @@ static void free_transport(struct smb_direct_transport *t)
if (t->qp) {
ib_drain_qp(t->qp);
ib_mr_pool_destroy(t->qp, &t->qp->rdma_mrs);
- ib_destroy_qp(t->qp);
+ t->qp = NULL;
+ rdma_destroy_qp(t->cm_id);
}
ksmbd_debug(RDMA, "drain the reassembly queue\n");
@@ -455,7 +462,6 @@ static void free_transport(struct smb_direct_transport *t)
smb_direct_destroy_pools(t);
ksmbd_conn_free(KSMBD_TRANS(t)->conn);
- kfree(t);
}
static struct smb_direct_sendmsg
@@ -1935,8 +1941,8 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t,
return 0;
err:
if (t->qp) {
- ib_destroy_qp(t->qp);
t->qp = NULL;
+ rdma_destroy_qp(t->cm_id);
}
if (t->recv_cq) {
ib_destroy_cq(t->recv_cq);
@@ -2281,4 +2287,5 @@ static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops = {
.read = smb_direct_read,
.rdma_read = smb_direct_rdma_read,
.rdma_write = smb_direct_rdma_write,
+ .free_transport = smb_direct_free_transport,
};
diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c
index abedf510899a..f8c772a7cb43 100644
--- a/fs/smb/server/transport_tcp.c
+++ b/fs/smb/server/transport_tcp.c
@@ -58,12 +58,10 @@ static inline void ksmbd_tcp_reuseaddr(struct socket *sock)
static inline void ksmbd_tcp_rcv_timeout(struct socket *sock, s64 secs)
{
- lock_sock(sock->sk);
if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
- sock->sk->sk_rcvtimeo = secs * HZ;
+ WRITE_ONCE(sock->sk->sk_rcvtimeo, secs * HZ);
else
- sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
- release_sock(sock->sk);
+ WRITE_ONCE(sock->sk->sk_rcvtimeo, MAX_SCHEDULE_TIMEOUT);
}
static inline void ksmbd_tcp_snd_timeout(struct socket *sock, s64 secs)
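Dropping lock_sock() is safe because sk_rcvtimeo is a single word; writers publish it with WRITE_ONCE() and lockless readers must sample it with READ_ONCE(). A hedged illustration of the pairing (helper names assumed):

#include <net/sock.h>

static void set_rcv_timeout(struct sock *sk, long timeo)
{
	WRITE_ONCE(sk->sk_rcvtimeo, timeo);	/* tear-free store, no lock_sock() */
}

static long get_rcv_timeout(const struct sock *sk)
{
	return READ_ONCE(sk->sk_rcvtimeo);	/* matching tear-free load */
}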
@@ -93,7 +91,7 @@ static struct tcp_transport *alloc_transport(struct socket *client_sk)
return t;
}
-void ksmbd_free_transport(struct ksmbd_transport *kt)
+static void ksmbd_tcp_free_transport(struct ksmbd_transport *kt)
{
struct tcp_transport *t = TCP_TRANS(kt);
@@ -656,4 +654,5 @@ static const struct ksmbd_transport_ops ksmbd_tcp_transport_ops = {
.read = ksmbd_tcp_read,
.writev = ksmbd_tcp_writev,
.disconnect = ksmbd_tcp_disconnect,
+ .free_transport = ksmbd_tcp_free_transport,
};
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index baf0d3031a44..04539037108c 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -4,6 +4,7 @@
* Copyright (C) 2018 Samsung Electronics Co., Ltd.
*/
+#include <crypto/sha2.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/filelock.h>
@@ -65,13 +66,12 @@ int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child)
return 0;
}
-static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf,
- char *pathname, unsigned int flags,
- struct path *parent_path,
- struct path *path)
+static int ksmbd_vfs_path_lookup(struct ksmbd_share_config *share_conf,
+ char *pathname, unsigned int flags,
+ struct path *path, bool do_lock)
{
struct qstr last;
- struct filename *filename;
+ struct filename *filename __free(putname) = NULL;
struct path *root_share_path = &share_conf->vfs_path;
int err, type;
struct dentry *d;
@@ -88,51 +88,57 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf,
return PTR_ERR(filename);
err = vfs_path_parent_lookup(filename, flags,
- parent_path, &last, &type,
+ path, &last, &type,
root_share_path);
- if (err) {
- putname(filename);
+ if (err)
return err;
- }
if (unlikely(type != LAST_NORM)) {
- path_put(parent_path);
- putname(filename);
+ path_put(path);
return -ENOENT;
}
- err = mnt_want_write(parent_path->mnt);
- if (err) {
- path_put(parent_path);
- putname(filename);
+ if (do_lock) {
+ err = mnt_want_write(path->mnt);
+ if (err) {
+ path_put(path);
+ return -ENOENT;
+ }
+
+ inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
+ d = lookup_one_qstr_excl(&last, path->dentry, 0);
+
+ if (!IS_ERR(d)) {
+ dput(path->dentry);
+ path->dentry = d;
+ return 0;
+ }
+ inode_unlock(path->dentry->d_inode);
+ mnt_drop_write(path->mnt);
+ path_put(path);
return -ENOENT;
}
- inode_lock_nested(parent_path->dentry->d_inode, I_MUTEX_PARENT);
- d = lookup_one_qstr_excl(&last, parent_path->dentry, 0);
- if (IS_ERR(d))
- goto err_out;
-
+ d = lookup_noperm_unlocked(&last, path->dentry);
+ if (!IS_ERR(d) && d_is_negative(d)) {
+ dput(d);
+ d = ERR_PTR(-ENOENT);
+ }
+ if (IS_ERR(d)) {
+ path_put(path);
+ return -ENOENT;
+ }
+ dput(path->dentry);
path->dentry = d;
- path->mnt = mntget(parent_path->mnt);
if (test_share_config_flag(share_conf, KSMBD_SHARE_FLAG_CROSSMNT)) {
err = follow_down(path, 0);
if (err < 0) {
path_put(path);
- goto err_out;
+ return -ENOENT;
}
}
-
- putname(filename);
return 0;
-
-err_out:
- inode_unlock(d_inode(parent_path->dentry));
- mnt_drop_write(parent_path->mnt);
- path_put(parent_path);
- putname(filename);
- return -ENOENT;
}
void ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap,
@@ -292,6 +298,7 @@ static int ksmbd_vfs_stream_read(struct ksmbd_file *fp, char *buf, loff_t *pos,
if (v_len - *pos < count)
count = v_len - *pos;
+ fp->stream.pos = v_len;
memcpy(buf, &stream_buf[*pos], count);
@@ -455,8 +462,8 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos,
true);
if (err < 0)
goto out;
-
- fp->filp->f_pos = *pos;
+ else
+ fp->stream.pos = size;
err = 0;
out:
kvfree(stream_buf);
@@ -546,7 +553,8 @@ int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat)
{
int err;
- err = vfs_getattr(path, stat, STATX_BTIME, AT_STATX_SYNC_AS_STAT);
+ err = vfs_getattr(path, stat, STATX_BASIC_STATS | STATX_BTIME,
+ AT_STATX_SYNC_AS_STAT);
if (err)
pr_err("getattr failed, err %d\n", err);
return err;
@@ -763,10 +771,10 @@ retry:
}
rd.old_mnt_idmap = mnt_idmap(old_path->mnt),
- rd.old_dir = d_inode(old_parent),
+ rd.old_parent = old_parent,
rd.old_dentry = old_child,
rd.new_mnt_idmap = mnt_idmap(new_path.mnt),
- rd.new_dir = new_path.dentry->d_inode,
+ rd.new_parent = new_path.dentry,
rd.new_dentry = new_dentry,
rd.flags = flags,
rd.delegated_inode = NULL,
@@ -1196,103 +1204,114 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name,
return ret;
}
-/**
- * ksmbd_vfs_kern_path_locked() - lookup a file and get path info
- * @work: work
- * @name: file path that is relative to share
- * @flags: lookup flags
- * @parent_path: if lookup succeed, return parent_path info
- * @path: if lookup succeed, return path info
- * @caseless: caseless filename lookup
- *
- * Return: 0 on success, otherwise error
- */
-int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
- unsigned int flags, struct path *parent_path,
- struct path *path, bool caseless)
+static
+int __ksmbd_vfs_kern_path(struct ksmbd_work *work, char *filepath,
+ unsigned int flags,
+ struct path *path, bool caseless, bool do_lock)
{
struct ksmbd_share_config *share_conf = work->tcon->share_conf;
+ struct path parent_path;
+ size_t path_len, remain_len;
int err;
- err = ksmbd_vfs_path_lookup_locked(share_conf, name, flags, parent_path,
- path);
- if (!err)
- return 0;
-
- if (caseless) {
- char *filepath;
- size_t path_len, remain_len;
-
- filepath = name;
- path_len = strlen(filepath);
- remain_len = path_len;
-
- *parent_path = share_conf->vfs_path;
- path_get(parent_path);
-
- while (d_can_lookup(parent_path->dentry)) {
- char *filename = filepath + path_len - remain_len;
- char *next = strchrnul(filename, '/');
- size_t filename_len = next - filename;
- bool is_last = !next[0];
+retry:
+ err = ksmbd_vfs_path_lookup(share_conf, filepath, flags, path, do_lock);
+ if (!err || !caseless)
+ return err;
- if (filename_len == 0)
- break;
+ path_len = strlen(filepath);
+ remain_len = path_len;
- err = ksmbd_vfs_lookup_in_dir(parent_path, filename,
- filename_len,
- work->conn->um);
- if (err)
- goto out2;
+ parent_path = share_conf->vfs_path;
+ path_get(&parent_path);
- next[0] = '\0';
+ while (d_can_lookup(parent_path.dentry)) {
+ char *filename = filepath + path_len - remain_len;
+ char *next = strchrnul(filename, '/');
+ size_t filename_len = next - filename;
+ bool is_last = !next[0];
- err = vfs_path_lookup(share_conf->vfs_path.dentry,
- share_conf->vfs_path.mnt,
- filepath,
- flags,
- path);
- if (!is_last)
- next[0] = '/';
- if (err)
- goto out2;
- else if (is_last)
- goto out1;
- path_put(parent_path);
- *parent_path = *path;
+ if (filename_len == 0)
+ break;
- remain_len -= filename_len + 1;
+ err = ksmbd_vfs_lookup_in_dir(&parent_path, filename,
+ filename_len,
+ work->conn->um);
+ path_put(&parent_path);
+ if (err)
+ goto out;
+ if (is_last) {
+ caseless = false;
+ goto retry;
}
+ next[0] = '\0';
+
+ err = vfs_path_lookup(share_conf->vfs_path.dentry,
+ share_conf->vfs_path.mnt,
+ filepath,
+ flags,
+ &parent_path);
+ next[0] = '/';
+ if (err)
+ goto out;
- err = -EINVAL;
-out2:
- path_put(parent_path);
+ remain_len -= filename_len + 1;
}
-out1:
- if (!err) {
- err = mnt_want_write(parent_path->mnt);
- if (err) {
- path_put(path);
- path_put(parent_path);
- return err;
- }
-
- err = ksmbd_vfs_lock_parent(parent_path->dentry, path->dentry);
- if (err) {
- path_put(path);
- path_put(parent_path);
- }
- }
+ err = -EINVAL;
+ path_put(&parent_path);
+out:
return err;
}
-void ksmbd_vfs_kern_path_unlock(struct path *parent_path, struct path *path)
+/**
+ * ksmbd_vfs_kern_path() - lookup a file and get path info
+ * @work: work
+ * @filepath: file path that is relative to share
+ * @flags: lookup flags
+ * @path: if lookup succeed, return path info
+ * @caseless: caseless filename lookup
+ *
+ * Perform the lookup, possibly crossing over any mount point.
+ * On return no locks will be held and write-access to filesystem
+ * won't have been checked.
+ * Return: 0 if file was found, otherwise error
+ */
+int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *filepath,
+ unsigned int flags,
+ struct path *path, bool caseless)
+{
+ return __ksmbd_vfs_kern_path(work, filepath, flags, path,
+ caseless, false);
+}
+
+/**
+ * ksmbd_vfs_kern_path_locked() - lookup a file and get path info
+ * @work: work
+ * @filepath: file path that is relative to share
+ * @flags: lookup flags
+ * @path: if lookup succeed, return path info
+ * @caseless: caseless filename lookup
+ *
+ * Perform the lookup, but don't cross over any mount point.
+ * On return the parent of path->dentry will be locked and write-access to
+ * filesystem will have been gained.
+ * Return: 0 if file was found, otherwise error
+ */
+int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *filepath,
+ unsigned int flags,
+ struct path *path, bool caseless)
+{
+ return __ksmbd_vfs_kern_path(work, filepath, flags, path,
+ caseless, true);
+}
+
+void ksmbd_vfs_kern_path_unlock(struct path *path)
{
- inode_unlock(d_inode(parent_path->dentry));
- mnt_drop_write(parent_path->mnt);
+ /* While the lock is still held, ->d_parent is safe */
+ inode_unlock(d_inode(path->dentry->d_parent));
+ mnt_drop_write(path->mnt);
path_put(path);
- path_put(parent_path);
}
struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work,
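A hedged usage sketch contrasting the two flavours: the plain lookup is paired with path_put(), the locked one with ksmbd_vfs_kern_path_unlock(), which also drops the parent lock and the filesystem write access gained during lookup (demo function, error handling trimmed):

static int demo_lookup(struct ksmbd_work *work, char *name)
{
	struct path path;
	int err;

	err = ksmbd_vfs_kern_path(work, name, LOOKUP_NO_SYMLINKS, &path, 1);
	if (!err)
		path_put(&path);	/* plain reference: just drop it */

	err = ksmbd_vfs_kern_path_locked(work, name, LOOKUP_NO_SYMLINKS,
					 &path, 0);
	if (!err)
		ksmbd_vfs_kern_path_unlock(&path); /* unlocks parent, drops write access */
	return err;
}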
@@ -1476,11 +1495,7 @@ int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn,
acl.sd_buf = (char *)pntsd;
acl.sd_size = len;
- rc = ksmbd_gen_sd_hash(conn, acl.sd_buf, acl.sd_size, acl.hash);
- if (rc) {
- pr_err("failed to generate hash for ndr acl\n");
- return rc;
- }
+ sha256(acl.sd_buf, acl.sd_size, acl.hash);
smb_acl = ksmbd_vfs_make_xattr_posix_acl(idmap, inode,
ACL_TYPE_ACCESS);
@@ -1495,12 +1510,7 @@ int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn,
goto out;
}
- rc = ksmbd_gen_sd_hash(conn, acl_ndr.data, acl_ndr.offset,
- acl.posix_acl_hash);
- if (rc) {
- pr_err("failed to generate hash for ndr acl\n");
- goto out;
- }
+ sha256(acl_ndr.data, acl_ndr.offset, acl.posix_acl_hash);
rc = ndr_encode_v4_ntacl(&sd_ndr, &acl);
if (rc) {
@@ -1557,11 +1567,7 @@ int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn,
goto out_free;
}
- rc = ksmbd_gen_sd_hash(conn, acl_ndr.data, acl_ndr.offset, cmp_hash);
- if (rc) {
- pr_err("failed to generate hash for ndr acl\n");
- goto out_free;
- }
+ sha256(acl_ndr.data, acl_ndr.offset, cmp_hash);
if (memcmp(cmp_hash, acl.posix_acl_hash, XATTR_SD_HASH_SIZE)) {
pr_err("hash value diff\n");
diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h
index 2893f59803a6..d47472f3e30b 100644
--- a/fs/smb/server/vfs.h
+++ b/fs/smb/server/vfs.h
@@ -117,10 +117,13 @@ int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name,
int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap,
const struct path *path, char *attr_name,
bool get_write);
+int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name,
+ unsigned int flags,
+ struct path *path, bool caseless);
int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
- unsigned int flags, struct path *parent_path,
+ unsigned int flags,
struct path *path, bool caseless);
-void ksmbd_vfs_kern_path_unlock(struct path *parent_path, struct path *path);
+void ksmbd_vfs_kern_path_unlock(struct path *path);
struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work,
const char *name,
unsigned int flags,
diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h
index 5bbb179736c2..0708155b5caf 100644
--- a/fs/smb/server/vfs_cache.h
+++ b/fs/smb/server/vfs_cache.h
@@ -44,6 +44,7 @@ struct ksmbd_lock {
struct stream {
char *name;
ssize_t size;
+ loff_t pos;
};
struct ksmbd_inode {
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index b1091e70434a..a9602aae21ef 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -149,6 +149,27 @@ config SQUASHFS_XATTR
If unsure, say N.
+config SQUASHFS_COMP_CACHE_FULL
+ bool "Enable full caching of compressed blocks"
+ depends on SQUASHFS
+ default n
+ help
+ This option enables caching of all compressed blocks. Without caching,
+ repeated reads of the same files trigger excessive disk I/O, significantly
+ reducing performance in workloads like fio-based benchmarks.
+
+ For example, fio tests (iodepth=1, numjobs=1, ioengine=psync) show:
+ With caching: IOPS=2223, BW=278MiB/s (291MB/s)
+ Without caching: IOPS=815, BW=102MiB/s (107MB/s)
+
+ Enabling this option restores performance to pre-regression levels by
+ caching all compressed blocks in the page cache, reducing disk I/O for
+ repeated reads. However, this increases memory usage, which may be a
+ concern in memory-constrained environments.
+
+ Enable this option if your workload involves frequent repeated reads and
+ memory usage is not a limiting factor. If unsure, say N.
+
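If the memory trade-off is acceptable, enabling it is a one-line .config change (values shown are an example):

CONFIG_SQUASHFS=y
CONFIG_SQUASHFS_COMP_CACHE_FULL=y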
config SQUASHFS_ZLIB
bool "Include support for ZLIB compressed file systems"
depends on SQUASHFS
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 2dc730800f44..3061043e915c 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -88,6 +88,10 @@ static int squashfs_bio_read_cached(struct bio *fullbio,
struct bio_vec *bv;
int idx = 0;
int err = 0;
+#ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL
+ struct page **cache_pages = kmalloc_array(page_count,
+ sizeof(void *), GFP_KERNEL | __GFP_ZERO);
+#endif
bio_for_each_segment_all(bv, fullbio, iter_all) {
struct page *page = bv->bv_page;
@@ -110,6 +114,11 @@ static int squashfs_bio_read_cached(struct bio *fullbio,
head_to_cache = page;
else if (idx == page_count - 1 && index + length != read_end)
tail_to_cache = page;
+#ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL
+ /* Cache all pages in the BIO for repeated reads */
+ else if (cache_pages)
+ cache_pages[idx] = page;
+#endif
if (!bio || idx != end_idx) {
struct bio *new = bio_alloc_clone(bdev, fullbio,
@@ -163,6 +172,25 @@ static int squashfs_bio_read_cached(struct bio *fullbio,
}
}
+#ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL
+ if (!cache_pages)
+ goto out;
+
+ for (idx = 0; idx < page_count; idx++) {
+ if (!cache_pages[idx])
+ continue;
+ int ret = add_to_page_cache_lru(cache_pages[idx], cache_mapping,
+ (read_start >> PAGE_SHIFT) + idx,
+ GFP_NOIO);
+
+ if (!ret) {
+ SetPageUptodate(cache_pages[idx]);
+ unlock_page(cache_pages[idx]);
+ }
+ }
+ kfree(cache_pages);
+out:
+#endif
return 0;
}
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 67c55fe32ce8..992ea0e37257 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -202,6 +202,11 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
msblk->panic_on_errors = (opts->errors == Opt_errors_panic);
msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE);
+ if (!msblk->devblksize) {
+ errorf(fc, "squashfs: unable to set blocksize\n");
+ return -EINVAL;
+ }
+
msblk->devblksize_log2 = ffz(~msblk->devblksize);
mutex_init(&msblk->meta_index_mutex);
diff --git a/fs/stack.c b/fs/stack.c
index f18920119944..d8c782e064e3 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -3,7 +3,7 @@
#include <linux/fs.h>
#include <linux/fs_stack.h>
-/* does _NOT_ require i_mutex to be held.
+/* does _NOT_ require i_rwsem to be held.
*
* This function cannot be inlined since i_size_{read,write} is rather
* heavy-weight on 32-bit systems
@@ -41,7 +41,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src)
* If CONFIG_SMP or CONFIG_PREEMPTION on 32-bit, it's vital for
* fsstack_copy_inode_size() to hold some lock around
* i_size_write(), otherwise i_size_read() may spin forever (see
- * include/linux/fs.h). We don't necessarily hold i_mutex when this
+ * include/linux/fs.h). We don't necessarily hold i_rwsem when this
* is called, so take i_lock for that case.
*
* And if on 32-bit, continue our effort to keep the two halves of
diff --git a/fs/super.c b/fs/super.c
index 21799e213fd7..7f876f32343a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -964,8 +964,10 @@ void iterate_supers_type(struct file_system_type *type,
spin_unlock(&sb_lock);
locked = super_lock_shared(sb);
- if (locked)
+ if (locked) {
f(sb, arg);
+ super_unlock_shared(sb);
+ }
spin_lock(&sb_lock);
if (p)
@@ -1457,6 +1459,17 @@ static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
if (!sb)
return;
+ if (sb->s_op->remove_bdev) {
+ int ret;
+
+ ret = sb->s_op->remove_bdev(sb, bdev);
+ if (!ret) {
+ super_unlock_shared(sb);
+ return;
+ }
+ /* Fallback to shutdown. */
+ }
+
if (!surprise)
sync_filesystem(sb);
shrink_dcache_sb(sb);
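A hedged sketch of what a multi-device filesystem's ->remove_bdev() might look like under this hook: return 0 when the device loss is absorbed, non-zero to take the shutdown fallback above. The callback signature follows the caller; every foofs_* name is invented:

static int foofs_remove_bdev(struct super_block *sb, struct block_device *bdev)
{
	struct foofs_info *fsi = sb->s_fs_info;	/* hypothetical private info */

	if (!foofs_can_degrade(fsi, bdev))	/* hypothetical redundancy check */
		return -ENODEV;			/* caller falls back to shutdown */

	foofs_mark_device_gone(fsi, bdev);	/* hypothetical */
	return 0;				/* loss absorbed, super stays up */
}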
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index c3d3b079aedd..1ca143d2f22a 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -83,7 +83,7 @@ static int sysfs_kf_seq_show(struct seq_file *sf, void *v)
static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf,
size_t count, loff_t pos)
{
- struct bin_attribute *battr = of->kn->priv;
+ const struct bin_attribute *battr = of->kn->priv;
struct kobject *kobj = sysfs_file_kobj(of->kn);
loff_t size = file_inode(of->file)->i_size;
@@ -149,7 +149,7 @@ static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf,
static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf,
size_t count, loff_t pos)
{
- struct bin_attribute *battr = of->kn->priv;
+ const struct bin_attribute *battr = of->kn->priv;
struct kobject *kobj = sysfs_file_kobj(of->kn);
loff_t size = file_inode(of->file)->i_size;
@@ -173,7 +173,7 @@ static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf,
static int sysfs_kf_bin_mmap(struct kernfs_open_file *of,
struct vm_area_struct *vma)
{
- struct bin_attribute *battr = of->kn->priv;
+ const struct bin_attribute *battr = of->kn->priv;
struct kobject *kobj = sysfs_file_kobj(of->kn);
return battr->mmap(of->file, kobj, battr, vma);
@@ -182,7 +182,7 @@ static int sysfs_kf_bin_mmap(struct kernfs_open_file *of,
static loff_t sysfs_kf_bin_llseek(struct kernfs_open_file *of, loff_t offset,
int whence)
{
- struct bin_attribute *battr = of->kn->priv;
+ const struct bin_attribute *battr = of->kn->priv;
struct kobject *kobj = sysfs_file_kobj(of->kn);
if (battr->llseek)
@@ -193,7 +193,7 @@ static loff_t sysfs_kf_bin_llseek(struct kernfs_open_file *of, loff_t offset,
static int sysfs_kf_bin_open(struct kernfs_open_file *of)
{
- struct bin_attribute *battr = of->kn->priv;
+ const struct bin_attribute *battr = of->kn->priv;
if (battr->f_mapping)
of->file->f_mapping = battr->f_mapping();
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index a3fd3cc591bd..0c023941a316 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -465,9 +465,20 @@ static int tracefs_d_revalidate(struct inode *inode, const struct qstr *name,
return !(ei && ei->is_freed);
}
+static int tracefs_d_delete(const struct dentry *dentry)
+{
+ /*
+ * We want to keep eventfs dentries around but not tracefs
+ * ones. eventfs dentries have content in d_fsdata.
+ * Use d_fsdata to determine whether it is an eventfs dentry.
+ */
+ return dentry->d_fsdata == NULL;
+}
+
static const struct dentry_operations tracefs_dentry_operations = {
.d_revalidate = tracefs_d_revalidate,
.d_release = tracefs_d_release,
+ .d_delete = tracefs_d_delete,
};
static int tracefs_fill_super(struct super_block *sb, struct fs_context *fc)
@@ -480,7 +491,7 @@ static int tracefs_fill_super(struct super_block *sb, struct fs_context *fc)
return err;
sb->s_op = &tracefs_super_operations;
- sb->s_d_op = &tracefs_dentry_operations;
+ set_default_d_op(sb, &tracefs_dentry_operations);
return 0;
}
@@ -551,20 +562,9 @@ struct dentry *tracefs_start_creating(const char *name, struct dentry *parent)
if (!parent)
parent = tracefs_mount->mnt_root;
- inode_lock(d_inode(parent));
- if (unlikely(IS_DEADDIR(d_inode(parent))))
- dentry = ERR_PTR(-ENOENT);
- else
- dentry = lookup_noperm(&QSTR(name), parent);
- if (!IS_ERR(dentry) && d_inode(dentry)) {
- dput(dentry);
- dentry = ERR_PTR(-EEXIST);
- }
-
- if (IS_ERR(dentry)) {
- inode_unlock(d_inode(parent));
+ dentry = simple_start_creating(parent, name);
+ if (IS_ERR(dentry))
simple_release_fs(&tracefs_mount, &tracefs_mount_count);
- }
return dentry;
}
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index 921f9033d0d2..fb5ac358077b 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -51,7 +51,7 @@ int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn,
memset(p + in_len, 0, pad_len - in_len);
err = fscrypt_encrypt_block_inplace(inode, virt_to_page(p), pad_len,
- offset_in_page(p), block, GFP_NOFS);
+ offset_in_page(p), block);
if (err) {
ubifs_err(c, "fscrypt_encrypt_block_inplace() failed: %d", err);
return err;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index bf311c38d9a8..e75a6cec67be 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -404,7 +404,8 @@ static int allocate_budget(struct ubifs_info *c, struct folio *folio,
* there is a plenty of flash space and the budget will be acquired quickly,
* without forcing write-back. The slow path does not make this assumption.
*/
-static int ubifs_write_begin(struct file *file, struct address_space *mapping,
+static int ubifs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
@@ -514,8 +515,9 @@ static void cancel_budget(struct ubifs_info *c, struct folio *folio,
}
}
-static int ubifs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
+static int ubifs_write_end(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos,
+ unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
@@ -977,8 +979,7 @@ static int do_writepage(struct folio *folio, size_t len)
* on the page lock and it would not write the truncated inode node to the
* journal before we have finished.
*/
-static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc,
- void *data)
+static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc)
{
struct inode *inode = folio->mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -1050,7 +1051,12 @@ out_unlock:
static int ubifs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- return write_cache_pages(mapping, wbc, ubifs_writepage, NULL);
+ struct folio *folio = NULL;
+ int error;
+
+ while ((folio = writeback_iter(mapping, wbc, folio, &error)))
+ error = ubifs_writepage(folio, wbc);
+ return error;
}
/**
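The same conversion applies wherever a write_cache_pages() callback is retired; a minimal generic sketch, assuming a hypothetical per-folio helper my_writepage():

#include <linux/pagemap.h>
#include <linux/writeback.h>

static int my_writepage(struct folio *folio, struct writeback_control *wbc); /* hypothetical */

/* Sketch: writeback_iter() hands back locked dirty folios one at a time. */
static int my_writepages(struct address_space *mapping,
			 struct writeback_control *wbc)
{
	struct folio *folio = NULL;
	int error = 0;

	while ((folio = writeback_iter(mapping, wbc, folio, &error)))
		error = my_writepage(folio, wbc);
	return error;
}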
@@ -1579,17 +1585,17 @@ static const struct vm_operations_struct ubifs_file_vm_ops = {
.page_mkwrite = ubifs_vm_page_mkwrite,
};
-static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int ubifs_file_mmap_prepare(struct vm_area_desc *desc)
{
int err;
- err = generic_file_mmap(file, vma);
+ err = generic_file_mmap_prepare(desc);
if (err)
return err;
- vma->vm_ops = &ubifs_file_vm_ops;
+ desc->vm_ops = &ubifs_file_vm_ops;
if (IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT))
- file_accessed(file);
+ file_accessed(desc->file);
return 0;
}
@@ -1652,7 +1658,7 @@ const struct file_operations ubifs_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = ubifs_write_iter,
- .mmap = ubifs_file_mmap,
+ .mmap_prepare = ubifs_file_mmap_prepare,
.fsync = ubifs_fsync,
.unlocked_ioctl = ubifs_ioctl,
.splice_read = filemap_splice_read,
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 2c99349cf537..79536b2e3d7a 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -130,7 +130,7 @@ static int setflags(struct inode *inode, int flags)
return err;
}
-int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int ubifs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
int flags = ubifs2ioctl(ubifs_inode(inode)->flags);
@@ -145,7 +145,7 @@ int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
int ubifs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
int flags = fa->flags;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index ee954e64ce7f..e28ab4395e5c 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -985,7 +985,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode)
dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink);
if (kill_xattrs && ui->xattr_cnt > ubifs_xattr_max_cnt(c)) {
- ubifs_err(c, "Cannot delete inode, it has too much xattrs!");
+ ubifs_err(c, "Cannot delete inode, it has too many xattrs!");
err = -EPERM;
ubifs_ro_mode(c, err);
return err;
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 256dbaeeb0de..5db45c9e26ee 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -2073,9 +2073,9 @@ int ubifs_recover_size(struct ubifs_info *c, bool in_place);
void ubifs_destroy_size_tree(struct ubifs_info *c);
/* ioctl.c */
-int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int ubifs_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
int ubifs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
void ubifs_set_inode_flags(struct inode *inode);
#ifdef CONFIG_COMPAT
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 4386dd845e40..f24aa98e6869 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -181,19 +181,23 @@ static void udf_write_failed(struct address_space *mapping, loff_t to)
}
}
-static int udf_adinicb_writepage(struct folio *folio,
- struct writeback_control *wbc, void *data)
+static int udf_adinicb_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- struct inode *inode = folio->mapping->host;
+ struct inode *inode = mapping->host;
struct udf_inode_info *iinfo = UDF_I(inode);
+ struct folio *folio = NULL;
+ int error = 0;
+
+ while ((folio = writeback_iter(mapping, wbc, folio, &error))) {
+ BUG_ON(!folio_test_locked(folio));
+ BUG_ON(folio->index != 0);
+ memcpy_from_file_folio(iinfo->i_data + iinfo->i_lenEAttr, folio,
+ 0, i_size_read(inode));
+ folio_unlock(folio);
+ }
- BUG_ON(!folio_test_locked(folio));
- BUG_ON(folio->index != 0);
- memcpy_from_file_folio(iinfo->i_data + iinfo->i_lenEAttr, folio, 0,
- i_size_read(inode));
- folio_unlock(folio);
mark_inode_dirty(inode);
-
return 0;
}
@@ -203,9 +207,9 @@ static int udf_writepages(struct address_space *mapping,
struct inode *inode = mapping->host;
struct udf_inode_info *iinfo = UDF_I(inode);
- if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB)
- return mpage_writepages(mapping, wbc, udf_get_block_wb);
- return write_cache_pages(mapping, wbc, udf_adinicb_writepage, NULL);
+ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+ return udf_adinicb_writepages(mapping, wbc);
+ return mpage_writepages(mapping, wbc, udf_get_block_wb);
}
static void udf_adinicb_read_folio(struct folio *folio)
@@ -244,10 +248,12 @@ static void udf_readahead(struct readahead_control *rac)
mpage_readahead(rac, udf_get_block);
}
-static int udf_write_begin(struct file *file, struct address_space *mapping,
+static int udf_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
+ struct file *file = iocb->ki_filp;
struct udf_inode_info *iinfo = UDF_I(file_inode(file));
struct folio *folio;
int ret;
@@ -271,15 +277,16 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
return 0;
}
-static int udf_write_end(struct file *file, struct address_space *mapping,
+static int udf_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
{
- struct inode *inode = file_inode(file);
+ struct inode *inode = file_inode(iocb->ki_filp);
loff_t last_pos;
if (UDF_I(inode)->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB)
- return generic_write_end(file, mapping, pos, len, copied, folio,
+ return generic_write_end(iocb, mapping, pos, len, copied, folio,
fsdata);
last_pos = pos + copied;
if (last_pos > inode->i_size)
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 1c8a736b3309..b2f168b0a0d1 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1440,7 +1440,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
struct genericPartitionMap *gpm;
uint16_t ident;
struct buffer_head *bh;
- unsigned int table_len;
+ unsigned int table_len, part_map_count;
int ret;
bh = udf_read_tagged(sb, block, block, &ident);
@@ -1461,7 +1461,16 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
"logical volume");
if (ret)
goto out_bh;
- ret = udf_sb_alloc_partition_maps(sb, le32_to_cpu(lvd->numPartitionMaps));
+
+ part_map_count = le32_to_cpu(lvd->numPartitionMaps);
+ if (part_map_count > table_len / sizeof(struct genericPartitionMap1)) {
+ udf_err(sb, "error loading logical volume descriptor: "
+ "Too many partition maps (%u > %u)\n", part_map_count,
+ table_len / (unsigned)sizeof(struct genericPartitionMap1));
+ ret = -EIO;
+ goto out_bh;
+ }
+ ret = udf_sb_alloc_partition_maps(sb, part_map_count);
if (ret)
goto out_bh;
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 88d0062cfdb9..0388a1bae326 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -48,7 +48,7 @@ static void ufs_commit_chunk(struct folio *folio, loff_t pos, unsigned len)
struct inode *dir = mapping->host;
inode_inc_iversion(dir);
- block_write_end(NULL, mapping, pos, len, len, folio, NULL);
+ block_write_end(pos, len, len, folio);
if (pos+len > dir->i_size) {
i_size_write(dir, pos+len);
mark_inode_dirty(dir);
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 487ad1fc2de6..c2a391c17df7 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -38,7 +38,7 @@ const struct file_operations ufs_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.open = generic_file_open,
.fsync = generic_file_fsync,
.splice_read = filemap_splice_read,
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 7dc38fdef2ea..8361c00e8fa6 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -474,9 +474,10 @@ static void ufs_write_failed(struct address_space *mapping, loff_t to)
}
}
-static int ufs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int ufs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
int ret;
@@ -487,13 +488,14 @@ static int ufs_write_begin(struct file *file, struct address_space *mapping,
return ret;
}
-static int ufs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+static int ufs_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
int ret;
- ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
if (ret < len)
ufs_write_failed(mapping, pos + len);
return ret;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index eea718ac66b4..6e4585169f94 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -397,7 +397,7 @@ static int ufs_parse_param(struct fs_context *fc, struct fs_parameter *param)
pr_err("ufstype can't be changed during remount\n");
return -EINVAL;
}
- if (!ctx->flavour) {
+ if (ctx->flavour) {
pr_err("conflicting ufstype options\n");
return -EINVAL;
}
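
The inverted test above is the actual fix: "conflicting ufstype options" should fire when a flavour has already been recorded, not when none has. A standalone sketch of the corrected first-wins option handling (names illustrative):

#include <errno.h>
#include <stdio.h>

static int flavour;	/* 0 means "not set yet" */

/* First ufstype-style option wins; a second one is a conflict. */
static int set_flavour(int value)
{
	if (flavour) {		/* the bug was testing !flavour here */
		fprintf(stderr, "conflicting options\n");
		return -EINVAL;
	}
	flavour = value;
	return 0;
}

int main(void)
{
	int first = set_flavour(1);	/* succeeds: 0 */
	int second = set_flavour(2);	/* conflicts: -EINVAL */

	printf("%d %d\n", first, second);
	return 0;
}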
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 22f4bf956ba1..54c6cc7fe9c6 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -165,14 +165,14 @@ static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
{
if (refcount_dec_and_test(&ctx->refcount)) {
- VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
- VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
- VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
- VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
- VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
- VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
- VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
- VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
+ VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_pending_wqh.lock));
+ VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_pending_wqh));
+ VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_wqh.lock));
+ VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_wqh));
+ VM_WARN_ON_ONCE(spin_is_locked(&ctx->event_wqh.lock));
+ VM_WARN_ON_ONCE(waitqueue_active(&ctx->event_wqh));
+ VM_WARN_ON_ONCE(spin_is_locked(&ctx->fd_wqh.lock));
+ VM_WARN_ON_ONCE(waitqueue_active(&ctx->fd_wqh));
mmdrop(ctx->mm);
kmem_cache_free(userfaultfd_ctx_cachep, ctx);
}
@@ -304,7 +304,7 @@ again:
goto out;
ret = false;
- if (!pmd_present(_pmd) || pmd_devmap(_pmd))
+ if (!pmd_present(_pmd))
goto out;
if (pmd_trans_huge(_pmd)) {
@@ -383,12 +383,12 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
if (!ctx)
goto out;
- BUG_ON(ctx->mm != mm);
+ VM_WARN_ON_ONCE(ctx->mm != mm);
/* Any unrecognized flag is a bug. */
- VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
+ VM_WARN_ON_ONCE(reason & ~__VM_UFFD_FLAGS);
/* 0 or > 1 flags set is a bug; we expect exactly 1. */
- VM_BUG_ON(!reason || (reason & (reason - 1)));
+ VM_WARN_ON_ONCE(!reason || (reason & (reason - 1)));
if (ctx->features & UFFD_FEATURE_SIGBUS)
goto out;
@@ -411,12 +411,11 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
* to be sure not to return SIGBUS erroneously on
* nowait invocations.
*/
- BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
+ VM_WARN_ON_ONCE(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
if (printk_ratelimit()) {
- printk(KERN_WARNING
- "FAULT_FLAG_ALLOW_RETRY missing %x\n",
- vmf->flags);
+ pr_warn("FAULT_FLAG_ALLOW_RETRY missing %x\n",
+ vmf->flags);
dump_stack();
}
#endif
@@ -602,7 +601,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
*/
out:
atomic_dec(&ctx->mmap_changing);
- VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0);
+ VM_WARN_ON_ONCE(atomic_read(&ctx->mmap_changing) < 0);
userfaultfd_ctx_put(ctx);
}
@@ -710,7 +709,7 @@ void dup_userfaultfd_fail(struct list_head *fcs)
struct userfaultfd_ctx *ctx = fctx->new;
atomic_dec(&octx->mmap_changing);
- VM_BUG_ON(atomic_read(&octx->mmap_changing) < 0);
+ VM_WARN_ON_ONCE(atomic_read(&octx->mmap_changing) < 0);
userfaultfd_ctx_put(octx);
userfaultfd_ctx_put(ctx);
@@ -751,11 +750,6 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
if (!ctx)
return;
- if (to & ~PAGE_MASK) {
- userfaultfd_ctx_put(ctx);
- return;
- }
-
msg_init(&ewq.msg);
ewq.msg.event = UFFD_EVENT_REMAP;
@@ -766,6 +760,16 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
userfaultfd_event_wait_completion(ctx, &ewq);
}
+void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *vm_ctx)
+{
+ struct userfaultfd_ctx *ctx = vm_ctx->ctx;
+
+ if (!ctx)
+ return;
+
+ userfaultfd_ctx_put(ctx);
+}
+
bool userfaultfd_remove(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
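
The new mremap_userfaultfd_fail() gives the mremap error path a way to drop the context reference that was taken when the remap event was prepared, since that event will now never be delivered. A toy model of the get/put pairing (refcounting reduced to a plain int for illustration; this is not the kernel's implementation):

#include <stdio.h>

static int refcount = 1;	/* the owner's reference */

static void ctx_get(void) { refcount++; }

static void ctx_put(void)
{
	if (--refcount == 0)
		puts("ctx freed");
}

/* Error path: the prepared event is never delivered, so the reference
 * taken for it must still be dropped. */
static void mremap_fail(void)
{
	ctx_put();
}

int main(void)
{
	ctx_get();	/* taken when the remap event was prepared */
	mremap_fail();	/* mremap() failed: balance that reference */
	ctx_put();	/* last user drops the context */
	return 0;
}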
@@ -1243,7 +1247,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
int ret;
struct uffdio_register uffdio_register;
struct uffdio_register __user *user_uffdio_register;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
bool found;
bool basic_ioctls;
unsigned long start, end;
@@ -1317,8 +1321,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
do {
cond_resched();
- BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
- !!(cur->vm_flags & __VM_UFFD_FLAGS));
+ VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^
+ !!(cur->vm_flags & __VM_UFFD_FLAGS));
/* check not compatible vmas */
ret = -EINVAL;
@@ -1372,7 +1376,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
found = true;
} for_each_vma_range(vmi, cur, end);
- BUG_ON(!found);
+ VM_WARN_ON_ONCE(!found);
ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end,
wp_async);
@@ -1464,8 +1468,16 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
do {
cond_resched();
- BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
- !!(cur->vm_flags & __VM_UFFD_FLAGS));
+ VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^
+ !!(cur->vm_flags & __VM_UFFD_FLAGS));
+
+ /*
+ * Prevent unregistering through a different userfaultfd than
+ * the one used for registration.
+ */
+ if (cur->vm_userfaultfd_ctx.ctx &&
+ cur->vm_userfaultfd_ctx.ctx != ctx)
+ goto out_unlock;
/*
* Check not compatible vmas, not strictly required
@@ -1479,7 +1491,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
found = true;
} for_each_vma_range(vmi, cur, end);
- BUG_ON(!found);
+ VM_WARN_ON_ONCE(!found);
vma_iter_set(&vmi, start);
prev = vma_prev(&vmi);
@@ -1490,16 +1502,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
for_each_vma_range(vmi, vma, end) {
cond_resched();
- BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async));
-
- /*
- * Nothing to do: this vma is already registered into this
- * userfaultfd and with the right tracking mode too.
- */
+ /* VMA not registered with userfaultfd. */
if (!vma->vm_userfaultfd_ctx.ctx)
goto skip;
- WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
+ VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx != ctx);
+ VM_WARN_ON_ONCE(!vma_can_userfault(vma, vma->vm_flags, wp_async));
+ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE));
if (vma->vm_start > start)
start = vma->vm_start;
@@ -1564,7 +1573,7 @@ static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
* len == 0 means wake all and we don't want to wake all here,
* so check it again to be sure.
*/
- VM_BUG_ON(!range.len);
+ VM_WARN_ON_ONCE(!range.len);
wake_userfault(ctx, &range);
ret = 0;
@@ -1621,7 +1630,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
return -EFAULT;
if (ret < 0)
goto out;
- BUG_ON(!ret);
+ VM_WARN_ON_ONCE(!ret);
/* len == 0 would wake all */
range.len = ret;
if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
@@ -1676,7 +1685,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
if (ret < 0)
goto out;
/* len == 0 would wake all */
- BUG_ON(!ret);
+ VM_WARN_ON_ONCE(!ret);
range.len = ret;
if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
range.start = uffdio_zeropage.range.start;
@@ -1788,7 +1797,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
goto out;
/* len == 0 would wake all */
- BUG_ON(!ret);
+ VM_WARN_ON_ONCE(!ret);
range.len = ret;
if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
range.start = uffdio_continue.range.start;
@@ -1845,7 +1854,7 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long
goto out;
/* len == 0 would wake all */
- BUG_ON(!ret);
+ VM_WARN_ON_ONCE(!ret);
range.len = ret;
if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
range.start = uffdio_poison.range.start;
@@ -2106,12 +2115,10 @@ static int new_userfaultfd(int flags)
struct file *file;
int fd;
- BUG_ON(!current->mm);
+ VM_WARN_ON_ONCE(!current->mm);
/* Check the UFFD_* constants for consistency. */
BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
- BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
- BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
return -EINVAL;
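
Replacing BUG_ON()/VM_BUG_ON() with VM_WARN_ON_ONCE() throughout the hunks above changes a guaranteed crash into a one-time diagnostic: the first violation logs a backtrace and execution continues. A compilable model of the warn-once semantics (the macro is a stand-in, not the kernel's implementation; it uses a GNU statement expression, as kernel C does):

#include <stdbool.h>
#include <stdio.h>

/* Log at most once per call site, then keep running -- unlike a BUG(). */
#define WARN_ON_ONCE_MODEL(cond) ({				\
	static bool __warned;					\
	bool __cond = (cond);					\
	if (__cond && !__warned) {				\
		__warned = true;				\
		fprintf(stderr, "warning: %s at %s:%d\n",	\
			#cond, __FILE__, __LINE__);		\
	}							\
	__cond;							\
})

int main(void)
{
	for (int i = 0; i < 3; i++)
		WARN_ON_ONCE_MODEL(i >= 0);	/* one warning, three passes */
	puts("still running");
	return 0;
}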
diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c
index b492794f8e9a..4bebd947314a 100644
--- a/fs/vboxsf/file.c
+++ b/fs/vboxsf/file.c
@@ -165,13 +165,13 @@ static const struct vm_operations_struct vboxsf_file_vm_ops = {
.map_pages = filemap_map_pages,
};
-static int vboxsf_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int vboxsf_file_mmap_prepare(struct vm_area_desc *desc)
{
int err;
- err = generic_file_mmap(file, vma);
+ err = generic_file_mmap_prepare(desc);
if (!err)
- vma->vm_ops = &vboxsf_file_vm_ops;
+ desc->vm_ops = &vboxsf_file_vm_ops;
return err;
}
@@ -213,7 +213,7 @@ const struct file_operations vboxsf_reg_fops = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = vboxsf_file_mmap,
+ .mmap_prepare = vboxsf_file_mmap_prepare,
.open = vboxsf_file_open,
.release = vboxsf_file_release,
.fsync = noop_fsync,
@@ -300,12 +300,13 @@ static int vboxsf_writepages(struct address_space *mapping,
return error;
}
-static int vboxsf_write_end(struct file *file, struct address_space *mapping,
+static int vboxsf_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned int len, unsigned int copied,
struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
- struct vboxsf_handle *sf_handle = file->private_data;
+ struct vboxsf_handle *sf_handle = iocb->ki_filp->private_data;
size_t from = offset_in_folio(folio, pos);
u32 nwritten = len;
u8 *buf;
diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c
index 0bc96ab6580b..241647b060ee 100644
--- a/fs/vboxsf/super.c
+++ b/fs/vboxsf/super.c
@@ -189,7 +189,7 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_blocksize = 1024;
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_op = &vboxsf_super_ops;
- sb->s_d_op = &vboxsf_dentry_ops;
+ set_default_d_op(sb, &vboxsf_dentry_ops);
iroot = iget_locked(sb, 0);
if (!iroot) {
diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig
index 40569d3527a7..76d1c5971b82 100644
--- a/fs/verity/Kconfig
+++ b/fs/verity/Kconfig
@@ -2,11 +2,9 @@
config FS_VERITY
bool "FS Verity (read-only file-based authenticity protection)"
- select CRYPTO
select CRYPTO_HASH_INFO
- # SHA-256 is selected as it's intended to be the default hash algorithm.
- # To avoid bloat, other wanted algorithms must be selected explicitly.
- select CRYPTO_SHA256
+ select CRYPTO_LIB_SHA256
+ select CRYPTO_LIB_SHA512
help
This option enables fs-verity. fs-verity is the dm-verity
mechanism implemented at the file level. On supported
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index c284f46d1b53..503268cf4296 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -7,7 +7,7 @@
#include "fsverity_private.h"
-#include <crypto/hash.h>
+#include <linux/export.h>
#include <linux/mount.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
@@ -24,7 +24,6 @@ static int hash_one_block(struct inode *inode,
struct block_buffer *cur)
{
struct block_buffer *next = cur + 1;
- int err;
/*
* Safety check to prevent a buffer overflow in case of a filesystem bug
@@ -37,10 +36,8 @@ static int hash_one_block(struct inode *inode,
/* Zero-pad the block if it's shorter than the block size. */
memset(&cur->data[cur->filled], 0, params->block_size - cur->filled);
- err = fsverity_hash_block(params, inode, cur->data,
- &next->data[next->filled]);
- if (err)
- return err;
+ fsverity_hash_block(params, inode, cur->data,
+ &next->data[next->filled]);
next->filled += params->digest_size;
cur->filled = 0;
return 0;
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index b3506f56e180..5fe854a5b9ad 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -20,7 +20,6 @@
/* A hash algorithm supported by fs-verity */
struct fsverity_hash_alg {
- struct crypto_shash *tfm; /* hash tfm, allocated on demand */
const char *name; /* crypto API name, e.g. sha256 */
unsigned int digest_size; /* digest size in bytes, e.g. 32 for SHA-256 */
unsigned int block_size; /* block size in bytes, e.g. 64 for SHA-256 */
@@ -31,10 +30,16 @@ struct fsverity_hash_alg {
enum hash_algo algo_id;
};
+union fsverity_hash_ctx {
+ struct sha256_ctx sha256;
+ struct sha512_ctx sha512;
+};
+
/* Merkle tree parameters: hash algorithm, initial hash state, and topology */
struct merkle_tree_params {
const struct fsverity_hash_alg *hash_alg; /* the hash algorithm */
- const u8 *hashstate; /* initial hash state or NULL */
+ /* initial hash state if salted, NULL if unsalted */
+ const union fsverity_hash_ctx *hashstate;
unsigned int digest_size; /* same as hash_alg->digest_size */
unsigned int block_size; /* size of data and tree blocks */
unsigned int hashes_per_block; /* number of hashes per tree block */
@@ -76,16 +81,17 @@ struct fsverity_info {
/* hash_algs.c */
-extern struct fsverity_hash_alg fsverity_hash_algs[];
+extern const struct fsverity_hash_alg fsverity_hash_algs[];
const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
unsigned int num);
-const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
- const u8 *salt, size_t salt_size);
-int fsverity_hash_block(const struct merkle_tree_params *params,
- const struct inode *inode, const void *data, u8 *out);
-int fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
- const void *data, size_t size, u8 *out);
+union fsverity_hash_ctx *
+fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
+ const u8 *salt, size_t salt_size);
+void fsverity_hash_block(const struct merkle_tree_params *params,
+ const struct inode *inode, const void *data, u8 *out);
+void fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
+ const void *data, size_t size, u8 *out);
void __init fsverity_check_hash_algs(void);
/* init.c */
diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c
index 6b08b1d9a7d7..9bb3c6344907 100644
--- a/fs/verity/hash_algs.c
+++ b/fs/verity/hash_algs.c
@@ -7,10 +7,8 @@
#include "fsverity_private.h"
-#include <crypto/hash.h>
-
/* The hash algorithms supported by fs-verity */
-struct fsverity_hash_alg fsverity_hash_algs[] = {
+const struct fsverity_hash_alg fsverity_hash_algs[] = {
[FS_VERITY_HASH_ALG_SHA256] = {
.name = "sha256",
.digest_size = SHA256_DIGEST_SIZE,
@@ -25,106 +23,42 @@ struct fsverity_hash_alg fsverity_hash_algs[] = {
},
};
-static DEFINE_MUTEX(fsverity_hash_alg_init_mutex);
-
/**
- * fsverity_get_hash_alg() - validate and prepare a hash algorithm
+ * fsverity_get_hash_alg() - get a hash algorithm by number
* @inode: optional inode for logging purposes
* @num: the hash algorithm number
*
- * Get the struct fsverity_hash_alg for the given hash algorithm number, and
- * ensure it has a hash transform ready to go. The hash transforms are
- * allocated on-demand so that we don't waste resources unnecessarily, and
- * because the crypto modules may be initialized later than fs/verity/.
+ * Get the struct fsverity_hash_alg for the given hash algorithm number.
*
- * Return: pointer to the hash alg on success, else an ERR_PTR()
+ * Return: pointer to the hash alg if it's known, otherwise NULL.
*/
const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
unsigned int num)
{
- struct fsverity_hash_alg *alg;
- struct crypto_shash *tfm;
- int err;
-
if (num >= ARRAY_SIZE(fsverity_hash_algs) ||
!fsverity_hash_algs[num].name) {
fsverity_warn(inode, "Unknown hash algorithm number: %u", num);
- return ERR_PTR(-EINVAL);
- }
- alg = &fsverity_hash_algs[num];
-
- /* pairs with smp_store_release() below */
- if (likely(smp_load_acquire(&alg->tfm) != NULL))
- return alg;
-
- mutex_lock(&fsverity_hash_alg_init_mutex);
-
- if (alg->tfm != NULL)
- goto out_unlock;
-
- tfm = crypto_alloc_shash(alg->name, 0, 0);
- if (IS_ERR(tfm)) {
- if (PTR_ERR(tfm) == -ENOENT) {
- fsverity_warn(inode,
- "Missing crypto API support for hash algorithm \"%s\"",
- alg->name);
- alg = ERR_PTR(-ENOPKG);
- goto out_unlock;
- }
- fsverity_err(inode,
- "Error allocating hash algorithm \"%s\": %ld",
- alg->name, PTR_ERR(tfm));
- alg = ERR_CAST(tfm);
- goto out_unlock;
+ return NULL;
}
-
- err = -EINVAL;
- if (WARN_ON_ONCE(alg->digest_size != crypto_shash_digestsize(tfm)))
- goto err_free_tfm;
- if (WARN_ON_ONCE(alg->block_size != crypto_shash_blocksize(tfm)))
- goto err_free_tfm;
-
- pr_info("%s using implementation \"%s\"\n",
- alg->name, crypto_shash_driver_name(tfm));
-
- /* pairs with smp_load_acquire() above */
- smp_store_release(&alg->tfm, tfm);
- goto out_unlock;
-
-err_free_tfm:
- crypto_free_shash(tfm);
- alg = ERR_PTR(err);
-out_unlock:
- mutex_unlock(&fsverity_hash_alg_init_mutex);
- return alg;
+ return &fsverity_hash_algs[num];
}
/**
* fsverity_prepare_hash_state() - precompute the initial hash state
* @alg: hash algorithm
* @salt: a salt which is to be prepended to all data to be hashed
- * @salt_size: salt size in bytes, possibly 0
+ * @salt_size: salt size in bytes
*
- * Return: NULL if the salt is empty, otherwise the kmalloc()'ed precomputed
- * initial hash state on success or an ERR_PTR() on failure.
+ * Return: the kmalloc()'ed initial hash state, or NULL if out of memory.
*/
-const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
- const u8 *salt, size_t salt_size)
+union fsverity_hash_ctx *
+fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
+ const u8 *salt, size_t salt_size)
{
- u8 *hashstate = NULL;
- SHASH_DESC_ON_STACK(desc, alg->tfm);
u8 *padded_salt = NULL;
size_t padded_salt_size;
- int err;
-
- desc->tfm = alg->tfm;
-
- if (salt_size == 0)
- return NULL;
-
- hashstate = kmalloc(crypto_shash_statesize(alg->tfm), GFP_KERNEL);
- if (!hashstate)
- return ERR_PTR(-ENOMEM);
+ union fsverity_hash_ctx ctx;
+ void *res = NULL;
/*
* Zero-pad the salt to the next multiple of the input size of the hash
@@ -135,30 +69,26 @@ const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
*/
padded_salt_size = round_up(salt_size, alg->block_size);
padded_salt = kzalloc(padded_salt_size, GFP_KERNEL);
- if (!padded_salt) {
- err = -ENOMEM;
- goto err_free;
- }
+ if (!padded_salt)
+ return NULL;
memcpy(padded_salt, salt, salt_size);
- err = crypto_shash_init(desc);
- if (err)
- goto err_free;
-
- err = crypto_shash_update(desc, padded_salt, padded_salt_size);
- if (err)
- goto err_free;
-
- err = crypto_shash_export(desc, hashstate);
- if (err)
- goto err_free;
-out:
- kfree(padded_salt);
- return hashstate;
-err_free:
- kfree(hashstate);
- hashstate = ERR_PTR(err);
- goto out;
+ switch (alg->algo_id) {
+ case HASH_ALGO_SHA256:
+ sha256_init(&ctx.sha256);
+ sha256_update(&ctx.sha256, padded_salt, padded_salt_size);
+ res = kmemdup(&ctx.sha256, sizeof(ctx.sha256), GFP_KERNEL);
+ break;
+ case HASH_ALGO_SHA512:
+ sha512_init(&ctx.sha512);
+ sha512_update(&ctx.sha512, padded_salt, padded_salt_size);
+ res = kmemdup(&ctx.sha512, sizeof(ctx.sha512), GFP_KERNEL);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+ kfree(padded_salt);
+ return res;
}
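
The rewritten helper absorbs the zero-padded salt into a hash context exactly once and hands back a kmemdup()'ed copy; each block hash then resumes from a duplicate of that context instead of re-hashing the salt. A runnable model of the precompute-and-resume pattern, with a toy rolling checksum standing in for the sha256_* library calls:

#include <stdio.h>
#include <string.h>

struct toy_ctx { unsigned long sum; };	/* stand-in for struct sha256_ctx */

static void toy_init(struct toy_ctx *c) { c->sum = 0; }

static void toy_update(struct toy_ctx *c, const void *p, size_t n)
{
	const unsigned char *b = p;

	while (n--)
		c->sum = c->sum * 31 + *b++;
}

static unsigned long toy_final(struct toy_ctx *c) { return c->sum; }

int main(void)
{
	static const char salt[8] = "pepper";	/* already zero-padded */
	char block[16] = "some data block";
	struct toy_ctx seed, ctx;

	/* Precompute once: absorb the padded salt. */
	toy_init(&seed);
	toy_update(&seed, salt, sizeof(salt));

	/* Per block: resume from a copy of the precomputed state. */
	ctx = seed;
	toy_update(&ctx, block, sizeof(block));
	printf("digest %lx\n", toy_final(&ctx));
	return 0;
}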
/**
@@ -170,31 +100,32 @@ err_free:
*
* Hash a single data or hash block. The hash is salted if a salt is specified
* in the Merkle tree parameters.
- *
- * Return: 0 on success, -errno on failure
*/
-int fsverity_hash_block(const struct merkle_tree_params *params,
- const struct inode *inode, const void *data, u8 *out)
+void fsverity_hash_block(const struct merkle_tree_params *params,
+ const struct inode *inode, const void *data, u8 *out)
{
- SHASH_DESC_ON_STACK(desc, params->hash_alg->tfm);
- int err;
-
- desc->tfm = params->hash_alg->tfm;
-
- if (params->hashstate) {
- err = crypto_shash_import(desc, params->hashstate);
- if (err) {
- fsverity_err(inode,
- "Error %d importing hash state", err);
- return err;
- }
- err = crypto_shash_finup(desc, data, params->block_size, out);
- } else {
- err = crypto_shash_digest(desc, data, params->block_size, out);
+ union fsverity_hash_ctx ctx;
+
+ if (!params->hashstate) {
+ fsverity_hash_buffer(params->hash_alg, data, params->block_size,
+ out);
+ return;
+ }
+
+ switch (params->hash_alg->algo_id) {
+ case HASH_ALGO_SHA256:
+ ctx.sha256 = params->hashstate->sha256;
+ sha256_update(&ctx.sha256, data, params->block_size);
+ sha256_final(&ctx.sha256, out);
+ return;
+ case HASH_ALGO_SHA512:
+ ctx.sha512 = params->hashstate->sha512;
+ sha512_update(&ctx.sha512, data, params->block_size);
+ sha512_final(&ctx.sha512, out);
+ return;
+ default:
+ BUG();
}
- if (err)
- fsverity_err(inode, "Error %d computing block hash", err);
- return err;
}
/**
@@ -203,13 +134,20 @@ int fsverity_hash_block(const struct merkle_tree_params *params,
* @data: the data to hash
* @size: size of data to hash, in bytes
* @out: output digest, size 'alg->digest_size' bytes
- *
- * Return: 0 on success, -errno on failure
*/
-int fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
- const void *data, size_t size, u8 *out)
+void fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
+ const void *data, size_t size, u8 *out)
{
- return crypto_shash_tfm_digest(alg->tfm, data, size, out);
+ switch (alg->algo_id) {
+ case HASH_ALGO_SHA256:
+ sha256(data, size, out);
+ return;
+ case HASH_ALGO_SHA512:
+ sha512(data, size, out);
+ return;
+ default:
+ BUG();
+ }
}
void __init fsverity_check_hash_algs(void)
diff --git a/fs/verity/measure.c b/fs/verity/measure.c
index 175d2f1bc089..388734132f01 100644
--- a/fs/verity/measure.c
+++ b/fs/verity/measure.c
@@ -9,6 +9,7 @@
#include <linux/bpf.h>
#include <linux/btf.h>
+#include <linux/export.h>
#include <linux/uaccess.h>
/**
diff --git a/fs/verity/open.c b/fs/verity/open.c
index fdeb95eca3af..c561e130cd0c 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -7,6 +7,7 @@
#include "fsverity_private.h"
+#include <linux/export.h>
#include <linux/mm.h>
#include <linux/slab.h>
@@ -42,18 +43,18 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
memset(params, 0, sizeof(*params));
hash_alg = fsverity_get_hash_alg(inode, hash_algorithm);
- if (IS_ERR(hash_alg))
- return PTR_ERR(hash_alg);
+ if (!hash_alg)
+ return -EINVAL;
params->hash_alg = hash_alg;
params->digest_size = hash_alg->digest_size;
- params->hashstate = fsverity_prepare_hash_state(hash_alg, salt,
- salt_size);
- if (IS_ERR(params->hashstate)) {
- err = PTR_ERR(params->hashstate);
- params->hashstate = NULL;
- fsverity_err(inode, "Error %d preparing hash state", err);
- goto out_err;
+ if (salt_size) {
+ params->hashstate =
+ fsverity_prepare_hash_state(hash_alg, salt, salt_size);
+ if (!params->hashstate) {
+ err = -ENOMEM;
+ goto out_err;
+ }
}
/*
@@ -158,18 +159,15 @@ out_err:
* Compute the file digest by hashing the fsverity_descriptor excluding the
* builtin signature and with the sig_size field set to 0.
*/
-static int compute_file_digest(const struct fsverity_hash_alg *hash_alg,
- struct fsverity_descriptor *desc,
- u8 *file_digest)
+static void compute_file_digest(const struct fsverity_hash_alg *hash_alg,
+ struct fsverity_descriptor *desc,
+ u8 *file_digest)
{
__le32 sig_size = desc->sig_size;
- int err;
desc->sig_size = 0;
- err = fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), file_digest);
+ fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), file_digest);
desc->sig_size = sig_size;
-
- return err;
}
/*
@@ -201,12 +199,7 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode,
memcpy(vi->root_hash, desc->root_hash, vi->tree_params.digest_size);
- err = compute_file_digest(vi->tree_params.hash_alg, desc,
- vi->file_digest);
- if (err) {
- fsverity_err(inode, "Error %d computing file digest", err);
- goto fail;
- }
+ compute_file_digest(vi->tree_params.hash_alg, desc, vi->file_digest);
err = fsverity_verify_signature(vi, desc->signature,
le32_to_cpu(desc->sig_size));
diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c
index f58432772d9e..cba5d6af4e04 100644
--- a/fs/verity/read_metadata.c
+++ b/fs/verity/read_metadata.c
@@ -8,6 +8,7 @@
#include "fsverity_private.h"
#include <linux/backing-dev.h>
+#include <linux/export.h>
#include <linux/highmem.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index 4fcad0825a12..a1f00c3fd3b2 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -7,8 +7,8 @@
#include "fsverity_private.h"
-#include <crypto/hash.h>
#include <linux/bio.h>
+#include <linux/export.h>
static struct workqueue_struct *fsverity_read_workqueue;
@@ -202,8 +202,7 @@ descend:
unsigned long hblock_idx = hblocks[level - 1].index;
unsigned int hoffset = hblocks[level - 1].hoffset;
- if (fsverity_hash_block(params, inode, haddr, real_hash) != 0)
- goto error;
+ fsverity_hash_block(params, inode, haddr, real_hash);
if (memcmp(want_hash, real_hash, hsize) != 0)
goto corrupted;
/*
@@ -222,8 +221,7 @@ descend:
}
/* Finally, verify the data block. */
- if (fsverity_hash_block(params, inode, data, real_hash) != 0)
- goto error;
+ fsverity_hash_block(params, inode, data, real_hash);
if (memcmp(want_hash, real_hash, hsize) != 0)
goto corrupted;
return true;
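
With fsverity_hash_block() no longer able to fail, each verification step above reduces to "recompute the digest, memcmp against the expected hash, flag corruption on mismatch". A tiny standalone model (the 4-byte XOR digest is purely illustrative):

#include <stdio.h>
#include <string.h>

/* Stand-in 4-byte "digest": XOR-fold of the block. */
static void hash_block(const unsigned char *blk, size_t n, unsigned char out[4])
{
	memset(out, 0, 4);
	for (size_t i = 0; i < n; i++)
		out[i % 4] ^= blk[i];
}

int main(void)
{
	unsigned char block[8] = "Merkle!";
	unsigned char want[4], real[4];

	hash_block(block, sizeof(block), want);	/* trusted digest */
	hash_block(block, sizeof(block), real);	/* recomputed digest */
	puts(memcmp(want, real, sizeof(want)) ? "corrupted" : "verified");
	return 0;
}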
diff --git a/fs/xattr.c b/fs/xattr.c
index 8ec5b0204bfd..8851a5ef34f5 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -215,7 +215,7 @@ EXPORT_SYMBOL(__vfs_setxattr);
*
* returns the result of the internal setxattr or setsecurity operations.
*
- * This function requires the caller to lock the inode's i_mutex before it
+ * This function requires the caller to lock the inode's i_rwsem before it
* is executed. It also assumes that the caller will make the appropriate
* permission checks.
*/
@@ -1479,6 +1479,7 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
buffer += err;
}
remaining_size -= err;
+ err = 0;
read_lock(&xattrs->lock);
for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) {
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 7839efe050bf..000cc7f4a3ce 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -3444,16 +3444,41 @@ xfs_alloc_read_agf(
set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
}
+
#ifdef DEBUG
- else if (!xfs_is_shutdown(mp)) {
- ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
- ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
- ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
- ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
- ASSERT(pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level));
- ASSERT(pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level));
+ /*
+ * It's possible for the AGF to be out of sync if the block device is
+ * silently dropping writes. This can happen in fstests with dmflakey
+ * enabled, which allows the buffer to be cleaned and reclaimed by
+ * memory pressure and then re-read from disk here. We will get a
+ * stale version of the AGF from disk, and nothing good can happen from
+ * here. Hence if we detect this situation, immediately shut down the
+ * filesystem.
+ *
+ * This can also happen if we are already in the middle of a forced
+ * shutdown, so don't bother checking if we are already shut down.
+ */
+ if (!xfs_is_shutdown(pag_mount(pag))) {
+ bool ok = true;
+
+		ok &= pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks);
+ ok &= pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks);
+ ok &= pag->pagf_flcount == be32_to_cpu(agf->agf_flcount);
+ ok &= pag->pagf_longest == be32_to_cpu(agf->agf_longest);
+ ok &= pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level);
+ ok &= pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level);
+
+ if (XFS_IS_CORRUPT(pag_mount(pag), !ok)) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
+ xfs_trans_brelse(tp, agfbp);
+ xfs_force_shutdown(pag_mount(pag),
+ SHUTDOWN_CORRUPT_ONDISK);
+ return -EFSCORRUPTED;
+ }
}
-#endif
+#endif /* DEBUG */
+
if (agfbpp)
*agfbpp = agfbp;
else
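
Folding every field comparison into one `ok` flag, as above, leaves a single corruption path for the whole set of checks instead of one ASSERT per field. A standalone model of the accumulate-then-decide pattern (struct names invented for the sketch):

#include <stdbool.h>
#include <stdio.h>

struct cached { unsigned freeblks, flcount, longest; };
struct ondisk { unsigned freeblks, flcount, longest; };

/* Accumulate all field comparisons, then take one error path. */
static bool in_sync(const struct cached *c, const struct ondisk *d)
{
	bool ok = true;

	ok &= c->freeblks == d->freeblks;
	ok &= c->flcount == d->flcount;
	ok &= c->longest == d->longest;
	return ok;
}

int main(void)
{
	struct cached c = { 10, 2, 7 };
	struct ondisk d = { 10, 2, 8 };	/* stale device copy */

	puts(in_sync(&c, &d) ? "in sync" : "out of sync: shut down");
	return 0;
}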
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index a4ac37ba5d51..fa1f03c1331e 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -186,35 +186,32 @@ xfs_allocbt_init_ptr_from_cur(
ptr->s = agf->agf_cnt_root;
}
-STATIC int64_t
-xfs_bnobt_key_diff(
+STATIC int
+xfs_bnobt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a;
const struct xfs_alloc_rec *kp = &key->alloc;
- return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
+ return cmp_int(be32_to_cpu(kp->ar_startblock),
+ rec->ar_startblock);
}
-STATIC int64_t
-xfs_cntbt_key_diff(
+STATIC int
+xfs_cntbt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a;
const struct xfs_alloc_rec *kp = &key->alloc;
- int64_t diff;
- diff = (int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
- if (diff)
- return diff;
-
- return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
+ return cmp_int(be32_to_cpu(kp->ar_blockcount), rec->ar_blockcount) ?:
+ cmp_int(be32_to_cpu(kp->ar_startblock), rec->ar_startblock);
}
-STATIC int64_t
-xfs_bnobt_diff_two_keys(
+STATIC int
+xfs_bnobt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
@@ -222,29 +219,24 @@ xfs_bnobt_diff_two_keys(
{
ASSERT(!mask || mask->alloc.ar_startblock);
- return (int64_t)be32_to_cpu(k1->alloc.ar_startblock) -
- be32_to_cpu(k2->alloc.ar_startblock);
+ return cmp_int(be32_to_cpu(k1->alloc.ar_startblock),
+ be32_to_cpu(k2->alloc.ar_startblock));
}
-STATIC int64_t
-xfs_cntbt_diff_two_keys(
+STATIC int
+xfs_cntbt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
const union xfs_btree_key *mask)
{
- int64_t diff;
-
ASSERT(!mask || (mask->alloc.ar_blockcount &&
mask->alloc.ar_startblock));
- diff = be32_to_cpu(k1->alloc.ar_blockcount) -
- be32_to_cpu(k2->alloc.ar_blockcount);
- if (diff)
- return diff;
-
- return be32_to_cpu(k1->alloc.ar_startblock) -
- be32_to_cpu(k2->alloc.ar_startblock);
+ return cmp_int(be32_to_cpu(k1->alloc.ar_blockcount),
+ be32_to_cpu(k2->alloc.ar_blockcount)) ?:
+ cmp_int(be32_to_cpu(k1->alloc.ar_startblock),
+ be32_to_cpu(k2->alloc.ar_startblock));
}
static xfs_failaddr_t
@@ -438,9 +430,9 @@ const struct xfs_btree_ops xfs_bnobt_ops = {
.init_high_key_from_rec = xfs_bnobt_init_high_key_from_rec,
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
- .key_diff = xfs_bnobt_key_diff,
+ .cmp_key_with_cur = xfs_bnobt_cmp_key_with_cur,
.buf_ops = &xfs_bnobt_buf_ops,
- .diff_two_keys = xfs_bnobt_diff_two_keys,
+ .cmp_two_keys = xfs_bnobt_cmp_two_keys,
.keys_inorder = xfs_bnobt_keys_inorder,
.recs_inorder = xfs_bnobt_recs_inorder,
.keys_contiguous = xfs_allocbt_keys_contiguous,
@@ -468,9 +460,9 @@ const struct xfs_btree_ops xfs_cntbt_ops = {
.init_high_key_from_rec = xfs_cntbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
- .key_diff = xfs_cntbt_key_diff,
+ .cmp_key_with_cur = xfs_cntbt_cmp_key_with_cur,
.buf_ops = &xfs_cntbt_buf_ops,
- .diff_two_keys = xfs_cntbt_diff_two_keys,
+ .cmp_two_keys = xfs_cntbt_cmp_two_keys,
.keys_inorder = xfs_cntbt_keys_inorder,
.recs_inorder = xfs_cntbt_recs_inorder,
.keys_contiguous = NULL, /* not needed right now */
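
cmp_int() yields a normalized -1/0/1 regardless of operand width, avoiding the truncation and overflow hazards of returning a raw subtraction, and the GNU `?:` operator chains primary and secondary keys exactly as xfs_cntbt_cmp_key_with_cur() does above. A standalone sketch with a local, illustrative definition of cmp_int():

#include <stdint.h>
#include <stdio.h>

/* Illustrative local definition: three-way compare yielding -1, 0 or 1. */
#define cmp_int(l, r) (((l) > (r)) - ((l) < (r)))

struct key { uint32_t blockcount; uint32_t startblock; };

/* Order by blockcount, break ties by startblock, as the cntbt keys do. */
static int cmp_keys(const struct key *a, const struct key *b)
{
	return cmp_int(a->blockcount, b->blockcount) ?:
	       cmp_int(a->startblock, b->startblock);
}

int main(void)
{
	struct key a = { 8, 100 };
	struct key b = { 8, 200 };

	printf("%d\n", cmp_keys(&a, &b));	/* -1: tie broken by startblock */
	return 0;
}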
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 908d7b050e9c..188feac04b60 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -369,38 +369,26 @@ xfs_bmbt_init_rec_from_cur(
xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b);
}
-STATIC int64_t
-xfs_bmbt_key_diff(
+STATIC int
+xfs_bmbt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
- return (int64_t)be64_to_cpu(key->bmbt.br_startoff) -
- cur->bc_rec.b.br_startoff;
+ return cmp_int(be64_to_cpu(key->bmbt.br_startoff),
+ cur->bc_rec.b.br_startoff);
}
-STATIC int64_t
-xfs_bmbt_diff_two_keys(
+STATIC int
+xfs_bmbt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
const union xfs_btree_key *mask)
{
- uint64_t a = be64_to_cpu(k1->bmbt.br_startoff);
- uint64_t b = be64_to_cpu(k2->bmbt.br_startoff);
-
ASSERT(!mask || mask->bmbt.br_startoff);
- /*
- * Note: This routine previously casted a and b to int64 and subtracted
- * them to generate a result. This lead to problems if b was the
- * "maximum" key value (all ones) being signed incorrectly, hence this
- * somewhat less efficient version.
- */
- if (a > b)
- return 1;
- if (b > a)
- return -1;
- return 0;
+ return cmp_int(be64_to_cpu(k1->bmbt.br_startoff),
+ be64_to_cpu(k2->bmbt.br_startoff));
}
static xfs_failaddr_t
@@ -647,8 +635,8 @@ const struct xfs_btree_ops xfs_bmbt_ops = {
.init_key_from_rec = xfs_bmbt_init_key_from_rec,
.init_high_key_from_rec = xfs_bmbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
- .key_diff = xfs_bmbt_key_diff,
- .diff_two_keys = xfs_bmbt_diff_two_keys,
+ .cmp_key_with_cur = xfs_bmbt_cmp_key_with_cur,
+ .cmp_two_keys = xfs_bmbt_cmp_two_keys,
.buf_ops = &xfs_bmbt_buf_ops,
.keys_inorder = xfs_bmbt_keys_inorder,
.recs_inorder = xfs_bmbt_recs_inorder,
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 299ce7fd11b0..a61211d253f1 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -1985,7 +1985,7 @@ xfs_btree_lookup(
int *stat) /* success/failure */
{
struct xfs_btree_block *block; /* current btree block */
- int64_t diff; /* difference for the current key */
+ int cmp_r; /* current key comparison result */
int error; /* error return value */
int keyno; /* current key number */
int level; /* level in the btree */
@@ -2013,13 +2013,13 @@ xfs_btree_lookup(
* on the lookup record, then follow the corresponding block
* pointer down to the next level.
*/
- for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
+ for (level = cur->bc_nlevels - 1, cmp_r = 1; level >= 0; level--) {
/* Get the block we need to do the lookup on. */
error = xfs_btree_lookup_get_block(cur, level, pp, &block);
if (error)
goto error0;
- if (diff == 0) {
+ if (cmp_r == 0) {
/*
* If we already had a key match at a higher level, we
* know we need to use the first entry in this block.
@@ -2065,15 +2065,16 @@ xfs_btree_lookup(
keyno, block, &key);
/*
- * Compute difference to get next direction:
+ * Compute comparison result to get next
+ * direction:
* - less than, move right
* - greater than, move left
* - equal, we're done
*/
- diff = cur->bc_ops->key_diff(cur, kp);
- if (diff < 0)
+ cmp_r = cur->bc_ops->cmp_key_with_cur(cur, kp);
+ if (cmp_r < 0)
low = keyno + 1;
- else if (diff > 0)
+ else if (cmp_r > 0)
high = keyno - 1;
else
break;
@@ -2089,7 +2090,7 @@ xfs_btree_lookup(
* If we moved left, need the previous key number,
* unless there isn't one.
*/
- if (diff > 0 && --keyno < 1)
+ if (cmp_r > 0 && --keyno < 1)
keyno = 1;
pp = xfs_btree_ptr_addr(cur, keyno, block);
@@ -2102,7 +2103,7 @@ xfs_btree_lookup(
}
/* Done with the search. See if we need to adjust the results. */
- if (dir != XFS_LOOKUP_LE && diff < 0) {
+ if (dir != XFS_LOOKUP_LE && cmp_r < 0) {
keyno++;
/*
* If ge search and we went off the end of the block, but it's
@@ -2125,14 +2126,14 @@ xfs_btree_lookup(
*stat = 1;
return 0;
}
- } else if (dir == XFS_LOOKUP_LE && diff > 0)
+ } else if (dir == XFS_LOOKUP_LE && cmp_r > 0)
keyno--;
cur->bc_levels[0].ptr = keyno;
/* Return if we succeeded or not. */
if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
*stat = 0;
- else if (dir != XFS_LOOKUP_EQ || diff == 0)
+ else if (dir != XFS_LOOKUP_EQ || cmp_r == 0)
*stat = 1;
else
*stat = 0;
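
With cmp_key_with_cur() returning a plain int, the lookup loop above is a textbook binary search: a negative comparison moves the low bound up, a positive one moves the high bound down, and zero terminates. A minimal standalone version of the same control flow:

#include <stdio.h>

/* cmp_r < 0: key below target, search right; > 0: search left; 0: found. */
static int lookup(const int *keys, int n, int target)
{
	int low = 0, high = n - 1;

	while (low <= high) {
		int mid = low + (high - low) / 2;
		int cmp_r = (keys[mid] > target) - (keys[mid] < target);

		if (cmp_r < 0)
			low = mid + 1;
		else if (cmp_r > 0)
			high = mid - 1;
		else
			return mid;
	}
	return -1;
}

int main(void)
{
	int keys[] = { 2, 5, 9, 14, 23 };

	printf("%d\n", lookup(keys, 5, 14));	/* prints 3 */
	return 0;
}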
@@ -5058,7 +5059,7 @@ xfs_btree_simple_query_range(
int error;
ASSERT(cur->bc_ops->init_high_key_from_rec);
- ASSERT(cur->bc_ops->diff_two_keys);
+ ASSERT(cur->bc_ops->cmp_two_keys);
/*
* Find the leftmost record. The btree cursor must be set
@@ -5352,15 +5353,15 @@ xfs_btree_count_blocks(
}
/* Compare two btree pointers. */
-int64_t
-xfs_btree_diff_two_ptrs(
+int
+xfs_btree_cmp_two_ptrs(
struct xfs_btree_cur *cur,
const union xfs_btree_ptr *a,
const union xfs_btree_ptr *b)
{
if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
- return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
- return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
+ return cmp_int(be64_to_cpu(a->l), be64_to_cpu(b->l));
+ return cmp_int(be32_to_cpu(a->s), be32_to_cpu(b->s));
}
struct xfs_btree_has_records {
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 355b304696e6..60e78572e725 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -171,20 +171,23 @@ struct xfs_btree_ops {
void (*init_high_key_from_rec)(union xfs_btree_key *key,
const union xfs_btree_rec *rec);
- /* difference between key value and cursor value */
- int64_t (*key_diff)(struct xfs_btree_cur *cur,
- const union xfs_btree_key *key);
+ /*
+ * Compare key value and cursor value -- positive if key > cur,
+ * negative if key < cur, and zero if equal.
+ */
+ int (*cmp_key_with_cur)(struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key);
/*
- * Difference between key2 and key1 -- positive if key1 > key2,
- * negative if key1 < key2, and zero if equal. If the @mask parameter
- * is non NULL, each key field to be used in the comparison must
- * contain a nonzero value.
+ * Compare key1 and key2 -- positive if key1 > key2, negative if
+ * key1 < key2, and zero if equal. If the @mask parameter is non NULL,
+ * each key field to be used in the comparison must contain a nonzero
+ * value.
*/
- int64_t (*diff_two_keys)(struct xfs_btree_cur *cur,
- const union xfs_btree_key *key1,
- const union xfs_btree_key *key2,
- const union xfs_btree_key *mask);
+ int (*cmp_two_keys)(struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2,
+ const union xfs_btree_key *mask);
const struct xfs_buf_ops *buf_ops;
@@ -516,9 +519,9 @@ struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur,
int level, struct xfs_buf **bpp);
bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur,
const union xfs_btree_ptr *ptr);
-int64_t xfs_btree_diff_two_ptrs(struct xfs_btree_cur *cur,
- const union xfs_btree_ptr *a,
- const union xfs_btree_ptr *b);
+int xfs_btree_cmp_two_ptrs(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *a,
+ const union xfs_btree_ptr *b);
void xfs_btree_get_sibling(struct xfs_btree_cur *cur,
struct xfs_btree_block *block,
union xfs_btree_ptr *ptr, int lr);
@@ -546,7 +549,7 @@ xfs_btree_keycmp_lt(
const union xfs_btree_key *key1,
const union xfs_btree_key *key2)
{
- return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) < 0;
+ return cur->bc_ops->cmp_two_keys(cur, key1, key2, NULL) < 0;
}
static inline bool
@@ -555,7 +558,7 @@ xfs_btree_keycmp_gt(
const union xfs_btree_key *key1,
const union xfs_btree_key *key2)
{
- return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) > 0;
+ return cur->bc_ops->cmp_two_keys(cur, key1, key2, NULL) > 0;
}
static inline bool
@@ -564,7 +567,7 @@ xfs_btree_keycmp_eq(
const union xfs_btree_key *key1,
const union xfs_btree_key *key2)
{
- return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) == 0;
+ return cur->bc_ops->cmp_two_keys(cur, key1, key2, NULL) == 0;
}
static inline bool
@@ -602,7 +605,7 @@ xfs_btree_masked_keycmp_lt(
const union xfs_btree_key *key2,
const union xfs_btree_key *mask)
{
- return cur->bc_ops->diff_two_keys(cur, key1, key2, mask) < 0;
+ return cur->bc_ops->cmp_two_keys(cur, key1, key2, mask) < 0;
}
static inline bool
@@ -612,7 +615,7 @@ xfs_btree_masked_keycmp_gt(
const union xfs_btree_key *key2,
const union xfs_btree_key *mask)
{
- return cur->bc_ops->diff_two_keys(cur, key1, key2, mask) > 0;
+ return cur->bc_ops->cmp_two_keys(cur, key1, key2, mask) > 0;
}
static inline bool
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 9566a7623365..779dac59b1f3 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -112,7 +112,7 @@ typedef struct xfs_sb {
uint16_t sb_sectsize; /* volume sector size, bytes */
uint16_t sb_inodesize; /* inode size, bytes */
uint16_t sb_inopblock; /* inodes per block */
- char sb_fname[XFSLABEL_MAX]; /* file system name */
+ char sb_fname[XFSLABEL_MAX] __nonstring; /* file system name */
uint8_t sb_blocklog; /* log2 of sb_blocksize */
uint8_t sb_sectlog; /* log2 of sb_sectsize */
uint8_t sb_inodelog; /* log2 of sb_inodesize */
diff --git a/fs/xfs/libxfs/xfs_group.c b/fs/xfs/libxfs/xfs_group.c
index e9d76bcdc820..792f76d2e2a0 100644
--- a/fs/xfs/libxfs/xfs_group.c
+++ b/fs/xfs/libxfs/xfs_group.c
@@ -163,7 +163,8 @@ xfs_group_free(
xfs_defer_drain_free(&xg->xg_intents_drain);
#ifdef __KERNEL__
- kfree(xg->xg_busy_extents);
+ if (xfs_group_has_extent_busy(xg->xg_mount, xg->xg_type))
+ kfree(xg->xg_busy_extents);
#endif
if (uninit)
@@ -171,7 +172,8 @@ xfs_group_free(
/* drop the mount's active reference */
xfs_group_rele(xg);
- XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_active_ref) != 0);
+ XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_active_ref) > 0);
+ XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_active_ref) < 0);
kfree_rcu_mightsleep(xg);
}
@@ -189,9 +191,11 @@ xfs_group_insert(
xg->xg_type = type;
#ifdef __KERNEL__
- xg->xg_busy_extents = xfs_extent_busy_alloc();
- if (!xg->xg_busy_extents)
- return -ENOMEM;
+ if (xfs_group_has_extent_busy(mp, type)) {
+ xg->xg_busy_extents = xfs_extent_busy_alloc();
+ if (!xg->xg_busy_extents)
+ return -ENOMEM;
+ }
spin_lock_init(&xg->xg_state_lock);
xfs_hooks_init(&xg->xg_rmap_update_hooks);
#endif
@@ -210,7 +214,8 @@ xfs_group_insert(
out_drain:
xfs_defer_drain_free(&xg->xg_intents_drain);
#ifdef __KERNEL__
- kfree(xg->xg_busy_extents);
+ if (xfs_group_has_extent_busy(xg->xg_mount, xg->xg_type))
+ kfree(xg->xg_busy_extents);
#endif
return error;
}
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 0c47b5c6ca7d..750111634d9f 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2801,12 +2801,35 @@ xfs_ialloc_read_agi(
set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
}
+#ifdef DEBUG
/*
- * It's possible for these to be out of sync if
- * we are in the middle of a forced shutdown.
+	 * It's possible for the AGI to be out of sync if the block device is
+	 * silently dropping writes. This can happen in fstests with dmflakey
+	 * enabled, which allows the buffer to be cleaned and reclaimed by
+	 * memory pressure and then re-read from disk here. We will get a
+	 * stale version of the AGI from disk, and nothing good can happen from
+ * here. Hence if we detect this situation, immediately shut down the
+ * filesystem.
+ *
+ * This can also happen if we are already in the middle of a forced
+ * shutdown, so don't bother checking if we are already shut down.
*/
- ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
- xfs_is_shutdown(pag_mount(pag)));
+ if (!xfs_is_shutdown(pag_mount(pag))) {
+ bool ok = true;
+
+ ok &= pag->pagi_freecount == be32_to_cpu(agi->agi_freecount);
+ ok &= pag->pagi_count == be32_to_cpu(agi->agi_count);
+
+ if (XFS_IS_CORRUPT(pag_mount(pag), !ok)) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
+ xfs_trans_brelse(tp, agibp);
+ xfs_force_shutdown(pag_mount(pag),
+ SHUTDOWN_CORRUPT_ONDISK);
+ return -EFSCORRUPTED;
+ }
+ }
+#endif /* DEBUG */
+
if (agibpp)
*agibpp = agibp;
else
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 6f270d8f4270..100afdd66cdd 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -265,17 +265,17 @@ xfs_finobt_init_ptr_from_cur(
ptr->s = agi->agi_free_root;
}
-STATIC int64_t
-xfs_inobt_key_diff(
+STATIC int
+xfs_inobt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
- return (int64_t)be32_to_cpu(key->inobt.ir_startino) -
- cur->bc_rec.i.ir_startino;
+ return cmp_int(be32_to_cpu(key->inobt.ir_startino),
+ cur->bc_rec.i.ir_startino);
}
-STATIC int64_t
-xfs_inobt_diff_two_keys(
+STATIC int
+xfs_inobt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
@@ -283,8 +283,8 @@ xfs_inobt_diff_two_keys(
{
ASSERT(!mask || mask->inobt.ir_startino);
- return (int64_t)be32_to_cpu(k1->inobt.ir_startino) -
- be32_to_cpu(k2->inobt.ir_startino);
+ return cmp_int(be32_to_cpu(k1->inobt.ir_startino),
+ be32_to_cpu(k2->inobt.ir_startino));
}
static xfs_failaddr_t
@@ -430,9 +430,9 @@ const struct xfs_btree_ops xfs_inobt_ops = {
.init_high_key_from_rec = xfs_inobt_init_high_key_from_rec,
.init_rec_from_cur = xfs_inobt_init_rec_from_cur,
.init_ptr_from_cur = xfs_inobt_init_ptr_from_cur,
- .key_diff = xfs_inobt_key_diff,
+ .cmp_key_with_cur = xfs_inobt_cmp_key_with_cur,
.buf_ops = &xfs_inobt_buf_ops,
- .diff_two_keys = xfs_inobt_diff_two_keys,
+ .cmp_two_keys = xfs_inobt_cmp_two_keys,
.keys_inorder = xfs_inobt_keys_inorder,
.recs_inorder = xfs_inobt_recs_inorder,
.keys_contiguous = xfs_inobt_keys_contiguous,
@@ -460,9 +460,9 @@ const struct xfs_btree_ops xfs_finobt_ops = {
.init_high_key_from_rec = xfs_inobt_init_high_key_from_rec,
.init_rec_from_cur = xfs_inobt_init_rec_from_cur,
.init_ptr_from_cur = xfs_finobt_init_ptr_from_cur,
- .key_diff = xfs_inobt_key_diff,
+ .cmp_key_with_cur = xfs_inobt_cmp_key_with_cur,
.buf_ops = &xfs_finobt_buf_ops,
- .diff_two_keys = xfs_inobt_diff_two_keys,
+ .cmp_two_keys = xfs_inobt_cmp_two_keys,
.keys_inorder = xfs_inobt_keys_inorder,
.recs_inorder = xfs_inobt_recs_inorder,
.keys_contiguous = xfs_inobt_keys_contiguous,
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 66c7916fb5cd..95de23095030 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -104,7 +104,7 @@ struct xlog_recover_item {
struct list_head ri_list;
int ri_cnt; /* count of regions found */
int ri_total; /* total regions */
- struct xfs_log_iovec *ri_buf; /* ptr to regions buffer */
+ struct kvec *ri_buf; /* ptr to regions buffer */
const struct xlog_recover_item_ops *ri_ops;
};
@@ -117,7 +117,7 @@ struct xlog_recover {
struct list_head r_itemq; /* q for items */
};
-#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr)
+#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].iov_base)
#define XLOG_RECOVER_CRCPASS 0
#define XLOG_RECOVER_PASS1 1
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index cebe83f7842a..897784037483 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -2099,9 +2099,7 @@ xfs_refcount_recover_cow_leftovers(
* recording the CoW debris we cancel the (empty) transaction
* and everything goes away cleanly.
*/
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
+ tp = xfs_trans_alloc_empty(mp);
if (isrt) {
xfs_rtgroup_lock(to_rtg(xg), XFS_RTGLOCK_REFCOUNT);
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 54505fee1852..06da3ca14727 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -174,8 +174,8 @@ xfs_refcountbt_init_ptr_from_cur(
ptr->s = agf->agf_refcount_root;
}
-STATIC int64_t
-xfs_refcountbt_key_diff(
+STATIC int
+xfs_refcountbt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
@@ -185,11 +185,11 @@ xfs_refcountbt_key_diff(
start = xfs_refcount_encode_startblock(irec->rc_startblock,
irec->rc_domain);
- return (int64_t)be32_to_cpu(kp->rc_startblock) - start;
+ return cmp_int(be32_to_cpu(kp->rc_startblock), start);
}
-STATIC int64_t
-xfs_refcountbt_diff_two_keys(
+STATIC int
+xfs_refcountbt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
@@ -197,8 +197,8 @@ xfs_refcountbt_diff_two_keys(
{
ASSERT(!mask || mask->refc.rc_startblock);
- return (int64_t)be32_to_cpu(k1->refc.rc_startblock) -
- be32_to_cpu(k2->refc.rc_startblock);
+ return cmp_int(be32_to_cpu(k1->refc.rc_startblock),
+ be32_to_cpu(k2->refc.rc_startblock));
}
STATIC xfs_failaddr_t
@@ -339,9 +339,9 @@ const struct xfs_btree_ops xfs_refcountbt_ops = {
.init_high_key_from_rec = xfs_refcountbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_refcountbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_refcountbt_init_ptr_from_cur,
- .key_diff = xfs_refcountbt_key_diff,
+ .cmp_key_with_cur = xfs_refcountbt_cmp_key_with_cur,
.buf_ops = &xfs_refcountbt_buf_ops,
- .diff_two_keys = xfs_refcountbt_diff_two_keys,
+ .cmp_two_keys = xfs_refcountbt_cmp_two_keys,
.keys_inorder = xfs_refcountbt_keys_inorder,
.recs_inorder = xfs_refcountbt_recs_inorder,
.keys_contiguous = xfs_refcountbt_keys_contiguous,
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 2cab694ac58a..bf16aee50d73 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -243,38 +243,22 @@ static inline uint64_t offset_keymask(uint64_t offset)
return offset & ~XFS_RMAP_OFF_UNWRITTEN;
}
-STATIC int64_t
-xfs_rmapbt_key_diff(
+STATIC int
+xfs_rmapbt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
struct xfs_rmap_irec *rec = &cur->bc_rec.r;
const struct xfs_rmap_key *kp = &key->rmap;
- __u64 x, y;
- int64_t d;
- d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
- if (d)
- return d;
-
- x = be64_to_cpu(kp->rm_owner);
- y = rec->rm_owner;
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
-
- x = offset_keymask(be64_to_cpu(kp->rm_offset));
- y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
- return 0;
+ return cmp_int(be32_to_cpu(kp->rm_startblock), rec->rm_startblock) ?:
+ cmp_int(be64_to_cpu(kp->rm_owner), rec->rm_owner) ?:
+ cmp_int(offset_keymask(be64_to_cpu(kp->rm_offset)),
+ offset_keymask(xfs_rmap_irec_offset_pack(rec)));
}
-STATIC int64_t
-xfs_rmapbt_diff_two_keys(
+STATIC int
+xfs_rmapbt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
@@ -282,36 +266,31 @@ xfs_rmapbt_diff_two_keys(
{
const struct xfs_rmap_key *kp1 = &k1->rmap;
const struct xfs_rmap_key *kp2 = &k2->rmap;
- int64_t d;
- __u64 x, y;
+ int d;
/* Doesn't make sense to mask off the physical space part */
ASSERT(!mask || mask->rmap.rm_startblock);
- d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
- be32_to_cpu(kp2->rm_startblock);
+ d = cmp_int(be32_to_cpu(kp1->rm_startblock),
+ be32_to_cpu(kp2->rm_startblock));
if (d)
return d;
if (!mask || mask->rmap.rm_owner) {
- x = be64_to_cpu(kp1->rm_owner);
- y = be64_to_cpu(kp2->rm_owner);
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
+ d = cmp_int(be64_to_cpu(kp1->rm_owner),
+ be64_to_cpu(kp2->rm_owner));
+ if (d)
+ return d;
}
if (!mask || mask->rmap.rm_offset) {
/* Doesn't make sense to allow offset but not owner */
ASSERT(!mask || mask->rmap.rm_owner);
- x = offset_keymask(be64_to_cpu(kp1->rm_offset));
- y = offset_keymask(be64_to_cpu(kp2->rm_offset));
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
+ d = cmp_int(offset_keymask(be64_to_cpu(kp1->rm_offset)),
+ offset_keymask(be64_to_cpu(kp2->rm_offset)));
+ if (d)
+ return d;
}
return 0;
@@ -515,9 +494,9 @@ const struct xfs_btree_ops xfs_rmapbt_ops = {
.init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_rmapbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_rmapbt_init_ptr_from_cur,
- .key_diff = xfs_rmapbt_key_diff,
+ .cmp_key_with_cur = xfs_rmapbt_cmp_key_with_cur,
.buf_ops = &xfs_rmapbt_buf_ops,
- .diff_two_keys = xfs_rmapbt_diff_two_keys,
+ .cmp_two_keys = xfs_rmapbt_cmp_two_keys,
.keys_inorder = xfs_rmapbt_keys_inorder,
.recs_inorder = xfs_rmapbt_recs_inorder,
.keys_contiguous = xfs_rmapbt_keys_contiguous,
@@ -632,9 +611,9 @@ const struct xfs_btree_ops xfs_rmapbt_mem_ops = {
.init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_rmapbt_init_rec_from_cur,
.init_ptr_from_cur = xfbtree_init_ptr_from_cur,
- .key_diff = xfs_rmapbt_key_diff,
+ .cmp_key_with_cur = xfs_rmapbt_cmp_key_with_cur,
.buf_ops = &xfs_rmapbt_mem_buf_ops,
- .diff_two_keys = xfs_rmapbt_diff_two_keys,
+ .cmp_two_keys = xfs_rmapbt_cmp_two_keys,
.keys_inorder = xfs_rmapbt_keys_inorder,
.recs_inorder = xfs_rmapbt_recs_inorder,
.keys_contiguous = xfs_rmapbt_keys_contiguous,
diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
index 3db5e7a4a945..ac11e94b42ae 100644
--- a/fs/xfs/libxfs/xfs_rtrefcount_btree.c
+++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
@@ -156,8 +156,8 @@ xfs_rtrefcountbt_init_ptr_from_cur(
ptr->l = 0;
}
-STATIC int64_t
-xfs_rtrefcountbt_key_diff(
+STATIC int
+xfs_rtrefcountbt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
@@ -167,11 +167,11 @@ xfs_rtrefcountbt_key_diff(
start = xfs_refcount_encode_startblock(irec->rc_startblock,
irec->rc_domain);
- return (int64_t)be32_to_cpu(kp->rc_startblock) - start;
+ return cmp_int(be32_to_cpu(kp->rc_startblock), start);
}
-STATIC int64_t
-xfs_rtrefcountbt_diff_two_keys(
+STATIC int
+xfs_rtrefcountbt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
@@ -179,8 +179,8 @@ xfs_rtrefcountbt_diff_two_keys(
{
ASSERT(!mask || mask->refc.rc_startblock);
- return (int64_t)be32_to_cpu(k1->refc.rc_startblock) -
- be32_to_cpu(k2->refc.rc_startblock);
+ return cmp_int(be32_to_cpu(k1->refc.rc_startblock),
+ be32_to_cpu(k2->refc.rc_startblock));
}
static xfs_failaddr_t
@@ -387,9 +387,9 @@ const struct xfs_btree_ops xfs_rtrefcountbt_ops = {
.init_high_key_from_rec = xfs_rtrefcountbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_rtrefcountbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_rtrefcountbt_init_ptr_from_cur,
- .key_diff = xfs_rtrefcountbt_key_diff,
+ .cmp_key_with_cur = xfs_rtrefcountbt_cmp_key_with_cur,
.buf_ops = &xfs_rtrefcountbt_buf_ops,
- .diff_two_keys = xfs_rtrefcountbt_diff_two_keys,
+ .cmp_two_keys = xfs_rtrefcountbt_cmp_two_keys,
.keys_inorder = xfs_rtrefcountbt_keys_inorder,
.recs_inorder = xfs_rtrefcountbt_recs_inorder,
.keys_contiguous = xfs_rtrefcountbt_keys_contiguous,
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c
index 9bdc2cbfc113..55f903165769 100644
--- a/fs/xfs/libxfs/xfs_rtrmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c
@@ -185,38 +185,22 @@ static inline uint64_t offset_keymask(uint64_t offset)
return offset & ~XFS_RMAP_OFF_UNWRITTEN;
}
-STATIC int64_t
-xfs_rtrmapbt_key_diff(
+STATIC int
+xfs_rtrmapbt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
struct xfs_rmap_irec *rec = &cur->bc_rec.r;
const struct xfs_rmap_key *kp = &key->rmap;
- __u64 x, y;
- int64_t d;
- d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
- if (d)
- return d;
-
- x = be64_to_cpu(kp->rm_owner);
- y = rec->rm_owner;
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
-
- x = offset_keymask(be64_to_cpu(kp->rm_offset));
- y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
- return 0;
+ return cmp_int(be32_to_cpu(kp->rm_startblock), rec->rm_startblock) ?:
+ cmp_int(be64_to_cpu(kp->rm_owner), rec->rm_owner) ?:
+ cmp_int(offset_keymask(be64_to_cpu(kp->rm_offset)),
+ offset_keymask(xfs_rmap_irec_offset_pack(rec)));
}
-STATIC int64_t
-xfs_rtrmapbt_diff_two_keys(
+STATIC int
+xfs_rtrmapbt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
@@ -224,36 +208,31 @@ xfs_rtrmapbt_diff_two_keys(
{
const struct xfs_rmap_key *kp1 = &k1->rmap;
const struct xfs_rmap_key *kp2 = &k2->rmap;
- int64_t d;
- __u64 x, y;
+ int d;
/* Doesn't make sense to mask off the physical space part */
ASSERT(!mask || mask->rmap.rm_startblock);
- d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
- be32_to_cpu(kp2->rm_startblock);
+ d = cmp_int(be32_to_cpu(kp1->rm_startblock),
+ be32_to_cpu(kp2->rm_startblock));
if (d)
return d;
if (!mask || mask->rmap.rm_owner) {
- x = be64_to_cpu(kp1->rm_owner);
- y = be64_to_cpu(kp2->rm_owner);
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
+ d = cmp_int(be64_to_cpu(kp1->rm_owner),
+ be64_to_cpu(kp2->rm_owner));
+ if (d)
+ return d;
}
if (!mask || mask->rmap.rm_offset) {
/* Doesn't make sense to allow offset but not owner */
ASSERT(!mask || mask->rmap.rm_owner);
- x = offset_keymask(be64_to_cpu(kp1->rm_offset));
- y = offset_keymask(be64_to_cpu(kp2->rm_offset));
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
+ d = cmp_int(offset_keymask(be64_to_cpu(kp1->rm_offset)),
+ offset_keymask(be64_to_cpu(kp2->rm_offset)));
+ if (d)
+ return d;
}
return 0;
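The masked variant above cannot use a single ?: chain because each field's
compare is gated on the caller-supplied mask, so the running result d is
checked after every enabled field instead. The shape, condensed with
hypothetical field names:

	int d = cmp_int(a->start, b->start);
	if (d)
		return d;
	if (!mask || mask->owner) {	/* compare only if unmasked */
		d = cmp_int(a->owner, b->owner);
		if (d)
			return d;
	}
	return 0;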
@@ -511,9 +490,9 @@ const struct xfs_btree_ops xfs_rtrmapbt_ops = {
.init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_rtrmapbt_init_ptr_from_cur,
- .key_diff = xfs_rtrmapbt_key_diff,
+ .cmp_key_with_cur = xfs_rtrmapbt_cmp_key_with_cur,
.buf_ops = &xfs_rtrmapbt_buf_ops,
- .diff_two_keys = xfs_rtrmapbt_diff_two_keys,
+ .cmp_two_keys = xfs_rtrmapbt_cmp_two_keys,
.keys_inorder = xfs_rtrmapbt_keys_inorder,
.recs_inorder = xfs_rtrmapbt_recs_inorder,
.keys_contiguous = xfs_rtrmapbt_keys_contiguous,
@@ -620,9 +599,9 @@ const struct xfs_btree_ops xfs_rtrmapbt_mem_ops = {
.init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur,
.init_ptr_from_cur = xfbtree_init_ptr_from_cur,
- .key_diff = xfs_rtrmapbt_key_diff,
+ .cmp_key_with_cur = xfs_rtrmapbt_cmp_key_with_cur,
.buf_ops = &xfs_rtrmapbt_mem_buf_ops,
- .diff_two_keys = xfs_rtrmapbt_diff_two_keys,
+ .cmp_two_keys = xfs_rtrmapbt_cmp_two_keys,
.keys_inorder = xfs_rtrmapbt_keys_inorder,
.recs_inorder = xfs_rtrmapbt_recs_inorder,
.keys_contiguous = xfs_rtrmapbt_keys_contiguous,
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index fe678a0438bc..cd6f0ff382a7 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -306,7 +306,7 @@ xchk_btree_block_check_sibling(
if (pbp)
xchk_buffer_recheck(bs->sc, pbp);
- if (xfs_btree_diff_two_ptrs(cur, pp, sibling))
+ if (xfs_btree_cmp_two_ptrs(cur, pp, sibling))
xchk_btree_set_corrupt(bs->sc, cur, level);
out:
xfs_btree_del_cursor(ncur, XFS_BTREE_ERROR);
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 28ad341df8ee..2ef7742be7d3 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -866,11 +866,11 @@ xchk_trans_cancel(
sc->tp = NULL;
}
-int
+void
xchk_trans_alloc_empty(
struct xfs_scrub *sc)
{
- return xfs_trans_alloc_empty(sc->mp, &sc->tp);
+ sc->tp = xfs_trans_alloc_empty(sc->mp);
}
/*
@@ -892,7 +892,8 @@ xchk_trans_alloc(
return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
resblks, 0, 0, &sc->tp);
- return xchk_trans_alloc_empty(sc);
+ xchk_trans_alloc_empty(sc);
+ return 0;
}
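xfs_trans_alloc_empty() now hands back the transaction directly instead of an
error code, so xchk_trans_alloc_empty() becomes void and every caller below
loses its error branch. The apparent rationale is that an empty transaction
takes no log reservation, leaving no failure path worth plumbing through. The
new calling convention, sketched:

	struct xfs_trans	*tp;

	tp = xfs_trans_alloc_empty(mp);	/* cannot fail; no reservation */
	/* ... use tp as a context for igets, btree cursors, etc. ... */
	xfs_trans_cancel(tp);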
/* Set us up with a transaction and an empty context. */
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 19877d99f255..ddbc065c798c 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -7,7 +7,7 @@
#define __XFS_SCRUB_COMMON_H__
int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks);
-int xchk_trans_alloc_empty(struct xfs_scrub *sc);
+void xchk_trans_alloc_empty(struct xfs_scrub *sc);
void xchk_trans_cancel(struct xfs_scrub *sc);
bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno,
diff --git a/fs/xfs/scrub/dir_repair.c b/fs/xfs/scrub/dir_repair.c
index 249313882108..8d3b550990b5 100644
--- a/fs/xfs/scrub/dir_repair.c
+++ b/fs/xfs/scrub/dir_repair.c
@@ -1289,9 +1289,7 @@ xrep_dir_scan_dirtree(
if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))
xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED |
XFS_ILOCK_EXCL));
- error = xchk_trans_alloc_empty(sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(sc);
while ((error = xchk_iscan_iter(&rd->pscan.iscan, &ip)) == 1) {
bool flush;
@@ -1317,9 +1315,7 @@ xrep_dir_scan_dirtree(
if (error)
break;
- error = xchk_trans_alloc_empty(sc);
- if (error)
- break;
+ xchk_trans_alloc_empty(sc);
}
if (xchk_should_terminate(sc, &error))
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index 9b598c5790ad..cebd0d526926 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -237,7 +237,8 @@ xchk_setup_fscounters(
return error;
}
- return xchk_trans_alloc_empty(sc);
+ xchk_trans_alloc_empty(sc);
+ return 0;
}
/*
diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c
index e21c16fbd15d..14939d7de349 100644
--- a/fs/xfs/scrub/metapath.c
+++ b/fs/xfs/scrub/metapath.c
@@ -318,9 +318,7 @@ xchk_metapath(
return 0;
}
- error = xchk_trans_alloc_empty(sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(sc);
error = xchk_metapath_ilock_both(mpath);
if (error)
diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c
index 4a47d0aabf73..26721fab5cab 100644
--- a/fs/xfs/scrub/nlinks.c
+++ b/fs/xfs/scrub/nlinks.c
@@ -555,9 +555,7 @@ xchk_nlinks_collect(
* do not take sb_internal.
*/
xchk_trans_cancel(sc);
- error = xchk_trans_alloc_empty(sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(sc);
while ((error = xchk_iscan_iter(&xnc->collect_iscan, &ip)) == 1) {
if (S_ISDIR(VFS_I(ip)->i_mode))
@@ -880,9 +878,7 @@ xchk_nlinks_compare(
* inactivation workqueue.
*/
xchk_trans_cancel(sc);
- error = xchk_trans_alloc_empty(sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(sc);
/*
* Use the inobt to walk all allocated inodes to compare the link
diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c
index 4ebdee095428..6ef2ee9c3814 100644
--- a/fs/xfs/scrub/nlinks_repair.c
+++ b/fs/xfs/scrub/nlinks_repair.c
@@ -340,9 +340,7 @@ xrep_nlinks(
* We can only push the inactivation workqueues with an empty
* transaction.
*/
- error = xchk_trans_alloc_empty(sc);
- if (error)
- break;
+ xchk_trans_alloc_empty(sc);
}
xchk_iscan_iter_finish(&xnc->compare_iscan);
xchk_iscan_teardown(&xnc->compare_iscan);
diff --git a/fs/xfs/scrub/parent_repair.c b/fs/xfs/scrub/parent_repair.c
index 31bfe10be22a..2949feda6271 100644
--- a/fs/xfs/scrub/parent_repair.c
+++ b/fs/xfs/scrub/parent_repair.c
@@ -569,9 +569,7 @@ xrep_parent_scan_dirtree(
if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))
xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED |
XFS_ILOCK_EXCL));
- error = xchk_trans_alloc_empty(sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(sc);
while ((error = xchk_iscan_iter(&rp->pscan.iscan, &ip)) == 1) {
bool flush;
@@ -597,9 +595,7 @@ xrep_parent_scan_dirtree(
if (error)
break;
- error = xchk_trans_alloc_empty(sc);
- if (error)
- break;
+ xchk_trans_alloc_empty(sc);
}
if (xchk_should_terminate(sc, &error))
@@ -1099,9 +1095,7 @@ xrep_parent_flush_xattrs(
xrep_tempfile_iounlock(rp->sc);
/* Recreate the empty transaction and relock the inode. */
- error = xchk_trans_alloc_empty(rp->sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(rp->sc);
xchk_ilock(rp->sc, XFS_ILOCK_EXCL);
return 0;
}
diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c
index dc4033b91e44..e4105aaafe84 100644
--- a/fs/xfs/scrub/quotacheck.c
+++ b/fs/xfs/scrub/quotacheck.c
@@ -505,9 +505,7 @@ xqcheck_collect_counts(
* transactions do not take sb_internal.
*/
xchk_trans_cancel(sc);
- error = xchk_trans_alloc_empty(sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(sc);
while ((error = xchk_iscan_iter(&xqc->iscan, &ip)) == 1) {
error = xqcheck_collect_inode(xqc, ip);
diff --git a/fs/xfs/scrub/rcbag_btree.c b/fs/xfs/scrub/rcbag_btree.c
index 709356dc6256..9a4ef823c5a7 100644
--- a/fs/xfs/scrub/rcbag_btree.c
+++ b/fs/xfs/scrub/rcbag_btree.c
@@ -47,29 +47,20 @@ rcbagbt_init_rec_from_cur(
bag_rec->rbg_refcount = bag_irec->rbg_refcount;
}
-STATIC int64_t
-rcbagbt_key_diff(
+STATIC int
+rcbagbt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
struct rcbag_rec *rec = (struct rcbag_rec *)&cur->bc_rec;
const struct rcbag_key *kp = (const struct rcbag_key *)key;
- if (kp->rbg_startblock > rec->rbg_startblock)
- return 1;
- if (kp->rbg_startblock < rec->rbg_startblock)
- return -1;
-
- if (kp->rbg_blockcount > rec->rbg_blockcount)
- return 1;
- if (kp->rbg_blockcount < rec->rbg_blockcount)
- return -1;
-
- return 0;
+ return cmp_int(kp->rbg_startblock, rec->rbg_startblock) ?:
+ cmp_int(kp->rbg_blockcount, rec->rbg_blockcount);
}
-STATIC int64_t
-rcbagbt_diff_two_keys(
+STATIC int
+rcbagbt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
@@ -80,17 +71,8 @@ rcbagbt_diff_two_keys(
ASSERT(mask == NULL);
- if (kp1->rbg_startblock > kp2->rbg_startblock)
- return 1;
- if (kp1->rbg_startblock < kp2->rbg_startblock)
- return -1;
-
- if (kp1->rbg_blockcount > kp2->rbg_blockcount)
- return 1;
- if (kp1->rbg_blockcount < kp2->rbg_blockcount)
- return -1;
-
- return 0;
+ return cmp_int(kp1->rbg_startblock, kp2->rbg_startblock) ?:
+ cmp_int(kp1->rbg_blockcount, kp2->rbg_blockcount);
}
STATIC int
@@ -201,9 +183,9 @@ static const struct xfs_btree_ops rcbagbt_mem_ops = {
.init_key_from_rec = rcbagbt_init_key_from_rec,
.init_rec_from_cur = rcbagbt_init_rec_from_cur,
.init_ptr_from_cur = xfbtree_init_ptr_from_cur,
- .key_diff = rcbagbt_key_diff,
+ .cmp_key_with_cur = rcbagbt_cmp_key_with_cur,
.buf_ops = &rcbagbt_mem_buf_ops,
- .diff_two_keys = rcbagbt_diff_two_keys,
+ .cmp_two_keys = rcbagbt_cmp_two_keys,
.keys_inorder = rcbagbt_keys_inorder,
.recs_inorder = rcbagbt_recs_inorder,
};
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index f8f9ed30f56b..d00c18954a26 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -1269,42 +1269,6 @@ xrep_setup_xfbtree(
}
/*
- * Create a dummy transaction for use in a live update hook function. This
- * function MUST NOT be called from regular repair code because the current
- * process' transaction is saved via the cookie.
- */
-int
-xrep_trans_alloc_hook_dummy(
- struct xfs_mount *mp,
- void **cookiep,
- struct xfs_trans **tpp)
-{
- int error;
-
- *cookiep = current->journal_info;
- current->journal_info = NULL;
-
- error = xfs_trans_alloc_empty(mp, tpp);
- if (!error)
- return 0;
-
- current->journal_info = *cookiep;
- *cookiep = NULL;
- return error;
-}
-
-/* Cancel a dummy transaction used by a live update hook function. */
-void
-xrep_trans_cancel_hook_dummy(
- void **cookiep,
- struct xfs_trans *tp)
-{
- xfs_trans_cancel(tp);
- current->journal_info = *cookiep;
- *cookiep = NULL;
-}
-
-/*
* See if this buffer can pass the given ->verify_struct() function.
*
* If the buffer already has ops attached and they're not the ones that were
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index af0a3a9e5ed9..9c04295742c8 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -180,10 +180,6 @@ int xrep_quotacheck(struct xfs_scrub *sc);
int xrep_reinit_pagf(struct xfs_scrub *sc);
int xrep_reinit_pagi(struct xfs_scrub *sc);
-int xrep_trans_alloc_hook_dummy(struct xfs_mount *mp, void **cookiep,
- struct xfs_trans **tpp);
-void xrep_trans_cancel_hook_dummy(void **cookiep, struct xfs_trans *tp);
-
bool xrep_buf_verify_struct(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
void xrep_inode_set_nblocks(struct xfs_scrub *sc, int64_t new_blocks);
int xrep_reset_metafile_resv(struct xfs_scrub *sc);
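xrep_trans_alloc_hook_dummy() existed only to park current->journal_info
around a fallible empty-transaction allocation made from inside another
task's transaction context. With xfs_trans_alloc_empty() unable to fail (and,
it appears, no longer needing the caller's journal_info cleared), the
save/restore cookie loses its purpose and the live-update hooks below
allocate and cancel the transaction directly:

	tp = xfs_trans_alloc_empty(mp);
	mutex_lock(&rr->lock);
	/* replay the update into the in-memory btree under tp */
	xfs_trans_cancel(tp);
	mutex_unlock(&rr->lock);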
diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c
index f5f73078ffe2..17d4a38d735c 100644
--- a/fs/xfs/scrub/rmap_repair.c
+++ b/fs/xfs/scrub/rmap_repair.c
@@ -951,9 +951,7 @@ end_agscan:
sa->agf_bp = NULL;
sa->agi_bp = NULL;
xchk_trans_cancel(sc);
- error = xchk_trans_alloc_empty(sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(sc);
/* Iterate all AGs for inodes rmaps. */
while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
@@ -1612,7 +1610,6 @@ xrep_rmapbt_live_update(
struct xfs_mount *mp;
struct xfs_btree_cur *mcur;
struct xfs_trans *tp;
- void *txcookie;
int error;
rr = container_of(nb, struct xrep_rmap, rhook.rmap_hook.nb);
@@ -1623,9 +1620,7 @@ xrep_rmapbt_live_update(
trace_xrep_rmap_live_update(pag_group(rr->sc->sa.pag), action, p);
- error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp);
- if (error)
- goto out_abort;
+ tp = xfs_trans_alloc_empty(mp);
mutex_lock(&rr->lock);
mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, tp, &rr->rmap_btree);
@@ -1639,14 +1634,13 @@ xrep_rmapbt_live_update(
if (error)
goto out_cancel;
- xrep_trans_cancel_hook_dummy(&txcookie, tp);
+ xfs_trans_cancel(tp);
mutex_unlock(&rr->lock);
return NOTIFY_DONE;
out_cancel:
xfbtree_trans_cancel(&rr->rmap_btree, tp);
- xrep_trans_cancel_hook_dummy(&txcookie, tp);
-out_abort:
+ xfs_trans_cancel(tp);
mutex_unlock(&rr->lock);
xchk_iscan_abort(&rr->iscan);
out_unlock:
diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c
index fc2592c53af5..7561941a337a 100644
--- a/fs/xfs/scrub/rtrmap_repair.c
+++ b/fs/xfs/scrub/rtrmap_repair.c
@@ -580,9 +580,7 @@ xrep_rtrmap_find_rmaps(
*/
xchk_trans_cancel(sc);
xchk_rtgroup_unlock(&sc->sr);
- error = xchk_trans_alloc_empty(sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(sc);
while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
error = xrep_rtrmap_scan_inode(rr, ip);
@@ -846,7 +844,6 @@ xrep_rtrmapbt_live_update(
struct xfs_mount *mp;
struct xfs_btree_cur *mcur;
struct xfs_trans *tp;
- void *txcookie;
int error;
rr = container_of(nb, struct xrep_rtrmap, rhook.rmap_hook.nb);
@@ -857,9 +854,7 @@ xrep_rtrmapbt_live_update(
trace_xrep_rmap_live_update(rtg_group(rr->sc->sr.rtg), action, p);
- error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp);
- if (error)
- goto out_abort;
+ tp = xfs_trans_alloc_empty(mp);
mutex_lock(&rr->lock);
mcur = xfs_rtrmapbt_mem_cursor(rr->sc->sr.rtg, tp, &rr->rtrmap_btree);
@@ -873,14 +868,13 @@ xrep_rtrmapbt_live_update(
if (error)
goto out_cancel;
- xrep_trans_cancel_hook_dummy(&txcookie, tp);
+ xfs_trans_cancel(tp);
mutex_unlock(&rr->lock);
return NOTIFY_DONE;
out_cancel:
xfbtree_trans_cancel(&rr->rtrmap_btree, tp);
- xrep_trans_cancel_hook_dummy(&txcookie, tp);
-out_abort:
+ xfs_trans_cancel(tp);
xchk_iscan_abort(&rr->iscan);
mutex_unlock(&rr->lock);
out_unlock:
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 76e24032e99a..3c3b0d25006f 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -876,10 +876,7 @@ xchk_scrubv_open_by_handle(
struct xfs_inode *ip;
int error;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return NULL;
-
+ tp = xfs_trans_alloc_empty(mp);
error = xfs_iget(mp, tp, head->svh_ino, XCHK_IGET_FLAGS, 0, &ip);
xfs_trans_cancel(tp);
if (error)
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index d7c4ced47c15..1e6e9c10cea2 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -2996,7 +2996,7 @@ DEFINE_EVENT(xrep_pptr_salvage_class, name, \
DEFINE_XREP_PPTR_SALVAGE_EVENT(xrep_xattr_salvage_pptr);
DEFINE_XREP_PPTR_SALVAGE_EVENT(xrep_xattr_insert_pptr);
-TRACE_EVENT(xrep_xattr_class,
+DECLARE_EVENT_CLASS(xrep_xattr_class,
TP_PROTO(struct xfs_inode *ip, struct xfs_inode *arg_ip),
TP_ARGS(ip, arg_ip),
TP_STRUCT__entry(
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 26a04a783489..1ee4f835ac3c 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -234,6 +234,47 @@ xfs_end_bio(
}
/*
+ * We cannot cancel the ioend directly on error. We may have already set other
+ * pages under writeback and hence we have to run I/O completion to mark the
+ * error state of the pages under writeback appropriately.
+ *
+ * If the folio has delalloc blocks on it, the caller is asking us to punch them
+ * out. If we don't, we can leave a stale delalloc mapping covered by a clean
+ * page that needs to be dirtied again before the delalloc mapping can be
+ * converted. This stale delalloc mapping can trip up a later direct I/O read
+ * operation on the same region.
+ *
+ * We prevent this by truncating away the delalloc regions on the folio. Because
+ * they are delalloc, we can do this without needing a transaction. Indeed - if
+ * we get ENOSPC errors, we have to be able to do this truncation without a
+ * transaction as there is no space left for block reservation (typically why
+ * we see an ENOSPC in writeback).
+ */
+static void
+xfs_discard_folio(
+ struct folio *folio,
+ loff_t pos)
+{
+ struct xfs_inode *ip = XFS_I(folio->mapping->host);
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (xfs_is_shutdown(mp))
+ return;
+
+ xfs_alert_ratelimited(mp,
+ "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
+ folio, ip->i_ino, pos);
+
+ /*
+ * The end of the punch range is always the offset of the first
+ * byte of the next folio. Hence the end offset is only dependent on the
+ * folio itself and not the start offset that is passed in.
+ */
+ xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
+ folio_pos(folio) + folio_size(folio), NULL);
+}
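Because the punch always ends at the first byte of the next folio, only the
start of the range depends on pos. For example, with a 4096-byte folio at
file offset 8192 and pos 9000, the delalloc punch covers bytes [9000, 12288)
regardless of where in the folio the failure occurred.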
+
+/*
* Fast revalidation of the cached writeback mapping. Return true if the current
* mapping is valid, false otherwise.
*/
@@ -278,13 +319,12 @@ xfs_imap_valid(
static int
xfs_map_blocks(
struct iomap_writepage_ctx *wpc,
- struct inode *inode,
loff_t offset,
unsigned int len)
{
- struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_inode *ip = XFS_I(wpc->inode);
struct xfs_mount *mp = ip->i_mount;
- ssize_t count = i_blocksize(inode);
+ ssize_t count = i_blocksize(wpc->inode);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
xfs_fileoff_t cow_fsb;
@@ -436,81 +476,78 @@ allocate_blocks:
return 0;
}
-static int
-xfs_submit_ioend(
+static ssize_t
+xfs_writeback_range(
struct iomap_writepage_ctx *wpc,
- int status)
+ struct folio *folio,
+ u64 offset,
+ unsigned int len,
+ u64 end_pos)
{
- struct iomap_ioend *ioend = wpc->ioend;
- unsigned int nofs_flag;
-
- /*
- * We can allocate memory here while doing writeback on behalf of
- * memory reclaim. To avoid memory allocation deadlocks set the
- * task-wide nofs context for the following operations.
- */
- nofs_flag = memalloc_nofs_save();
+ ssize_t ret;
+
+ ret = xfs_map_blocks(wpc, offset, len);
+ if (!ret)
+ ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
+ if (ret < 0)
+ xfs_discard_folio(folio, offset);
+ return ret;
+}
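->writeback_range() replaces the old split between ->map_blocks() and iomap's
internal per-folio loop: the filesystem maps the range and adds it to the
ioend in a single callback. Judging from this caller, the assumed contract is
a ssize_t return of the bytes handled (from iomap_add_to_ioend()) or a
negative errno, with the filesystem owning error-side folio cleanup. A
skeleton under those assumptions, with hypothetical helpers:

	static ssize_t my_writeback_range(struct iomap_writepage_ctx *wpc,
			struct folio *folio, u64 offset, unsigned int len,
			u64 end_pos)
	{
		ssize_t ret;

		ret = my_map_blocks(wpc, offset, len);	/* hypothetical */
		if (!ret)
			ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
		if (ret < 0)
			my_discard_folio(folio, offset);	/* hypothetical */
		return ret;
	}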
- /* Convert CoW extents to regular */
- if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
- status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
- ioend->io_offset, ioend->io_size);
- }
+static bool
+xfs_ioend_needs_wq_completion(
+ struct iomap_ioend *ioend)
+{
+ /* Changing inode size requires a transaction. */
+ if (xfs_ioend_is_append(ioend))
+ return true;
- memalloc_nofs_restore(nofs_flag);
+ /* Extent manipulation requires a transaction. */
+ if (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))
+ return true;
- /* send ioends that might require a transaction to the completion wq */
- if (xfs_ioend_is_append(ioend) ||
- (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED)))
- ioend->io_bio.bi_end_io = xfs_end_bio;
+ /* Page cache invalidation cannot be done in irq context. */
+ if (ioend->io_flags & IOMAP_IOEND_DONTCACHE)
+ return true;
- if (status)
- return status;
- submit_bio(&ioend->io_bio);
- return 0;
+ return false;
}
-/*
- * If the folio has delalloc blocks on it, the caller is asking us to punch them
- * out. If we don't, we can leave a stale delalloc mapping covered by a clean
- * page that needs to be dirtied again before the delalloc mapping can be
- * converted. This stale delalloc mapping can trip up a later direct I/O read
- * operation on the same region.
- *
- * We prevent this by truncating away the delalloc regions on the folio. Because
- * they are delalloc, we can do this without needing a transaction. Indeed - if
- * we get ENOSPC errors, we have to be able to do this truncation without a
- * transaction as there is no space left for block reservation (typically why
- * we see a ENOSPC in writeback).
- */
-static void
-xfs_discard_folio(
- struct folio *folio,
- loff_t pos)
+static int
+xfs_writeback_submit(
+ struct iomap_writepage_ctx *wpc,
+ int error)
{
- struct xfs_inode *ip = XFS_I(folio->mapping->host);
- struct xfs_mount *mp = ip->i_mount;
+ struct iomap_ioend *ioend = wpc->wb_ctx;
- if (xfs_is_shutdown(mp))
- return;
+ /*
+ * Convert CoW extents to regular.
+ *
+ * We can allocate memory here while doing writeback on behalf of memory
+ * reclaim. To avoid memory allocation deadlocks, set the task-wide
+ * nofs context.
+ */
+ if (!error && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
+ unsigned int nofs_flag;
- xfs_alert_ratelimited(mp,
- "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
- folio, ip->i_ino, pos);
+ nofs_flag = memalloc_nofs_save();
+ error = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
+ ioend->io_offset, ioend->io_size);
+ memalloc_nofs_restore(nofs_flag);
+ }
/*
- * The end of the punch range is always the offset of the first
- * byte of the next folio. Hence the end offset is only dependent on the
- * folio itself and not the start offset that is passed in.
+ * Send ioends that might require a transaction to the completion wq.
*/
- xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
- folio_pos(folio) + folio_size(folio), NULL);
+ if (xfs_ioend_needs_wq_completion(ioend))
+ ioend->io_bio.bi_end_io = xfs_end_bio;
+
+ return iomap_ioend_writeback_submit(wpc, error);
}
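Note how the memalloc_nofs_save()/restore pair now brackets only the CoW
conversion rather than the whole submission path, since that is the one spot
here that can allocate while acting on behalf of reclaim. The general pattern:

	unsigned int nofs_flag;

	nofs_flag = memalloc_nofs_save();	/* force GFP_NOFS for this task */
	/* allocation-prone work done in writeback/reclaim context */
	memalloc_nofs_restore(nofs_flag);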
static const struct iomap_writeback_ops xfs_writeback_ops = {
- .map_blocks = xfs_map_blocks,
- .submit_ioend = xfs_submit_ioend,
- .discard_folio = xfs_discard_folio,
+ .writeback_range = xfs_writeback_range,
+ .writeback_submit = xfs_writeback_submit,
};
struct xfs_zoned_writepage_ctx {
@@ -527,11 +564,10 @@ XFS_ZWPC(struct iomap_writepage_ctx *ctx)
static int
xfs_zoned_map_blocks(
struct iomap_writepage_ctx *wpc,
- struct inode *inode,
loff_t offset,
unsigned int len)
{
- struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_inode *ip = XFS_I(wpc->inode);
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + len);
@@ -590,22 +626,44 @@ xfs_zoned_map_blocks(
return 0;
}
-static int
-xfs_zoned_submit_ioend(
+static ssize_t
+xfs_zoned_writeback_range(
struct iomap_writepage_ctx *wpc,
- int status)
+ struct folio *folio,
+ u64 offset,
+ unsigned int len,
+ u64 end_pos)
{
- wpc->ioend->io_bio.bi_end_io = xfs_end_bio;
- if (status)
- return status;
- xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->open_zone);
+ ssize_t ret;
+
+ ret = xfs_zoned_map_blocks(wpc, offset, len);
+ if (!ret)
+ ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
+ if (ret < 0)
+ xfs_discard_folio(folio, offset);
+ return ret;
+}
+
+static int
+xfs_zoned_writeback_submit(
+ struct iomap_writepage_ctx *wpc,
+ int error)
+{
+ struct iomap_ioend *ioend = wpc->wb_ctx;
+
+ ioend->io_bio.bi_end_io = xfs_end_bio;
+ if (error) {
+ ioend->io_bio.bi_status = errno_to_blk_status(error);
+ bio_endio(&ioend->io_bio);
+ return error;
+ }
+ xfs_zone_alloc_and_submit(ioend, &XFS_ZWPC(wpc)->open_zone);
return 0;
}
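Unlike the non-zoned path, which defers to iomap_ioend_writeback_submit() for
both success and error, the zoned submit completes a failed ioend by hand:
errno_to_blk_status() translates the errno into the bio's status and
bio_endio() runs the normal completion path (and hence xfs_end_bio) on it
before the error is returned.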
static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
- .map_blocks = xfs_zoned_map_blocks,
- .submit_ioend = xfs_zoned_submit_ioend,
- .discard_folio = xfs_discard_folio,
+ .writeback_range = xfs_zoned_writeback_range,
+ .writeback_submit = xfs_zoned_writeback_submit,
};
STATIC int
@@ -618,19 +676,29 @@ xfs_vm_writepages(
xfs_iflags_clear(ip, XFS_ITRUNCATED);
if (xfs_is_zoned_inode(ip)) {
- struct xfs_zoned_writepage_ctx xc = { };
+ struct xfs_zoned_writepage_ctx xc = {
+ .ctx = {
+ .inode = mapping->host,
+ .wbc = wbc,
+ .ops = &xfs_zoned_writeback_ops
+ },
+ };
int error;
- error = iomap_writepages(mapping, wbc, &xc.ctx,
- &xfs_zoned_writeback_ops);
+ error = iomap_writepages(&xc.ctx);
if (xc.open_zone)
xfs_open_zone_put(xc.open_zone);
return error;
} else {
- struct xfs_writepage_ctx wpc = { };
-
- return iomap_writepages(mapping, wbc, &wpc.ctx,
- &xfs_writeback_ops);
+ struct xfs_writepage_ctx wpc = {
+ .ctx = {
+ .inode = mapping->host,
+ .wbc = wbc,
+ .ops = &xfs_writeback_ops
+ },
+ };
+
+ return iomap_writepages(&wpc.ctx);
}
}
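iomap_writepages() drops to a single argument: the inode, writeback control,
and ops now travel inside the context that the caller initializes up front.
A minimal caller under the new signature, with a hypothetical ops table:

	struct iomap_writepage_ctx wpc = {
		.inode	= mapping->host,
		.wbc	= wbc,
		.ops	= &my_writeback_ops,	/* hypothetical */
	};

	return iomap_writepages(&wpc);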
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index f683b7a9323f..5eef3bc30bda 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -91,41 +91,37 @@ xfs_attri_log_nameval_alloc(
name_len + new_name_len + value_len +
new_value_len);
- nv->name.i_addr = nv + 1;
- nv->name.i_len = name_len;
- nv->name.i_type = XLOG_REG_TYPE_ATTR_NAME;
- memcpy(nv->name.i_addr, name, name_len);
+ nv->name.iov_base = nv + 1;
+ nv->name.iov_len = name_len;
+ memcpy(nv->name.iov_base, name, name_len);
if (new_name_len) {
- nv->new_name.i_addr = nv->name.i_addr + name_len;
- nv->new_name.i_len = new_name_len;
- memcpy(nv->new_name.i_addr, new_name, new_name_len);
+ nv->new_name.iov_base = nv->name.iov_base + name_len;
+ nv->new_name.iov_len = new_name_len;
+ memcpy(nv->new_name.iov_base, new_name, new_name_len);
} else {
- nv->new_name.i_addr = NULL;
- nv->new_name.i_len = 0;
+ nv->new_name.iov_base = NULL;
+ nv->new_name.iov_len = 0;
}
- nv->new_name.i_type = XLOG_REG_TYPE_ATTR_NEWNAME;
if (value_len) {
- nv->value.i_addr = nv->name.i_addr + name_len + new_name_len;
- nv->value.i_len = value_len;
- memcpy(nv->value.i_addr, value, value_len);
+ nv->value.iov_base = nv->name.iov_base + name_len + new_name_len;
+ nv->value.iov_len = value_len;
+ memcpy(nv->value.iov_base, value, value_len);
} else {
- nv->value.i_addr = NULL;
- nv->value.i_len = 0;
+ nv->value.iov_base = NULL;
+ nv->value.iov_len = 0;
}
- nv->value.i_type = XLOG_REG_TYPE_ATTR_VALUE;
if (new_value_len) {
- nv->new_value.i_addr = nv->name.i_addr + name_len +
+ nv->new_value.iov_base = nv->name.iov_base + name_len +
new_name_len + value_len;
- nv->new_value.i_len = new_value_len;
- memcpy(nv->new_value.i_addr, new_value, new_value_len);
+ nv->new_value.iov_len = new_value_len;
+ memcpy(nv->new_value.iov_base, new_value, new_value_len);
} else {
- nv->new_value.i_addr = NULL;
- nv->new_value.i_len = 0;
+ nv->new_value.iov_base = NULL;
+ nv->new_value.iov_len = 0;
}
- nv->new_value.i_type = XLOG_REG_TYPE_ATTR_NEWVALUE;
refcount_set(&nv->refcount, 1);
return nv;
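struct kvec is the generic kernel base/length pair from linux/uio.h, so this
swap renames i_addr/i_len accordingly and drops the per-vector i_type tag;
the XLOG_REG_TYPE_* value is instead passed explicitly when the vector is
formatted into the log. For reference:

	struct kvec {
		void	*iov_base;
		size_t	iov_len;
	};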
@@ -170,21 +166,21 @@ xfs_attri_item_size(
*nvecs += 2;
*nbytes += sizeof(struct xfs_attri_log_format) +
- xlog_calc_iovec_len(nv->name.i_len);
+ xlog_calc_iovec_len(nv->name.iov_len);
- if (nv->new_name.i_len) {
+ if (nv->new_name.iov_len) {
*nvecs += 1;
- *nbytes += xlog_calc_iovec_len(nv->new_name.i_len);
+ *nbytes += xlog_calc_iovec_len(nv->new_name.iov_len);
}
- if (nv->value.i_len) {
+ if (nv->value.iov_len) {
*nvecs += 1;
- *nbytes += xlog_calc_iovec_len(nv->value.i_len);
+ *nbytes += xlog_calc_iovec_len(nv->value.iov_len);
}
- if (nv->new_value.i_len) {
+ if (nv->new_value.iov_len) {
*nvecs += 1;
- *nbytes += xlog_calc_iovec_len(nv->new_value.i_len);
+ *nbytes += xlog_calc_iovec_len(nv->new_value.iov_len);
}
}
@@ -212,31 +208,36 @@ xfs_attri_item_format(
* the log recovery.
*/
- ASSERT(nv->name.i_len > 0);
+ ASSERT(nv->name.iov_len > 0);
attrip->attri_format.alfi_size++;
- if (nv->new_name.i_len > 0)
+ if (nv->new_name.iov_len > 0)
attrip->attri_format.alfi_size++;
- if (nv->value.i_len > 0)
+ if (nv->value.iov_len > 0)
attrip->attri_format.alfi_size++;
- if (nv->new_value.i_len > 0)
+ if (nv->new_value.iov_len > 0)
attrip->attri_format.alfi_size++;
xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRI_FORMAT,
&attrip->attri_format,
sizeof(struct xfs_attri_log_format));
- xlog_copy_from_iovec(lv, &vecp, &nv->name);
- if (nv->new_name.i_len > 0)
- xlog_copy_from_iovec(lv, &vecp, &nv->new_name);
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NAME, nv->name.iov_base,
+ nv->name.iov_len);
- if (nv->value.i_len > 0)
- xlog_copy_from_iovec(lv, &vecp, &nv->value);
+ if (nv->new_name.iov_len > 0)
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NEWNAME,
+ nv->new_name.iov_base, nv->new_name.iov_len);
- if (nv->new_value.i_len > 0)
- xlog_copy_from_iovec(lv, &vecp, &nv->new_value);
+ if (nv->value.iov_len > 0)
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_VALUE,
+ nv->value.iov_base, nv->value.iov_len);
+
+ if (nv->new_value.iov_len > 0)
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NEWVALUE,
+ nv->new_value.iov_base, nv->new_value.iov_len);
}
/*
@@ -383,22 +384,22 @@ xfs_attr_log_item(
attrp->alfi_ino = args->dp->i_ino;
ASSERT(!(attr->xattri_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK));
attrp->alfi_op_flags = attr->xattri_op_flags;
- attrp->alfi_value_len = nv->value.i_len;
+ attrp->alfi_value_len = nv->value.iov_len;
switch (xfs_attr_log_item_op(attrp)) {
case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE:
- ASSERT(nv->value.i_len == nv->new_value.i_len);
+ ASSERT(nv->value.iov_len == nv->new_value.iov_len);
attrp->alfi_igen = VFS_I(args->dp)->i_generation;
- attrp->alfi_old_name_len = nv->name.i_len;
- attrp->alfi_new_name_len = nv->new_name.i_len;
+ attrp->alfi_old_name_len = nv->name.iov_len;
+ attrp->alfi_new_name_len = nv->new_name.iov_len;
break;
case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE:
case XFS_ATTRI_OP_FLAGS_PPTR_SET:
attrp->alfi_igen = VFS_I(args->dp)->i_generation;
fallthrough;
default:
- attrp->alfi_name_len = nv->name.i_len;
+ attrp->alfi_name_len = nv->name.iov_len;
break;
}
@@ -616,10 +617,7 @@ xfs_attri_iread_extents(
struct xfs_trans *tp;
int error;
- error = xfs_trans_alloc_empty(ip->i_mount, &tp);
- if (error)
- return error;
-
+ tp = xfs_trans_alloc_empty(ip->i_mount);
xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_iread_extents(tp, ip, XFS_ATTR_FORK);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -690,14 +688,14 @@ xfs_attri_recover_work(
args->dp = ip;
args->geo = mp->m_attr_geo;
args->whichfork = XFS_ATTR_FORK;
- args->name = nv->name.i_addr;
- args->namelen = nv->name.i_len;
- args->new_name = nv->new_name.i_addr;
- args->new_namelen = nv->new_name.i_len;
- args->value = nv->value.i_addr;
- args->valuelen = nv->value.i_len;
- args->new_value = nv->new_value.i_addr;
- args->new_valuelen = nv->new_value.i_len;
+ args->name = nv->name.iov_base;
+ args->namelen = nv->name.iov_len;
+ args->new_name = nv->new_name.iov_base;
+ args->new_namelen = nv->new_name.iov_len;
+ args->value = nv->value.iov_base;
+ args->valuelen = nv->value.iov_len;
+ args->new_value = nv->new_value.iov_base;
+ args->new_valuelen = nv->new_value.iov_len;
args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK;
args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT |
XFS_DA_OP_LOGGED;
@@ -754,8 +752,8 @@ xfs_attr_recover_work(
*/
attrp = &attrip->attri_format;
if (!xfs_attri_validate(mp, attrp) ||
- !xfs_attr_namecheck(attrp->alfi_attr_filter, nv->name.i_addr,
- nv->name.i_len))
+ !xfs_attr_namecheck(attrp->alfi_attr_filter, nv->name.iov_base,
+ nv->name.iov_len))
return -EFSCORRUPTED;
attr = xfs_attri_recover_work(mp, dfp, attrp, &ip, nv);
@@ -953,50 +951,50 @@ static inline void *
xfs_attri_validate_name_iovec(
struct xfs_mount *mp,
struct xfs_attri_log_format *attri_formatp,
- const struct xfs_log_iovec *iovec,
+ const struct kvec *iovec,
unsigned int name_len)
{
- if (iovec->i_len != xlog_calc_iovec_len(name_len)) {
+ if (iovec->iov_len != xlog_calc_iovec_len(name_len)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
attri_formatp, sizeof(*attri_formatp));
return NULL;
}
- if (!xfs_attr_namecheck(attri_formatp->alfi_attr_filter, iovec->i_addr,
+ if (!xfs_attr_namecheck(attri_formatp->alfi_attr_filter, iovec->iov_base,
name_len)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
attri_formatp, sizeof(*attri_formatp));
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- iovec->i_addr, iovec->i_len);
+ iovec->iov_base, iovec->iov_len);
return NULL;
}
- return iovec->i_addr;
+ return iovec->iov_base;
}
static inline void *
xfs_attri_validate_value_iovec(
struct xfs_mount *mp,
struct xfs_attri_log_format *attri_formatp,
- const struct xfs_log_iovec *iovec,
+ const struct kvec *iovec,
unsigned int value_len)
{
- if (iovec->i_len != xlog_calc_iovec_len(value_len)) {
+ if (iovec->iov_len != xlog_calc_iovec_len(value_len)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
attri_formatp, sizeof(*attri_formatp));
return NULL;
}
if ((attri_formatp->alfi_attr_filter & XFS_ATTR_PARENT) &&
- !xfs_parent_valuecheck(mp, iovec->i_addr, value_len)) {
+ !xfs_parent_valuecheck(mp, iovec->iov_base, value_len)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
attri_formatp, sizeof(*attri_formatp));
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- iovec->i_addr, iovec->i_len);
+ iovec->iov_base, iovec->iov_len);
return NULL;
}
- return iovec->i_addr;
+ return iovec->iov_base;
}
STATIC int
@@ -1023,13 +1021,13 @@ xlog_recover_attri_commit_pass2(
/* Validate xfs_attri_log_format before the large memory allocation */
len = sizeof(struct xfs_attri_log_format);
- if (item->ri_buf[i].i_len != len) {
+ if (item->ri_buf[i].iov_len != len) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
- attri_formatp = item->ri_buf[i].i_addr;
+ attri_formatp = item->ri_buf[i].iov_base;
if (!xfs_attri_validate(mp, attri_formatp)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
attri_formatp, len);
@@ -1218,10 +1216,10 @@ xlog_recover_attrd_commit_pass2(
{
struct xfs_attrd_log_format *attrd_formatp;
- attrd_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_attrd_log_format)) {
+ attrd_formatp = item->ri_buf[0].iov_base;
+ if (item->ri_buf[0].iov_len != sizeof(struct xfs_attrd_log_format)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h
index e74128cbb722..d108a11b55ae 100644
--- a/fs/xfs/xfs_attr_item.h
+++ b/fs/xfs/xfs_attr_item.h
@@ -12,10 +12,10 @@ struct xfs_mount;
struct kmem_zone;
struct xfs_attri_log_nameval {
- struct xfs_log_iovec name;
- struct xfs_log_iovec new_name; /* PPTR_REPLACE only */
- struct xfs_log_iovec value;
- struct xfs_log_iovec new_value; /* PPTR_REPLACE only */
+ struct kvec name;
+ struct kvec new_name; /* PPTR_REPLACE only */
+ struct kvec value;
+ struct kvec new_value; /* PPTR_REPLACE only */
refcount_t refcount;
/* name and value follow the end of this struct */
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 646c515ee355..80f0c4bcc483 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -654,24 +654,24 @@ xlog_recover_bui_commit_pass2(
struct xfs_bui_log_format *bui_formatp;
size_t len;
- bui_formatp = item->ri_buf[0].i_addr;
+ bui_formatp = item->ri_buf[0].iov_base;
- if (item->ri_buf[0].i_len < xfs_bui_log_format_sizeof(0)) {
+ if (item->ri_buf[0].iov_len < xfs_bui_log_format_sizeof(0)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
len = xfs_bui_log_format_sizeof(bui_formatp->bui_nextents);
- if (item->ri_buf[0].i_len != len) {
+ if (item->ri_buf[0].iov_len != len) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -705,10 +705,10 @@ xlog_recover_bud_commit_pass2(
{
struct xfs_bud_log_format *bud_formatp;
- bud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) {
+ bud_formatp = item->ri_buf[0].iov_base;
+ if (item->ri_buf[0].iov_len != sizeof(struct xfs_bud_log_format)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 8af83bd161f9..f9ef3b2a332a 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1683,7 +1683,7 @@ xfs_free_buftarg(
fs_put_dax(btp->bt_daxdev, btp->bt_mount);
/* the main block device is closed by kill_block_super */
if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev)
- bdev_fput(btp->bt_bdev_file);
+ bdev_fput(btp->bt_file);
kfree(btp);
}
@@ -1712,8 +1712,8 @@ xfs_configure_buftarg_atomic_writes(
max_bytes = 0;
}
- btp->bt_bdev_awu_min = min_bytes;
- btp->bt_bdev_awu_max = max_bytes;
+ btp->bt_awu_min = min_bytes;
+ btp->bt_awu_max = max_bytes;
}
/* Configure a buffer target that abstracts a block device. */
@@ -1738,14 +1738,9 @@ xfs_configure_buftarg(
return -EINVAL;
}
- /*
- * Flush the block device pagecache so our bios see anything dirtied
- * before mount.
- */
if (bdev_can_atomic_write(btp->bt_bdev))
xfs_configure_buftarg_atomic_writes(btp);
-
- return sync_blockdev(btp->bt_bdev);
+ return 0;
}
int
@@ -1803,7 +1798,7 @@ xfs_alloc_buftarg(
btp = kzalloc(sizeof(*btp), GFP_KERNEL | __GFP_NOFAIL);
btp->bt_mount = mp;
- btp->bt_bdev_file = bdev_file;
+ btp->bt_file = bdev_file;
btp->bt_bdev = file_bdev(bdev_file);
btp->bt_dev = btp->bt_bdev->bd_dev;
btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off,
@@ -2082,44 +2077,6 @@ xfs_buf_delwri_submit(
return error;
}
-/*
- * Push a single buffer on a delwri queue.
- *
- * The purpose of this function is to submit a single buffer of a delwri queue
- * and return with the buffer still on the original queue.
- *
- * The buffer locking and queue management logic between _delwri_pushbuf() and
- * _delwri_queue() guarantee that the buffer cannot be queued to another list
- * before returning.
- */
-int
-xfs_buf_delwri_pushbuf(
- struct xfs_buf *bp,
- struct list_head *buffer_list)
-{
- int error;
-
- ASSERT(bp->b_flags & _XBF_DELWRI_Q);
-
- trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);
-
- xfs_buf_lock(bp);
- bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC);
- bp->b_flags |= XBF_WRITE;
- xfs_buf_submit(bp);
-
- /*
- * The buffer is now locked, under I/O but still on the original delwri
- * queue. Wait for I/O completion, restore the DELWRI_Q flag and
- * return with the buffer unlocked and still on the original queue.
- */
- error = xfs_buf_iowait(bp);
- bp->b_flags |= _XBF_DELWRI_Q;
- xfs_buf_unlock(bp);
-
- return error;
-}
-
void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
{
/*
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 9d2ab567cf81..b269e115d9ac 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -94,7 +94,6 @@ void xfs_buf_cache_destroy(struct xfs_buf_cache *bch);
*/
struct xfs_buftarg {
dev_t bt_dev;
- struct file *bt_bdev_file;
struct block_device *bt_bdev;
struct dax_device *bt_daxdev;
struct file *bt_file;
@@ -112,9 +111,9 @@ struct xfs_buftarg {
struct percpu_counter bt_readahead_count;
struct ratelimit_state bt_ioerror_rl;
- /* Atomic write unit values, bytes */
- unsigned int bt_bdev_awu_min;
- unsigned int bt_bdev_awu_max;
+ /* Hardware atomic write unit values, bytes */
+ unsigned int bt_awu_min;
+ unsigned int bt_awu_max;
/* built-in cache, if we're not using the perag one */
struct xfs_buf_cache bt_cache[];
@@ -326,7 +325,6 @@ extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
void xfs_buf_delwri_queue_here(struct xfs_buf *bp, struct list_head *bl);
extern int xfs_buf_delwri_submit(struct list_head *);
extern int xfs_buf_delwri_submit_nowait(struct list_head *);
-extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *);
static inline xfs_daddr_t xfs_buf_daddr(struct xfs_buf *bp)
{
@@ -376,7 +374,6 @@ extern void xfs_buftarg_wait(struct xfs_buftarg *);
extern void xfs_buftarg_drain(struct xfs_buftarg *);
int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize);
-#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 90139e0f3271..8d85b5eee444 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -32,19 +32,74 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
return container_of(lip, struct xfs_buf_log_item, bli_item);
}
+static void
+xfs_buf_item_get_format(
+ struct xfs_buf_log_item *bip,
+ int count)
+{
+ ASSERT(bip->bli_formats == NULL);
+ bip->bli_format_count = count;
+
+ if (count == 1) {
+ bip->bli_formats = &bip->__bli_format;
+ return;
+ }
+
+ bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format),
+ GFP_KERNEL | __GFP_NOFAIL);
+}
+
+static void
+xfs_buf_item_free_format(
+ struct xfs_buf_log_item *bip)
+{
+ if (bip->bli_formats != &bip->__bli_format) {
+ kfree(bip->bli_formats);
+ bip->bli_formats = NULL;
+ }
+}
+
+static void
+xfs_buf_item_free(
+ struct xfs_buf_log_item *bip)
+{
+ xfs_buf_item_free_format(bip);
+ kvfree(bip->bli_item.li_lv_shadow);
+ kmem_cache_free(xfs_buf_item_cache, bip);
+}
+
+/*
+ * xfs_buf_item_relse() is called when the buf log item is no longer needed.
+ */
+static void
+xfs_buf_item_relse(
+ struct xfs_buf_log_item *bip)
+{
+ struct xfs_buf *bp = bip->bli_buf;
+
+ trace_xfs_buf_item_relse(bp, _RET_IP_);
+
+ ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
+ ASSERT(atomic_read(&bip->bli_refcount) == 0);
+
+ bp->b_log_item = NULL;
+ xfs_buf_rele(bp);
+ xfs_buf_item_free(bip);
+}
+
/* Is this log iovec plausibly large enough to contain the buffer log format? */
bool
xfs_buf_log_check_iovec(
- struct xfs_log_iovec *iovec)
+ struct kvec *iovec)
{
- struct xfs_buf_log_format *blfp = iovec->i_addr;
+ struct xfs_buf_log_format *blfp = iovec->iov_base;
char *bmp_end;
char *item_end;
- if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->i_len)
+ if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->iov_len)
return false;
- item_end = (char *)iovec->i_addr + iovec->i_len;
+ item_end = (char *)iovec->iov_base + iovec->iov_len;
bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size];
return bmp_end <= item_end;
}
@@ -390,6 +445,42 @@ xfs_buf_item_pin(
}
/*
+ * For a stale BLI, process all the necessary completions that must be
+ * performed when the final BLI reference goes away. The buffer will be
+ * referenced and locked here - we return to the caller with the buffer still
+ * referenced and locked for them to finalise processing of the buffer.
+ */
+static void
+xfs_buf_item_finish_stale(
+ struct xfs_buf_log_item *bip)
+{
+ struct xfs_buf *bp = bip->bli_buf;
+ struct xfs_log_item *lip = &bip->bli_item;
+
+ ASSERT(bip->bli_flags & XFS_BLI_STALE);
+ ASSERT(xfs_buf_islocked(bp));
+ ASSERT(bp->b_flags & XBF_STALE);
+ ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
+ ASSERT(list_empty(&lip->li_trans));
+ ASSERT(!bp->b_transp);
+
+ if (bip->bli_flags & XFS_BLI_STALE_INODE) {
+ xfs_buf_item_done(bp);
+ xfs_buf_inode_iodone(bp);
+ ASSERT(list_empty(&bp->b_li_list));
+ return;
+ }
+
+ /*
+	 * We may or may not be on the AIL here; xfs_trans_ail_delete() will do
+ * the right thing regardless of the situation in which we are called.
+ */
+ xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
+ xfs_buf_item_relse(bip);
+ ASSERT(bp->b_log_item == NULL);
+}
+
+/*
* This is called to unpin the buffer associated with the buf log item which was
* previously pinned with a call to xfs_buf_item_pin(). We enter this function
* with a buffer pin count, a buffer reference and a BLI reference.
@@ -438,13 +529,6 @@ xfs_buf_item_unpin(
}
if (stale) {
- ASSERT(bip->bli_flags & XFS_BLI_STALE);
- ASSERT(xfs_buf_islocked(bp));
- ASSERT(bp->b_flags & XBF_STALE);
- ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
- ASSERT(list_empty(&lip->li_trans));
- ASSERT(!bp->b_transp);
-
trace_xfs_buf_item_unpin_stale(bip);
/*
@@ -455,22 +539,7 @@ xfs_buf_item_unpin(
* processing is complete.
*/
xfs_buf_rele(bp);
-
- /*
- * If we get called here because of an IO error, we may or may
- * not have the item on the AIL. xfs_trans_ail_delete() will
- * take care of that situation. xfs_trans_ail_delete() drops
- * the AIL lock.
- */
- if (bip->bli_flags & XFS_BLI_STALE_INODE) {
- xfs_buf_item_done(bp);
- xfs_buf_inode_iodone(bp);
- ASSERT(list_empty(&bp->b_li_list));
- } else {
- xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
- xfs_buf_item_relse(bp);
- ASSERT(bp->b_log_item == NULL);
- }
+ xfs_buf_item_finish_stale(bip);
xfs_buf_relse(bp);
return;
}
@@ -543,43 +612,42 @@ xfs_buf_item_push(
* Drop the buffer log item refcount and take appropriate action. This helper
* determines whether the bli must be freed or not, since a decrement to zero
* does not necessarily mean the bli is unused.
- *
- * Return true if the bli is freed, false otherwise.
*/
-bool
+void
xfs_buf_item_put(
struct xfs_buf_log_item *bip)
{
- struct xfs_log_item *lip = &bip->bli_item;
- bool aborted;
- bool dirty;
+
+ ASSERT(xfs_buf_islocked(bip->bli_buf));
/* drop the bli ref and return if it wasn't the last one */
if (!atomic_dec_and_test(&bip->bli_refcount))
- return false;
+ return;
- /*
- * We dropped the last ref and must free the item if clean or aborted.
- * If the bli is dirty and non-aborted, the buffer was clean in the
- * transaction but still awaiting writeback from previous changes. In
- * that case, the bli is freed on buffer writeback completion.
- */
- aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
- xlog_is_shutdown(lip->li_log);
- dirty = bip->bli_flags & XFS_BLI_DIRTY;
- if (dirty && !aborted)
- return false;
+ /* If the BLI is in the AIL, then it is still dirty and in use */
+ if (test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)) {
+ ASSERT(bip->bli_flags & XFS_BLI_DIRTY);
+ return;
+ }
/*
- * The bli is aborted or clean. An aborted item may be in the AIL
- * regardless of dirty state. For example, consider an aborted
- * transaction that invalidated a dirty bli and cleared the dirty
- * state.
+ * In shutdown conditions, we can be asked to free a dirty BLI that
+ * isn't in the AIL. This can occur due to a checkpoint aborting a BLI
+ * instead of inserting it into the AIL at checkpoint IO completion. If
+ * there's another bli reference (e.g. a btree cursor holds a clean
+ * reference) and it is released via xfs_trans_brelse(), we can get here
+ * with that aborted, dirty BLI. In this case, it is safe to free the
+ * dirty BLI immediately, as it is not in the AIL and there are no
+ * other references to it.
+ *
+ * We should never get here with a stale BLI via that path as
+ * xfs_trans_brelse() specifically holds onto stale buffers rather than
+ * releasing them.
*/
- if (aborted)
- xfs_trans_ail_delete(lip, 0);
- xfs_buf_item_relse(bip->bli_buf);
- return true;
+ ASSERT(!(bip->bli_flags & XFS_BLI_DIRTY) ||
+ test_bit(XFS_LI_ABORTED, &bip->bli_item.li_flags));
+ ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+ xfs_buf_item_relse(bip);
}
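After this rework the BLI free decision reduces to three cases once the
refcount drops: still in the AIL (dirty and in use, keep it), dirty but
aborted during shutdown (safe to free), or clean (free). Condensed from the
function above:

	if (!atomic_dec_and_test(&bip->bli_refcount))
		return;				/* other references remain */
	if (test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags))
		return;				/* dirty, awaiting writeback */
	xfs_buf_item_relse(bip);		/* clean, or aborted and dirty */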
/*
@@ -600,6 +668,15 @@ xfs_buf_item_put(
* if necessary but do not unlock the buffer. This is for support of
* xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
* free the item.
+ *
+ * If the XFS_BLI_STALE flag is set, the last reference to the BLI *must*
+ * perform a completion abort of any objects attached to the buffer for IO
+ * tracking purposes. This generally only happens in shutdown situations,
+ * normally xfs_buf_item_unpin() will drop the last BLI reference and perform
+ * completion processing. However, because transaction completion can race with
+ * checkpoint completion during a shutdown, this release context may end up
+ * being the last active reference to the BLI and so needs to perform this
+ * cleanup.
*/
STATIC void
xfs_buf_item_release(
@@ -607,18 +684,19 @@ xfs_buf_item_release(
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;
- bool released;
bool hold = bip->bli_flags & XFS_BLI_HOLD;
bool stale = bip->bli_flags & XFS_BLI_STALE;
-#if defined(DEBUG) || defined(XFS_WARN)
- bool ordered = bip->bli_flags & XFS_BLI_ORDERED;
- bool dirty = bip->bli_flags & XFS_BLI_DIRTY;
bool aborted = test_bit(XFS_LI_ABORTED,
&lip->li_flags);
+ bool dirty = bip->bli_flags & XFS_BLI_DIRTY;
+#if defined(DEBUG) || defined(XFS_WARN)
+ bool ordered = bip->bli_flags & XFS_BLI_ORDERED;
#endif
trace_xfs_buf_item_release(bip);
+ ASSERT(xfs_buf_islocked(bp));
+
/*
* The bli dirty state should match whether the blf has logged segments
* except for ordered buffers, where only the bli should be dirty.
@@ -634,16 +712,56 @@ xfs_buf_item_release(
bp->b_transp = NULL;
bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
+ /* If there are other references, then we have nothing to do. */
+ if (!atomic_dec_and_test(&bip->bli_refcount))
+ goto out_release;
+
+ /*
+ * Stale buffer completion frees the BLI, unlocks and releases the
+	 * buffer. Neither the BLI nor the buffer is safe to reference after this
+ * call, so there's nothing more we need to do here.
+ *
+ * If we get here with a stale buffer and references to the BLI remain,
+ * we must not unlock the buffer as the last BLI reference owns lock
+ * context, not us.
+ */
+ if (stale) {
+ xfs_buf_item_finish_stale(bip);
+ xfs_buf_relse(bp);
+ ASSERT(!hold);
+ return;
+ }
+
/*
- * Unref the item and unlock the buffer unless held or stale. Stale
- * buffers remain locked until final unpin unless the bli is freed by
- * the unref call. The latter implies shutdown because buffer
- * invalidation dirties the bli and transaction.
+ * Dirty or clean, aborted items are done and need to be removed from
+ * the AIL and released. This frees the BLI, but leaves the buffer
+ * locked and referenced.
*/
- released = xfs_buf_item_put(bip);
- if (hold || (stale && !released))
+ if (aborted || xlog_is_shutdown(lip->li_log)) {
+ ASSERT(list_empty(&bip->bli_buf->b_li_list));
+ xfs_buf_item_done(bp);
+ goto out_release;
+ }
+
+ /*
+ * Clean, unreferenced BLIs can be immediately freed, leaving the buffer
+ * locked and referenced.
+ *
+ * Dirty, unreferenced BLIs *must* be in the AIL awaiting writeback.
+ */
+ if (!dirty)
+ xfs_buf_item_relse(bip);
+ else
+ ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags));
+
+ /* Not safe to reference the BLI from here */
+out_release:
+ /*
+ * If we get here with a stale buffer, we must not unlock the
+ * buffer as the last BLI reference owns lock context, not us.
+ */
+ if (stale || hold)
return;
- ASSERT(!stale || aborted);
xfs_buf_relse(bp);
}
@@ -729,33 +847,6 @@ static const struct xfs_item_ops xfs_buf_item_ops = {
.iop_push = xfs_buf_item_push,
};
-STATIC void
-xfs_buf_item_get_format(
- struct xfs_buf_log_item *bip,
- int count)
-{
- ASSERT(bip->bli_formats == NULL);
- bip->bli_format_count = count;
-
- if (count == 1) {
- bip->bli_formats = &bip->__bli_format;
- return;
- }
-
- bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format),
- GFP_KERNEL | __GFP_NOFAIL);
-}
-
-STATIC void
-xfs_buf_item_free_format(
- struct xfs_buf_log_item *bip)
-{
- if (bip->bli_formats != &bip->__bli_format) {
- kfree(bip->bli_formats);
- bip->bli_formats = NULL;
- }
-}
-
/*
* Allocate a new buf log item to go with the given buffer.
* Set the buffer's b_log_item field to point to the new
@@ -976,34 +1067,6 @@ xfs_buf_item_dirty_format(
return false;
}
-STATIC void
-xfs_buf_item_free(
- struct xfs_buf_log_item *bip)
-{
- xfs_buf_item_free_format(bip);
- kvfree(bip->bli_item.li_lv_shadow);
- kmem_cache_free(xfs_buf_item_cache, bip);
-}
-
-/*
- * xfs_buf_item_relse() is called when the buf log item is no longer needed.
- */
-void
-xfs_buf_item_relse(
- struct xfs_buf *bp)
-{
- struct xfs_buf_log_item *bip = bp->b_log_item;
-
- trace_xfs_buf_item_relse(bp, _RET_IP_);
- ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
-
- if (atomic_read(&bip->bli_refcount))
- return;
- bp->b_log_item = NULL;
- xfs_buf_rele(bp);
- xfs_buf_item_free(bip);
-}
-
void
xfs_buf_item_done(
struct xfs_buf *bp)
@@ -1023,5 +1086,5 @@ xfs_buf_item_done(
xfs_trans_ail_delete(&bp->b_log_item->bli_item,
(bp->b_flags & _XBF_LOGRECOVERY) ? 0 :
SHUTDOWN_CORRUPT_INCORE);
- xfs_buf_item_relse(bp);
+ xfs_buf_item_relse(bp->b_log_item);
}
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index e10e324cd245..3159325dd17b 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -49,8 +49,7 @@ struct xfs_buf_log_item {
int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
void xfs_buf_item_done(struct xfs_buf *bp);
-void xfs_buf_item_relse(struct xfs_buf *);
-bool xfs_buf_item_put(struct xfs_buf_log_item *);
+void xfs_buf_item_put(struct xfs_buf_log_item *bip);
void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint);
bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
void xfs_buf_inode_iodone(struct xfs_buf *);
@@ -62,7 +61,7 @@ static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp)
}
#endif /* CONFIG_XFS_QUOTA */
void xfs_buf_iodone(struct xfs_buf *);
-bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec);
+bool xfs_buf_log_check_iovec(struct kvec *iovec);
unsigned int xfs_buf_inval_log_space(unsigned int map_count,
unsigned int blocksize);
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index d4c5cef5bc43..5d58e2ae4972 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -159,7 +159,7 @@ STATIC enum xlog_recover_reorder
xlog_recover_buf_reorder(
struct xlog_recover_item *item)
{
- struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
+ struct xfs_buf_log_format *buf_f = item->ri_buf[0].iov_base;
if (buf_f->blf_flags & XFS_BLF_CANCEL)
return XLOG_REORDER_CANCEL_LIST;
@@ -173,7 +173,7 @@ xlog_recover_buf_ra_pass2(
struct xlog *log,
struct xlog_recover_item *item)
{
- struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
+ struct xfs_buf_log_format *buf_f = item->ri_buf[0].iov_base;
xlog_buf_readahead(log, buf_f->blf_blkno, buf_f->blf_len, NULL);
}
@@ -187,11 +187,11 @@ xlog_recover_buf_commit_pass1(
struct xlog *log,
struct xlog_recover_item *item)
{
- struct xfs_buf_log_format *bf = item->ri_buf[0].i_addr;
+ struct xfs_buf_log_format *bf = item->ri_buf[0].iov_base;
if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) {
- xfs_err(log->l_mp, "bad buffer log item size (%d)",
- item->ri_buf[0].i_len);
+ xfs_err(log->l_mp, "bad buffer log item size (%zd)",
+ item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
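The specifier change follows from the type change: kvec's iov_len is a
size_t, which must be printed with the %z length modifier rather than %d.
For example:

	pr_err("bad buffer log item size (%zd)", item->ri_buf[0].iov_len);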
@@ -487,8 +487,8 @@ xlog_recover_do_reg_buffer(
nbits = xfs_contig_bits(buf_f->blf_data_map,
buf_f->blf_map_size, bit);
ASSERT(nbits > 0);
- ASSERT(item->ri_buf[i].i_addr != NULL);
- ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
+ ASSERT(item->ri_buf[i].iov_base != NULL);
+ ASSERT(item->ri_buf[i].iov_len % XFS_BLF_CHUNK == 0);
ASSERT(BBTOB(bp->b_length) >=
((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
@@ -500,8 +500,8 @@ xlog_recover_do_reg_buffer(
* the log. Hence we need to trim nbits back to the length of
* the current region being copied out of the log.
*/
- if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
- nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
+ if (item->ri_buf[i].iov_len < (nbits << XFS_BLF_SHIFT))
+ nbits = item->ri_buf[i].iov_len >> XFS_BLF_SHIFT;
/*
* Do a sanity check if this is a dquot buffer. Just checking
@@ -511,18 +511,18 @@ xlog_recover_do_reg_buffer(
fa = NULL;
if (buf_f->blf_flags &
(XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
- if (item->ri_buf[i].i_addr == NULL) {
+ if (item->ri_buf[i].iov_base == NULL) {
xfs_alert(mp,
"XFS: NULL dquot in %s.", __func__);
goto next;
}
- if (item->ri_buf[i].i_len < size_disk_dquot) {
+ if (item->ri_buf[i].iov_len < size_disk_dquot) {
xfs_alert(mp,
- "XFS: dquot too small (%d) in %s.",
- item->ri_buf[i].i_len, __func__);
+ "XFS: dquot too small (%zd) in %s.",
+ item->ri_buf[i].iov_len, __func__);
goto next;
}
- fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, -1);
+ fa = xfs_dquot_verify(mp, item->ri_buf[i].iov_base, -1);
if (fa) {
xfs_alert(mp,
"dquot corrupt at %pS trying to replay into block 0x%llx",
@@ -533,7 +533,7 @@ xlog_recover_do_reg_buffer(
memcpy(xfs_buf_offset(bp,
(uint)bit << XFS_BLF_SHIFT), /* dest */
- item->ri_buf[i].i_addr, /* source */
+ item->ri_buf[i].iov_base, /* source */
nbits<<XFS_BLF_SHIFT); /* length */
next:
i++;
@@ -669,8 +669,8 @@ xlog_recover_do_inode_buffer(
if (next_unlinked_offset < reg_buf_offset)
continue;
- ASSERT(item->ri_buf[item_index].i_addr != NULL);
- ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
+ ASSERT(item->ri_buf[item_index].iov_base != NULL);
+ ASSERT((item->ri_buf[item_index].iov_len % XFS_BLF_CHUNK) == 0);
ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length));
/*
@@ -678,7 +678,7 @@ xlog_recover_do_inode_buffer(
* current di_next_unlinked field. Extract its value
* and copy it to the buffer copy.
*/
- logged_nextp = item->ri_buf[item_index].i_addr +
+ logged_nextp = item->ri_buf[item_index].iov_base +
next_unlinked_offset - reg_buf_offset;
if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) {
xfs_alert(mp,
@@ -1002,7 +1002,7 @@ xlog_recover_buf_commit_pass2(
struct xlog_recover_item *item,
xfs_lsn_t current_lsn)
{
- struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
+ struct xfs_buf_log_format *buf_f = item->ri_buf[0].iov_base;
struct xfs_mount *mp = log->l_mp;
struct xfs_buf *bp;
int error;
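A knock-on effect of the kvec conversion, visible in the xfs_err() hunks above: iov_len is a size_t where i_len was an int, so message format strings need the 'z' length modifier (the patch uses %zd). A standalone illustration, with printf standing in for xfs_err:

	#include <stdio.h>
	#include <stddef.h>

	int main(void)
	{
		size_t iov_len = 24;

		/* size_t needs the 'z' modifier; plain %d is wrong on LP64 */
		printf("bad buffer log item size (%zu)\n", iov_len);
		return 0;
	}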
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 94d0873bcd62..ee49f20875af 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -103,24 +103,6 @@ xfs_discard_endio(
bio_put(bio);
}
-static inline struct block_device *
-xfs_group_bdev(
- const struct xfs_group *xg)
-{
- struct xfs_mount *mp = xg->xg_mount;
-
- switch (xg->xg_type) {
- case XG_TYPE_AG:
- return mp->m_ddev_targp->bt_bdev;
- case XG_TYPE_RTG:
- return mp->m_rtdev_targp->bt_bdev;
- default:
- ASSERT(0);
- break;
- }
- return NULL;
-}
-
/*
* Walk the discard list and issue discards on all the busy extents in the
* list. We plug and chain the bios so that we only need a single completion
@@ -138,11 +120,14 @@ xfs_discard_extents(
blk_start_plug(&plug);
list_for_each_entry(busyp, &extents->extent_list, list) {
- trace_xfs_discard_extent(busyp->group, busyp->bno,
- busyp->length);
+ struct xfs_group *xg = busyp->group;
+ struct xfs_buftarg *btp =
+ xfs_group_type_buftarg(xg->xg_mount, xg->xg_type);
- error = __blkdev_issue_discard(xfs_group_bdev(busyp->group),
- xfs_gbno_to_daddr(busyp->group, busyp->bno),
+ trace_xfs_discard_extent(xg, busyp->bno, busyp->length);
+
+ error = __blkdev_issue_discard(btp->bt_bdev,
+ xfs_gbno_to_daddr(xg, busyp->bno),
XFS_FSB_TO_BB(mp, busyp->length),
GFP_KERNEL, &bio);
if (error && error != -EOPNOTSUPP) {
@@ -204,9 +189,7 @@ xfs_trim_gather_extents(
*/
xfs_log_force(mp, XFS_LOG_SYNC);
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
+ tp = xfs_trans_alloc_empty(mp);
error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
if (error)
@@ -598,9 +581,7 @@ xfs_trim_rtextents(
struct xfs_trans *tp;
int error;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
+ tp = xfs_trans_alloc_empty(mp);
/*
* Walk the free ranges between low and high. The query_range function
@@ -716,9 +697,7 @@ xfs_trim_rtgroup_extents(
struct xfs_trans *tp;
int error;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
+ tp = xfs_trans_alloc_empty(mp);
/*
* Walk the free ranges between low and high. The query_range function
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index b4e32f0860b7..0bd8022e47b4 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -1398,11 +1398,9 @@ xfs_qm_dqflush(
ASSERT(XFS_DQ_IS_LOCKED(dqp));
ASSERT(!completion_done(&dqp->q_flush));
+ ASSERT(atomic_read(&dqp->q_pincount) == 0);
trace_xfs_dqflush(dqp);
-
- xfs_qm_dqunpin_wait(dqp);
-
fa = xfs_qm_dqflush_check(dqp);
if (fa) {
xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS",
diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c
index 2c2720ce6923..89bc9bcaf51e 100644
--- a/fs/xfs/xfs_dquot_item_recover.c
+++ b/fs/xfs/xfs_dquot_item_recover.c
@@ -34,10 +34,10 @@ xlog_recover_dquot_ra_pass2(
if (mp->m_qflags == 0)
return;
- recddq = item->ri_buf[1].i_addr;
+ recddq = item->ri_buf[1].iov_base;
if (recddq == NULL)
return;
- if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
+ if (item->ri_buf[1].iov_len < sizeof(struct xfs_disk_dquot))
return;
type = recddq->d_type & XFS_DQTYPE_REC_MASK;
@@ -45,7 +45,7 @@ xlog_recover_dquot_ra_pass2(
if (log->l_quotaoffs_flag & type)
return;
- dq_f = item->ri_buf[0].i_addr;
+ dq_f = item->ri_buf[0].iov_base;
ASSERT(dq_f);
ASSERT(dq_f->qlf_len == 1);
@@ -79,14 +79,14 @@ xlog_recover_dquot_commit_pass2(
if (mp->m_qflags == 0)
return 0;
- recddq = item->ri_buf[1].i_addr;
+ recddq = item->ri_buf[1].iov_base;
if (recddq == NULL) {
xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
return -EFSCORRUPTED;
}
- if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) {
- xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
- item->ri_buf[1].i_len, __func__);
+ if (item->ri_buf[1].iov_len < sizeof(struct xfs_disk_dquot)) {
+ xfs_alert(log->l_mp, "dquot too small (%zd) in %s.",
+ item->ri_buf[1].iov_len, __func__);
return -EFSCORRUPTED;
}
@@ -108,7 +108,7 @@ xlog_recover_dquot_commit_pass2(
* The other possibility, of course, is that the quota subsystem was
* removed since the last mount - ENOSYS.
*/
- dq_f = item->ri_buf[0].i_addr;
+ dq_f = item->ri_buf[0].iov_base;
ASSERT(dq_f);
fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id);
if (fa) {
@@ -147,7 +147,7 @@ xlog_recover_dquot_commit_pass2(
}
}
- memcpy(ddq, recddq, item->ri_buf[1].i_len);
+ memcpy(ddq, recddq, item->ri_buf[1].iov_len);
if (xfs_has_crc(mp)) {
xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
@@ -192,7 +192,7 @@ xlog_recover_quotaoff_commit_pass1(
struct xlog *log,
struct xlog_recover_item *item)
{
- struct xfs_qoff_logformat *qoff_f = item->ri_buf[0].i_addr;
+ struct xfs_qoff_logformat *qoff_f = item->ri_buf[0].iov_base;
ASSERT(qoff_f);
/*
diff --git a/fs/xfs/xfs_exchmaps_item.c b/fs/xfs/xfs_exchmaps_item.c
index 264a121c5e16..229cbe0adf17 100644
--- a/fs/xfs/xfs_exchmaps_item.c
+++ b/fs/xfs/xfs_exchmaps_item.c
@@ -558,12 +558,12 @@ xlog_recover_xmi_commit_pass2(
size_t len;
len = sizeof(struct xfs_xmi_log_format);
- if (item->ri_buf[0].i_len != len) {
+ if (item->ri_buf[0].iov_len != len) {
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
return -EFSCORRUPTED;
}
- xmi_formatp = item->ri_buf[0].i_addr;
+ xmi_formatp = item->ri_buf[0].iov_base;
if (xmi_formatp->__pad != 0) {
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
return -EFSCORRUPTED;
@@ -598,8 +598,8 @@ xlog_recover_xmd_commit_pass2(
{
struct xfs_xmd_log_format *xmd_formatp;
- xmd_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_xmd_log_format)) {
+ xmd_formatp = item->ri_buf[0].iov_base;
+ if (item->ri_buf[0].iov_len != sizeof(struct xfs_xmd_log_format)) {
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
index f069b04e8ea1..3e6e019b6146 100644
--- a/fs/xfs/xfs_extent_busy.h
+++ b/fs/xfs/xfs_extent_busy.h
@@ -68,4 +68,12 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
list_sort(NULL, list, xfs_extent_busy_ag_cmp);
}
+/*
+ * Zoned RTGs don't need to track busy extents, as the actual block freeing only
+ * happens by a zone reset, which first forces out all transactions that
+ * touched the zone being reset.
+ */
+#define xfs_group_has_extent_busy(mp, type) \
+ ((type) == XG_TYPE_AG || !xfs_has_zoned((mp)))
+
#endif /* __XFS_EXTENT_BUSY_H__ */
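The new xfs_group_has_extent_busy() predicate encodes that allocation groups always track busy extents, while realtime groups only need to on non-zoned filesystems. A standalone model with placeholder names (not kernel API) asserting the three interesting cases:

	#include <assert.h>
	#include <stdbool.h>

	enum xg_type { XG_TYPE_AG, XG_TYPE_RTG };

	static bool group_has_extent_busy(bool fs_is_zoned, enum xg_type type)
	{
		return type == XG_TYPE_AG || !fs_is_zoned;
	}

	int main(void)
	{
		assert(group_has_extent_busy(true, XG_TYPE_AG));	/* AGs: always */
		assert(!group_has_extent_busy(true, XG_TYPE_RTG));	/* zoned RTG: no */
		assert(group_has_extent_busy(false, XG_TYPE_RTG));	/* non-zoned RTG */
		return 0;
	}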
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index d574f5f639fa..47ee598a9827 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -182,15 +182,18 @@ xfs_efi_init(
* It will handle the conversion of formats if necessary.
*/
STATIC int
-xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
+xfs_efi_copy_format(
+ struct kvec *buf,
+ struct xfs_efi_log_format *dst_efi_fmt)
{
- xfs_efi_log_format_t *src_efi_fmt = buf->i_addr;
- uint i;
- uint len = xfs_efi_log_format_sizeof(src_efi_fmt->efi_nextents);
- uint len32 = xfs_efi_log_format32_sizeof(src_efi_fmt->efi_nextents);
- uint len64 = xfs_efi_log_format64_sizeof(src_efi_fmt->efi_nextents);
+ struct xfs_efi_log_format *src_efi_fmt = buf->iov_base;
+ uint len, len32, len64, i;
- if (buf->i_len == len) {
+ len = xfs_efi_log_format_sizeof(src_efi_fmt->efi_nextents);
+ len32 = xfs_efi_log_format32_sizeof(src_efi_fmt->efi_nextents);
+ len64 = xfs_efi_log_format64_sizeof(src_efi_fmt->efi_nextents);
+
+ if (buf->iov_len == len) {
memcpy(dst_efi_fmt, src_efi_fmt,
offsetof(struct xfs_efi_log_format, efi_extents));
for (i = 0; i < src_efi_fmt->efi_nextents; i++)
@@ -198,8 +201,8 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
&src_efi_fmt->efi_extents[i],
sizeof(struct xfs_extent));
return 0;
- } else if (buf->i_len == len32) {
- xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr;
+ } else if (buf->iov_len == len32) {
+ xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->iov_base;
dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type;
dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size;
@@ -212,8 +215,8 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
src_efi_fmt_32->efi_extents[i].ext_len;
}
return 0;
- } else if (buf->i_len == len64) {
- xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->i_addr;
+ } else if (buf->iov_len == len64) {
+ xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->iov_base;
dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type;
dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size;
@@ -227,8 +230,8 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
}
return 0;
}
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, NULL, buf->i_addr,
- buf->i_len);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, NULL, buf->iov_base,
+ buf->iov_len);
return -EFSCORRUPTED;
}
@@ -865,11 +868,11 @@ xlog_recover_efi_commit_pass2(
struct xfs_efi_log_format *efi_formatp;
int error;
- efi_formatp = item->ri_buf[0].i_addr;
+ efi_formatp = item->ri_buf[0].iov_base;
- if (item->ri_buf[0].i_len < xfs_efi_log_format_sizeof(0)) {
+ if (item->ri_buf[0].iov_len < xfs_efi_log_format_sizeof(0)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -904,11 +907,11 @@ xlog_recover_rtefi_commit_pass2(
struct xfs_efi_log_format *efi_formatp;
int error;
- efi_formatp = item->ri_buf[0].i_addr;
+ efi_formatp = item->ri_buf[0].iov_base;
- if (item->ri_buf[0].i_len < xfs_efi_log_format_sizeof(0)) {
+ if (item->ri_buf[0].iov_len < xfs_efi_log_format_sizeof(0)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -933,7 +936,7 @@ xlog_recover_rtefi_commit_pass2(
xfs_lsn_t lsn)
{
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
#endif
@@ -958,9 +961,9 @@ xlog_recover_efd_commit_pass2(
xfs_lsn_t lsn)
{
struct xfs_efd_log_format *efd_formatp;
- int buflen = item->ri_buf[0].i_len;
+ int buflen = item->ri_buf[0].iov_len;
- efd_formatp = item->ri_buf[0].i_addr;
+ efd_formatp = item->ri_buf[0].iov_base;
if (buflen < sizeof(struct xfs_efd_log_format)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
@@ -968,9 +971,9 @@ xlog_recover_efd_commit_pass2(
return -EFSCORRUPTED;
}
- if (item->ri_buf[0].i_len != xfs_efd_log_format32_sizeof(
+ if (item->ri_buf[0].iov_len != xfs_efd_log_format32_sizeof(
efd_formatp->efd_nextents) &&
- item->ri_buf[0].i_len != xfs_efd_log_format64_sizeof(
+ item->ri_buf[0].iov_len != xfs_efd_log_format64_sizeof(
efd_formatp->efd_nextents)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
efd_formatp, buflen);
@@ -995,9 +998,9 @@ xlog_recover_rtefd_commit_pass2(
xfs_lsn_t lsn)
{
struct xfs_efd_log_format *efd_formatp;
- int buflen = item->ri_buf[0].i_len;
+ int buflen = item->ri_buf[0].iov_len;
- efd_formatp = item->ri_buf[0].i_addr;
+ efd_formatp = item->ri_buf[0].iov_base;
if (buflen < sizeof(struct xfs_efd_log_format)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
@@ -1005,9 +1008,9 @@ xlog_recover_rtefd_commit_pass2(
return -EFSCORRUPTED;
}
- if (item->ri_buf[0].i_len != xfs_efd_log_format32_sizeof(
+ if (item->ri_buf[0].iov_len != xfs_efd_log_format32_sizeof(
efd_formatp->efd_nextents) &&
- item->ri_buf[0].i_len != xfs_efd_log_format64_sizeof(
+ item->ri_buf[0].iov_len != xfs_efd_log_format64_sizeof(
efd_formatp->efd_nextents)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
efd_formatp, buflen);
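xfs_efi_copy_format() above tells the native, 32-bit, and 64-bit on-log EFI layouts apart purely by the logged region length. A reduced sketch of that dispatch; the sizes and names below are illustrative placeholders:

	#include <stddef.h>

	enum efi_variant { EFI_NATIVE, EFI_32BIT, EFI_64BIT, EFI_CORRUPT };

	static enum efi_variant
	classify_efi(size_t iov_len, size_t len, size_t len32, size_t len64)
	{
		if (iov_len == len)
			return EFI_NATIVE;
		if (iov_len == len32)
			return EFI_32BIT;
		if (iov_len == len64)
			return EFI_64BIT;
		return EFI_CORRUPT;	/* kernel returns -EFSCORRUPTED */
	}

	int main(void)
	{
		return classify_efi(40, 40, 36, 44) == EFI_NATIVE ? 0 : 1;
	}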
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 48254a72071b..55a304cb3aef 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -497,7 +497,7 @@ restart:
static ssize_t
xfs_zoned_write_space_reserve(
- struct xfs_inode *ip,
+ struct xfs_mount *mp,
struct kiocb *iocb,
struct iov_iter *from,
unsigned int flags,
@@ -533,8 +533,8 @@ xfs_zoned_write_space_reserve(
*
* Any remaining block will be returned after the write.
*/
- return xfs_zoned_space_reserve(ip,
- XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2, flags, ac);
+ return xfs_zoned_space_reserve(mp, XFS_B_TO_FSB(mp, count) + 1 + 2,
+ flags, ac);
}
static int
@@ -718,13 +718,13 @@ xfs_file_dio_write_zoned(
struct xfs_zone_alloc_ctx ac = { };
ssize_t ret;
- ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac);
+ ret = xfs_zoned_write_space_reserve(ip->i_mount, iocb, from, 0, &ac);
if (ret < 0)
return ret;
ret = xfs_file_dio_write_aligned(ip, iocb, from,
&xfs_zoned_direct_write_iomap_ops,
&xfs_dio_zoned_write_ops, &ac);
- xfs_zoned_space_unreserve(ip, &ac);
+ xfs_zoned_space_unreserve(ip->i_mount, &ac);
return ret;
}
@@ -752,7 +752,7 @@ xfs_file_dio_write_atomic(
* HW offload should be faster, so try that first if it is already
* known that the write length is not too large.
*/
- if (ocount > xfs_inode_buftarg(ip)->bt_bdev_awu_max)
+ if (ocount > xfs_inode_buftarg(ip)->bt_awu_max)
dops = &xfs_atomic_write_cow_iomap_ops;
else
dops = &xfs_direct_write_iomap_ops;
@@ -979,7 +979,8 @@ write_retry:
trace_xfs_file_buffered_write(iocb, from);
ret = iomap_file_buffered_write(iocb, from,
- &xfs_buffered_write_iomap_ops, NULL);
+ &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
+ NULL);
/*
* If we hit a space limit, try to free up some lingering preallocated
@@ -1032,7 +1033,7 @@ xfs_file_buffered_write_zoned(
struct xfs_zone_alloc_ctx ac = { };
ssize_t ret;
- ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac);
+ ret = xfs_zoned_write_space_reserve(mp, iocb, from, XFS_ZR_GREEDY, &ac);
if (ret < 0)
return ret;
@@ -1059,7 +1060,8 @@ xfs_file_buffered_write_zoned(
retry:
trace_xfs_file_buffered_write(iocb, from);
ret = iomap_file_buffered_write(iocb, from,
- &xfs_buffered_write_iomap_ops, &ac);
+ &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
+ &ac);
if (ret == -ENOSPC && !cleared_space) {
/*
* Kick off writeback to convert delalloc space and release the
@@ -1073,7 +1075,7 @@ retry:
out_unlock:
xfs_iunlock(ip, iolock);
out_unreserve:
- xfs_zoned_space_unreserve(ip, &ac);
+ xfs_zoned_space_unreserve(ip->i_mount, &ac);
if (ret > 0) {
XFS_STATS_ADD(mp, xs_write_bytes, ret);
ret = generic_write_sync(iocb, ret);
@@ -1335,9 +1337,10 @@ xfs_falloc_allocate_range(
}
#define XFS_FALLOC_FL_SUPPORTED \
- (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
- FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
- FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
+ (FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE | \
+ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | \
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE | \
+ FALLOC_FL_UNSHARE_RANGE)
STATIC long
__xfs_file_fallocate(
@@ -1413,11 +1416,11 @@ xfs_file_zoned_fallocate(
struct xfs_inode *ip = XFS_I(file_inode(file));
int error;
- error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac);
+ error = xfs_zoned_space_reserve(ip->i_mount, 2, XFS_ZR_RESERVED, &ac);
if (error)
return error;
error = __xfs_file_fallocate(file, mode, offset, len, &ac);
- xfs_zoned_space_unreserve(ip, &ac);
+ xfs_zoned_space_unreserve(ip->i_mount, &ac);
return error;
}
@@ -1729,7 +1732,7 @@ xfs_dax_fault_locked(
bool write_fault)
{
vm_fault_t ret;
- pfn_t pfn;
+ unsigned long pfn;
if (!IS_ENABLED(CONFIG_FS_DAX)) {
ASSERT(0);
@@ -1827,12 +1830,12 @@ xfs_write_fault_zoned(
* But as the overallocation is limited to less than a folio and will be
* released instantly, that's just fine.
*/
- error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), 0,
- &ac);
+ error = xfs_zoned_space_reserve(ip->i_mount,
+ XFS_B_TO_FSB(ip->i_mount, len), 0, &ac);
if (error < 0)
return vmf_fs_error(error);
ret = __xfs_write_fault(vmf, order, &ac);
- xfs_zoned_space_unreserve(ip, &ac);
+ xfs_zoned_space_unreserve(ip->i_mount, &ac);
return ret;
}
@@ -1913,10 +1916,10 @@ static const struct vm_operations_struct xfs_file_vm_ops = {
};
STATIC int
-xfs_file_mmap(
- struct file *file,
- struct vm_area_struct *vma)
+xfs_file_mmap_prepare(
+ struct vm_area_desc *desc)
{
+ struct file *file = desc->file;
struct inode *inode = file_inode(file);
struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode));
@@ -1924,13 +1927,14 @@ xfs_file_mmap(
* We don't support synchronous mappings for non-DAX files and
* for DAX files if underneath dax_device is not synchronous.
*/
- if (!daxdev_mapping_supported(vma, target->bt_daxdev))
+ if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file),
+ target->bt_daxdev))
return -EOPNOTSUPP;
file_accessed(file);
- vma->vm_ops = &xfs_file_vm_ops;
+ desc->vm_ops = &xfs_file_vm_ops;
if (IS_DAX(inode))
- vm_flags_set(vma, VM_HUGEPAGE);
+ desc->vm_flags |= VM_HUGEPAGE;
return 0;
}
@@ -1945,7 +1949,7 @@ const struct file_operations xfs_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = xfs_file_compat_ioctl,
#endif
- .mmap = xfs_file_mmap,
+ .mmap_prepare = xfs_file_mmap_prepare,
.open = xfs_file_open,
.release = xfs_file_release,
.fsync = xfs_file_fsync,
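The mmap conversion above replaces .mmap(file, vma) with .mmap_prepare(desc): the handler fills in a vm_area_desc before any VMA exists, so flags are set with a plain OR instead of vm_flags_set(). A reduced userspace model; the struct below is a placeholder echoing only the fields the hunks touch:

	#include <stddef.h>

	#define VM_HUGEPAGE	0x01UL		/* placeholder flag value */

	struct vm_area_desc {			/* reduced model, not the kernel layout */
		void		*file;
		unsigned long	vm_flags;
		const void	*vm_ops;
	};

	static const int file_vm_ops;		/* stand-in for xfs_file_vm_ops */

	static int my_mmap_prepare(struct vm_area_desc *desc)
	{
		/* no live VMA yet: plain assignment replaces vm_flags_set() */
		desc->vm_ops = &file_vm_ops;
		desc->vm_flags |= VM_HUGEPAGE;
		return 0;
	}

	int main(void)
	{
		struct vm_area_desc desc = { 0 };

		return my_mmap_prepare(&desc);
	}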
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 414b27a86458..af68c7de8ee8 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -1270,9 +1270,7 @@ xfs_getfsmap(
* buffer locking abilities to detect cycles in the rmapbt
* without deadlocking.
*/
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- break;
+ tp = xfs_trans_alloc_empty(mp);
info.dev = handlers[i].dev;
info.last = false;
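This is one of many hunks in the series that drop the error return from xfs_trans_alloc_empty(): an empty transaction takes no log reservation and its allocation does not fail (the kernel side relies on __GFP_NOFAIL), so the helper now returns the transaction directly and the error plumbing disappears. A userspace model of the simplified calling convention, with abort() standing in for the no-fail guarantee:

	#include <stdlib.h>

	struct xfs_trans { int dummy; };

	static struct xfs_trans *trans_alloc_empty(void)
	{
		struct xfs_trans *tp = calloc(1, sizeof(*tp));

		if (!tp)
			abort();	/* models the kernel's no-fail allocation */
		return tp;
	}

	int main(void)
	{
		struct xfs_trans *tp = trans_alloc_empty();

		/* ... use tp ...; kernel callers end with xfs_trans_cancel(tp) */
		free(tp);
		return 0;
	}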
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 726e29b837e6..4cf7abe50143 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -893,10 +893,7 @@ xfs_metafile_iget(
struct xfs_trans *tp;
int error;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
-
+ tp = xfs_trans_alloc_empty(mp);
error = xfs_trans_metafile_iget(tp, ino, metafile_type, ipp);
xfs_trans_cancel(tp);
return error;
@@ -979,7 +976,15 @@ xfs_reclaim_inode(
*/
if (xlog_is_shutdown(ip->i_mount->m_log)) {
xfs_iunpin_wait(ip);
+ /*
+ * Avoid an ABBA deadlock on the inode cluster buffer vs
+ * concurrent xfs_ifree_cluster() trying to mark the inode
+ * stale. We don't need the inode locked to run the flush abort
+ * code, but the flush abort needs to lock the cluster buffer.
+ */
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_iflush_shutdown_abort(ip);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
goto reclaim;
}
if (xfs_ipincount(ip))
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 4345db501714..f83ec2bd0583 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -158,7 +158,7 @@ xlog_recover_icreate_commit_pass2(
int nbufs;
int i;
- icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
+ icl = (struct xfs_icreate_log *)item->ri_buf[0].iov_base;
if (icl->icl_type != XFS_LI_ICREATE) {
xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
return -EINVAL;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ee3e0f284287..9c39251961a3 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1635,7 +1635,7 @@ retry:
iip = ip->i_itemp;
if (__xfs_iflags_test(ip, XFS_IFLUSHING)) {
ASSERT(!list_empty(&iip->ili_item.li_bio_list));
- ASSERT(iip->ili_last_fields);
+ ASSERT(iip->ili_last_fields || xlog_is_shutdown(mp->m_log));
goto out_iunlock;
}
@@ -2932,12 +2932,9 @@ xfs_inode_reload_unlinked(
struct xfs_inode *ip)
{
struct xfs_trans *tp;
- int error;
-
- error = xfs_trans_alloc_empty(ip->i_mount, &tp);
- if (error)
- return error;
+ int error = 0;
+ tp = xfs_trans_alloc_empty(ip->i_mount);
xfs_ilock(ip, XFS_ILOCK_SHARED);
if (xfs_inode_unlinked_incomplete(ip))
error = xfs_inode_reload_unlinked_bucket(tp, ip);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index d7e2b902ef5c..07fbdcc4cbf5 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -358,7 +358,7 @@ static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip)
static inline bool xfs_inode_can_hw_atomic_write(const struct xfs_inode *ip)
{
- return xfs_inode_buftarg(ip)->bt_bdev_awu_max > 0;
+ return xfs_inode_buftarg(ip)->bt_awu_max > 0;
}
/*
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index c6cb0b6b9e46..829675700fcd 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -758,11 +758,14 @@ xfs_inode_item_push(
* completed and items removed from the AIL before the next push
* attempt.
*/
+ trace_xfs_inode_push_stale(ip, _RET_IP_);
return XFS_ITEM_PINNED;
}
- if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp))
+ if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp)) {
+ trace_xfs_inode_push_pinned(ip, _RET_IP_);
return XFS_ITEM_PINNED;
+ }
if (xfs_iflags_test(ip, XFS_IFLUSHING))
return XFS_ITEM_FLUSHING;
@@ -1179,12 +1182,12 @@ xfs_iflush_shutdown_abort(
*/
int
xfs_inode_item_format_convert(
- struct xfs_log_iovec *buf,
+ struct kvec *buf,
struct xfs_inode_log_format *in_f)
{
- struct xfs_inode_log_format_32 *in_f32 = buf->i_addr;
+ struct xfs_inode_log_format_32 *in_f32 = buf->iov_base;
- if (buf->i_len != sizeof(*in_f32)) {
+ if (buf->iov_len != sizeof(*in_f32)) {
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 377e06007804..ba92ce11a011 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -46,8 +46,8 @@ extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
extern void xfs_inode_item_destroy(struct xfs_inode *);
extern void xfs_iflush_abort(struct xfs_inode *);
extern void xfs_iflush_shutdown_abort(struct xfs_inode *);
-extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
- struct xfs_inode_log_format *);
+int xfs_inode_item_format_convert(struct kvec *buf,
+ struct xfs_inode_log_format *in_f);
extern struct kmem_cache *xfs_ili_cache;
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index 7205fd14f6b3..9d1999d41be1 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -30,13 +30,13 @@ xlog_recover_inode_ra_pass2(
struct xlog *log,
struct xlog_recover_item *item)
{
- if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
- struct xfs_inode_log_format *ilfp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].iov_len == sizeof(struct xfs_inode_log_format)) {
+ struct xfs_inode_log_format *ilfp = item->ri_buf[0].iov_base;
xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len,
&xfs_inode_buf_ra_ops);
} else {
- struct xfs_inode_log_format_32 *ilfp = item->ri_buf[0].i_addr;
+ struct xfs_inode_log_format_32 *ilfp = item->ri_buf[0].iov_base;
xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len,
&xfs_inode_buf_ra_ops);
@@ -326,8 +326,8 @@ xlog_recover_inode_commit_pass2(
int need_free = 0;
xfs_failaddr_t fa;
- if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
- in_f = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].iov_len == sizeof(struct xfs_inode_log_format)) {
+ in_f = item->ri_buf[0].iov_base;
} else {
in_f = kmalloc(sizeof(struct xfs_inode_log_format),
GFP_KERNEL | __GFP_NOFAIL);
@@ -366,7 +366,7 @@ xlog_recover_inode_commit_pass2(
error = -EFSCORRUPTED;
goto out_release;
}
- ldip = item->ri_buf[1].i_addr;
+ ldip = item->ri_buf[1].iov_base;
if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) {
xfs_alert(mp,
"%s: Bad inode log record, rec ptr "PTR_FMT", ino %lld",
@@ -472,12 +472,12 @@ xlog_recover_inode_commit_pass2(
goto out_release;
}
isize = xfs_log_dinode_size(mp);
- if (unlikely(item->ri_buf[1].i_len > isize)) {
+ if (unlikely(item->ri_buf[1].iov_len > isize)) {
XFS_CORRUPTION_ERROR("Bad log dinode size", XFS_ERRLEVEL_LOW,
mp, ldip, sizeof(*ldip));
xfs_alert(mp,
- "Bad inode 0x%llx log dinode size 0x%x",
- in_f->ilf_ino, item->ri_buf[1].i_len);
+ "Bad inode 0x%llx log dinode size 0x%zx",
+ in_f->ilf_ino, item->ri_buf[1].iov_len);
error = -EFSCORRUPTED;
goto out_release;
}
@@ -500,8 +500,8 @@ xlog_recover_inode_commit_pass2(
if (in_f->ilf_size == 2)
goto out_owner_change;
- len = item->ri_buf[2].i_len;
- src = item->ri_buf[2].i_addr;
+ len = item->ri_buf[2].iov_len;
+ src = item->ri_buf[2].iov_base;
ASSERT(in_f->ilf_size <= 4);
ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
ASSERT(!(fields & XFS_ILOG_DFORK) ||
@@ -538,8 +538,8 @@ xlog_recover_inode_commit_pass2(
} else {
attr_index = 2;
}
- len = item->ri_buf[attr_index].i_len;
- src = item->ri_buf[attr_index].i_addr;
+ len = item->ri_buf[attr_index].iov_len;
+ src = item->ri_buf[attr_index].iov_base;
ASSERT(len == xlog_calc_iovec_len(in_f->ilf_asize));
switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index d250f7f74e3b..fe1f74a3b6a3 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -444,7 +444,7 @@ static void
xfs_fill_fsxattr(
struct xfs_inode *ip,
int whichfork,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
@@ -496,7 +496,7 @@ xfs_ioc_fsgetxattra(
xfs_inode_t *ip,
void __user *arg)
{
- struct fileattr fa;
+ struct file_kattr fa;
xfs_ilock(ip, XFS_ILOCK_SHARED);
xfs_fill_fsxattr(ip, XFS_ATTR_FORK, &fa);
@@ -508,7 +508,7 @@ xfs_ioc_fsgetxattra(
int
xfs_fileattr_get(
struct dentry *dentry,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct xfs_inode *ip = XFS_I(d_inode(dentry));
@@ -526,7 +526,7 @@ static int
xfs_ioctl_setattr_xflags(
struct xfs_trans *tp,
struct xfs_inode *ip,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
bool rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME);
@@ -582,7 +582,7 @@ xfs_ioctl_setattr_xflags(
static void
xfs_ioctl_setattr_prepare_dax(
struct xfs_inode *ip,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
struct inode *inode = VFS_I(ip);
@@ -642,7 +642,7 @@ out_error:
static int
xfs_ioctl_setattr_check_extsize(
struct xfs_inode *ip,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
xfs_failaddr_t failaddr;
@@ -684,7 +684,7 @@ xfs_ioctl_setattr_check_extsize(
static int
xfs_ioctl_setattr_check_cowextsize(
struct xfs_inode *ip,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
xfs_failaddr_t failaddr;
@@ -709,7 +709,7 @@ xfs_ioctl_setattr_check_cowextsize(
static int
xfs_ioctl_setattr_check_projid(
struct xfs_inode *ip,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
if (!fa->fsx_valid)
return 0;
@@ -725,7 +725,7 @@ int
xfs_fileattr_set(
struct mnt_idmap *idmap,
struct dentry *dentry,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct xfs_inode *ip = XFS_I(d_inode(dentry));
struct xfs_mount *mp = ip->i_mount;
@@ -990,9 +990,8 @@ xfs_ioc_getlabel(
BUILD_BUG_ON(sizeof(sbp->sb_fname) > FSLABEL_MAX);
/* 1 larger than sb_fname, so this ensures a trailing NUL char */
- memset(label, 0, sizeof(label));
spin_lock(&mp->m_sb_lock);
- strncpy(label, sbp->sb_fname, XFSLABEL_MAX);
+ memtostr_pad(label, sbp->sb_fname);
spin_unlock(&mp->m_sb_lock);
if (copy_to_user(user_label, label, sizeof(label)))
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 12124946f347..f5ed5cf9d3df 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -17,13 +17,13 @@ xfs_ioc_swapext(
extern int
xfs_fileattr_get(
struct dentry *dentry,
- struct fileattr *fa);
+ struct file_kattr *fa);
extern int
xfs_fileattr_set(
struct mnt_idmap *idmap,
struct dentry *dentry,
- struct fileattr *fa);
+ struct file_kattr *fa);
extern long
xfs_file_ioctl(
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index ff05e6b1b0bb..2a74f2957341 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -79,6 +79,9 @@ xfs_iomap_valid(
{
struct xfs_inode *ip = XFS_I(inode);
+ if (iomap->type == IOMAP_HOLE)
+ return true;
+
if (iomap->validity_cookie !=
xfs_iomap_inode_sequence(ip, iomap->flags)) {
trace_xfs_iomap_invalid(ip, iomap);
@@ -89,7 +92,7 @@ xfs_iomap_valid(
return true;
}
-static const struct iomap_folio_ops xfs_iomap_folio_ops = {
+const struct iomap_write_ops xfs_iomap_write_ops = {
.iomap_valid = xfs_iomap_valid,
};
@@ -151,7 +154,6 @@ xfs_bmbt_to_iomap(
iomap->flags |= IOMAP_F_DIRTY;
iomap->validity_cookie = sequence_cookie;
- iomap->folio_ops = &xfs_iomap_folio_ops;
return 0;
}
@@ -827,7 +829,7 @@ xfs_bmap_hw_atomic_write_possible(
/*
* The ->iomap_begin caller should ensure this, but check anyway.
*/
- return len <= xfs_inode_buftarg(ip)->bt_bdev_awu_max;
+ return len <= xfs_inode_buftarg(ip)->bt_awu_max;
}
static int
@@ -2198,7 +2200,8 @@ xfs_zero_range(
return dax_zero_range(inode, pos, len, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_zero_range(inode, pos, len, did_zero,
- &xfs_buffered_write_iomap_ops, ac);
+ &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
+ ac);
}
int
@@ -2214,5 +2217,6 @@ xfs_truncate_page(
return dax_truncate_page(inode, pos, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_truncate_page(inode, pos, did_zero,
- &xfs_buffered_write_iomap_ops, ac);
+ &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
+ ac);
}
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 674f8ac1b9bd..ebcce7d49446 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -57,5 +57,6 @@ extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
extern const struct iomap_ops xfs_dax_write_iomap_ops;
extern const struct iomap_ops xfs_atomic_write_cow_iomap_ops;
+extern const struct iomap_write_ops xfs_iomap_write_ops;
#endif /* __XFS_IOMAP_H__*/
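The iomap hunks above move the iomap_valid callback out of the per-mapping iomap->folio_ops pointer and into an explicit iomap_write_ops table passed to iomap_file_buffered_write(), iomap_zero_range(), and iomap_truncate_page(). A reduced model of the shape; both structs are cut down to the one member the diff exercises:

	#include <stdbool.h>

	struct iomap {				/* reduced model */
		unsigned long	validity_cookie;
	};

	struct iomap_write_ops {		/* reduced model of the new table */
		bool (*iomap_valid)(const struct iomap *iomap, unsigned long seq);
	};

	static bool my_iomap_valid(const struct iomap *iomap, unsigned long seq)
	{
		/* stale mappings are detected by comparing sequence cookies */
		return iomap->validity_cookie == seq;
	}

	static const struct iomap_write_ops my_write_ops = {
		.iomap_valid = my_iomap_valid,
	};

	int main(void)
	{
		struct iomap m = { .validity_cookie = 7 };

		return my_write_ops.iomap_valid(&m, 7) ? 0 : 1;
	}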
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 8cddbb7c149b..149b5460fbfd 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -665,7 +665,7 @@ xfs_get_atomic_write_max_opt(
* less than our out of place write limit, but we don't want to exceed
* the awu_max.
*/
- return min(awu_max, xfs_inode_buftarg(ip)->bt_bdev_awu_max);
+ return min(awu_max, xfs_inode_buftarg(ip)->bt_awu_max);
}
static void
@@ -970,7 +970,7 @@ xfs_setattr_size(
* change.
*/
if (xfs_is_zoned_inode(ip)) {
- error = xfs_zoned_space_reserve(ip, 1,
+ error = xfs_zoned_space_reserve(mp, 1,
XFS_ZR_NOWAIT | XFS_ZR_RESERVED, &ac);
if (error) {
if (error == -EAGAIN)
@@ -998,7 +998,7 @@ xfs_setattr_size(
}
if (xfs_is_zoned_inode(ip))
- xfs_zoned_space_unreserve(ip, &ac);
+ xfs_zoned_space_unreserve(mp, &ac);
if (error)
return error;
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 1fa1c0564b0c..c8c9b8d8309f 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -239,14 +239,10 @@ xfs_bulkstat_one(
* Grab an empty transaction so that we can use its recursive buffer
* locking abilities to detect cycles in the inobt without deadlocking.
*/
- error = xfs_trans_alloc_empty(breq->mp, &tp);
- if (error)
- goto out;
-
+ tp = xfs_trans_alloc_empty(breq->mp);
error = xfs_bulkstat_one_int(breq->mp, breq->idmap, tp,
breq->startino, &bc);
xfs_trans_cancel(tp);
-out:
kfree(bc.buf);
/*
@@ -331,17 +327,13 @@ xfs_bulkstat(
* Grab an empty transaction so that we can use its recursive buffer
* locking abilities to detect cycles in the inobt without deadlocking.
*/
- error = xfs_trans_alloc_empty(breq->mp, &tp);
- if (error)
- goto out;
-
+ tp = xfs_trans_alloc_empty(breq->mp);
if (breq->flags & XFS_IBULK_SAME_AG)
iwalk_flags |= XFS_IWALK_SAME_AG;
error = xfs_iwalk(breq->mp, tp, breq->startino, iwalk_flags,
xfs_bulkstat_iwalk, breq->icount, &bc);
xfs_trans_cancel(tp);
-out:
kfree(bc.buf);
/*
@@ -464,14 +456,10 @@ xfs_inumbers(
* Grab an empty transaction so that we can use its recursive buffer
* locking abilities to detect cycles in the inobt without deadlocking.
*/
- error = xfs_trans_alloc_empty(breq->mp, &tp);
- if (error)
- goto out;
-
+ tp = xfs_trans_alloc_empty(breq->mp);
error = xfs_inobt_walk(breq->mp, tp, breq->startino, breq->flags,
xfs_inumbers_walk, breq->icount, &ic);
xfs_trans_cancel(tp);
-out:
/*
* We found some inode groups, so clear the error status and return
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index 7db3ece370b1..c1c31d1a8e21 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -377,11 +377,8 @@ xfs_iwalk_run_callbacks(
if (!has_more)
return 0;
- if (iwag->drop_trans) {
- error = xfs_trans_alloc_empty(mp, &iwag->tp);
- if (error)
- return error;
- }
+ if (iwag->drop_trans)
+ iwag->tp = xfs_trans_alloc_empty(mp);
/* ...and recreate the cursor just past where we left off. */
error = xfs_ialloc_read_agi(iwag->pag, iwag->tp, 0, agi_bpp);
@@ -617,9 +614,7 @@ xfs_iwalk_ag_work(
* Grab an empty transaction so that we can use its recursive buffer
* locking abilities to detect cycles in the inobt without deadlocking.
*/
- error = xfs_trans_alloc_empty(mp, &iwag->tp);
- if (error)
- goto out;
+ iwag->tp = xfs_trans_alloc_empty(mp);
iwag->drop_trans = 1;
error = xfs_iwalk_ag(iwag);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 793468b4d30d..c8a57e21a1d3 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -109,14 +109,14 @@ xlog_prepare_iovec(
vec = &lv->lv_iovecp[0];
}
- len = lv->lv_buf_len + sizeof(struct xlog_op_header);
+ len = lv->lv_buf_used + sizeof(struct xlog_op_header);
if (!IS_ALIGNED(len, sizeof(uint64_t))) {
- lv->lv_buf_len = round_up(len, sizeof(uint64_t)) -
+ lv->lv_buf_used = round_up(len, sizeof(uint64_t)) -
sizeof(struct xlog_op_header);
}
vec->i_type = type;
- vec->i_addr = lv->lv_buf + lv->lv_buf_len;
+ vec->i_addr = lv->lv_buf + lv->lv_buf_used;
oph = vec->i_addr;
oph->oh_clientid = XFS_TRANSACTION;
@@ -1931,9 +1931,9 @@ xlog_print_trans(
if (!lv)
continue;
xfs_warn(mp, " niovecs = %d", lv->lv_niovecs);
- xfs_warn(mp, " size = %d", lv->lv_size);
+ xfs_warn(mp, " alloc_size = %d", lv->lv_alloc_size);
xfs_warn(mp, " bytes = %d", lv->lv_bytes);
- xfs_warn(mp, " buf len = %d", lv->lv_buf_len);
+ xfs_warn(mp, " buf used = %d", lv->lv_buf_used);
/* dump each iovec for the log item */
vec = lv->lv_iovecp;
@@ -3092,16 +3092,16 @@ xfs_log_force_seq(
*/
void
xfs_log_ticket_put(
- xlog_ticket_t *ticket)
+ struct xlog_ticket *ticket)
{
ASSERT(atomic_read(&ticket->t_ref) > 0);
if (atomic_dec_and_test(&ticket->t_ref))
kmem_cache_free(xfs_log_ticket_cache, ticket);
}
-xlog_ticket_t *
+struct xlog_ticket *
xfs_log_ticket_get(
- xlog_ticket_t *ticket)
+ struct xlog_ticket *ticket)
{
ASSERT(atomic_read(&ticket->t_ref) > 0);
atomic_inc(&ticket->t_ref);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 13455854365f..af6daf4f6792 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -16,8 +16,8 @@ struct xfs_log_vec {
struct xfs_log_item *lv_item; /* owner */
char *lv_buf; /* formatted buffer */
int lv_bytes; /* accounted space in buffer */
- int lv_buf_len; /* aligned size of buffer */
- int lv_size; /* size of allocated lv */
+ int lv_buf_used; /* buffer space used so far */
+ int lv_alloc_size; /* size of allocated lv */
};
#define XFS_LOG_VEC_ORDERED (-1)
@@ -64,12 +64,13 @@ xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec,
oph->oh_len = cpu_to_be32(len);
len += sizeof(struct xlog_op_header);
- lv->lv_buf_len += len;
+ lv->lv_buf_used += len;
lv->lv_bytes += len;
vec->i_len = len;
/* Catch buffer overruns */
- ASSERT((void *)lv->lv_buf + lv->lv_bytes <= (void *)lv + lv->lv_size);
+ ASSERT((void *)lv->lv_buf + lv->lv_bytes <=
+ (void *)lv + lv->lv_alloc_size);
}
/*
@@ -87,13 +88,6 @@ xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
return buf;
}
-static inline void *
-xlog_copy_from_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
- const struct xfs_log_iovec *src)
-{
- return xlog_copy_iovec(lv, vecp, src->i_type, src->i_addr, src->i_len);
-}
-
/*
* By comparing each component, we don't have to worry about extra
* endian issues in treating two 32 bit numbers as one 64 bit number
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index f66d2d430e4f..f443757e93c2 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -275,7 +275,7 @@ xlog_cil_alloc_shadow_bufs(
struct xfs_log_vec *lv;
int niovecs = 0;
int nbytes = 0;
- int buf_size;
+ int alloc_size;
bool ordered = false;
/* Skip items which aren't dirty in this transaction. */
@@ -316,14 +316,14 @@ xlog_cil_alloc_shadow_bufs(
* that space to ensure we can align it appropriately and not
* overrun the buffer.
*/
- buf_size = nbytes + xlog_cil_iovec_space(niovecs);
+ alloc_size = nbytes + xlog_cil_iovec_space(niovecs);
/*
* if we have no shadow buffer, or it is too small, we need to
* reallocate it.
*/
if (!lip->li_lv_shadow ||
- buf_size > lip->li_lv_shadow->lv_size) {
+ alloc_size > lip->li_lv_shadow->lv_alloc_size) {
/*
* We free and allocate here as a realloc would copy
* unnecessary data. We don't use kvzalloc() for the
@@ -332,15 +332,15 @@ xlog_cil_alloc_shadow_bufs(
* storage.
*/
kvfree(lip->li_lv_shadow);
- lv = xlog_kvmalloc(buf_size);
+ lv = xlog_kvmalloc(alloc_size);
memset(lv, 0, xlog_cil_iovec_space(niovecs));
INIT_LIST_HEAD(&lv->lv_list);
lv->lv_item = lip;
- lv->lv_size = buf_size;
+ lv->lv_alloc_size = alloc_size;
if (ordered)
- lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+ lv->lv_buf_used = XFS_LOG_VEC_ORDERED;
else
lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
lip->li_lv_shadow = lv;
@@ -348,9 +348,9 @@ xlog_cil_alloc_shadow_bufs(
/* same or smaller, optimise common overwrite case */
lv = lip->li_lv_shadow;
if (ordered)
- lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+ lv->lv_buf_used = XFS_LOG_VEC_ORDERED;
else
- lv->lv_buf_len = 0;
+ lv->lv_buf_used = 0;
lv->lv_bytes = 0;
}
@@ -370,30 +370,30 @@ xlog_cil_alloc_shadow_bufs(
STATIC void
xfs_cil_prepare_item(
struct xlog *log,
+ struct xfs_log_item *lip,
struct xfs_log_vec *lv,
- struct xfs_log_vec *old_lv,
int *diff_len)
{
/* Account for the new LV being passed in */
- if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED)
+ if (lv->lv_buf_used != XFS_LOG_VEC_ORDERED)
*diff_len += lv->lv_bytes;
/*
* If there is no old LV, this is the first time we've seen the item in
* this CIL context and so we need to pin it. If we are replacing the
- * old_lv, then remove the space it accounts for and make it the shadow
+ * old lv, then remove the space it accounts for and make it the shadow
* buffer for later freeing. In both cases we are now switching to the
* shadow buffer, so update the pointer to it appropriately.
*/
- if (!old_lv) {
+ if (!lip->li_lv) {
if (lv->lv_item->li_ops->iop_pin)
lv->lv_item->li_ops->iop_pin(lv->lv_item);
lv->lv_item->li_lv_shadow = NULL;
- } else if (old_lv != lv) {
- ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
+ } else if (lip->li_lv != lv) {
+ ASSERT(lv->lv_buf_used != XFS_LOG_VEC_ORDERED);
- *diff_len -= old_lv->lv_bytes;
- lv->lv_item->li_lv_shadow = old_lv;
+ *diff_len -= lip->li_lv->lv_bytes;
+ lv->lv_item->li_lv_shadow = lip->li_lv;
}
/* attach new log vector to log item */
@@ -452,10 +452,8 @@ xlog_cil_insert_format_items(
}
list_for_each_entry(lip, &tp->t_items, li_trans) {
- struct xfs_log_vec *lv;
- struct xfs_log_vec *old_lv = NULL;
- struct xfs_log_vec *shadow;
- bool ordered = false;
+ struct xfs_log_vec *lv = lip->li_lv;
+ struct xfs_log_vec *shadow = lip->li_lv_shadow;
/* Skip items which aren't dirty in this transaction. */
if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
@@ -465,22 +463,23 @@ xlog_cil_insert_format_items(
* The formatting size information is already attached to
* the shadow lv on the log item.
*/
- shadow = lip->li_lv_shadow;
- if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED)
- ordered = true;
+ if (shadow->lv_buf_used == XFS_LOG_VEC_ORDERED) {
+ if (!lv) {
+ lv = shadow;
+ lv->lv_item = lip;
+ }
+ ASSERT(shadow->lv_alloc_size == lv->lv_alloc_size);
+ xfs_cil_prepare_item(log, lip, lv, diff_len);
+ continue;
+ }
/* Skip items that do not have any vectors for writing */
- if (!shadow->lv_niovecs && !ordered)
+ if (!shadow->lv_niovecs)
continue;
/* compare to existing item size */
- old_lv = lip->li_lv;
- if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
+ if (lv && shadow->lv_alloc_size <= lv->lv_alloc_size) {
/* same or smaller, optimise common overwrite case */
- lv = lip->li_lv;
-
- if (ordered)
- goto insert;
/*
* set the item up as though it is a new insertion so
@@ -492,7 +491,7 @@ xlog_cil_insert_format_items(
lv->lv_niovecs = shadow->lv_niovecs;
/* reset the lv buffer information for new formatting */
- lv->lv_buf_len = 0;
+ lv->lv_buf_used = 0;
lv->lv_bytes = 0;
lv->lv_buf = (char *)lv +
xlog_cil_iovec_space(lv->lv_niovecs);
@@ -500,17 +499,11 @@ xlog_cil_insert_format_items(
/* switch to shadow buffer! */
lv = shadow;
lv->lv_item = lip;
- if (ordered) {
- /* track as an ordered logvec */
- ASSERT(lip->li_lv == NULL);
- goto insert;
- }
}
ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
lip->li_ops->iop_format(lip, lv);
-insert:
- xfs_cil_prepare_item(log, lv, old_lv, diff_len);
+ xfs_cil_prepare_item(log, lip, lv, diff_len);
}
}
@@ -793,8 +786,10 @@ xlog_cil_ail_insert(
struct xfs_log_item *lip = lv->lv_item;
xfs_lsn_t item_lsn;
- if (aborted)
+ if (aborted) {
+ trace_xlog_ail_insert_abort(lip);
set_bit(XFS_LI_ABORTED, &lip->li_flags);
+ }
if (lip->li_ops->flags & XFS_ITEM_RELEASE_WHEN_COMMITTED) {
lip->li_ops->iop_release(lip);
@@ -1243,7 +1238,7 @@ xlog_cil_build_lv_chain(
lv->lv_order_id = item->li_order_id;
/* we don't write ordered log vectors */
- if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED)
+ if (lv->lv_buf_used != XFS_LOG_VEC_ORDERED)
*num_bytes += lv->lv_bytes;
*num_iovecs += lv->lv_niovecs;
list_add_tail(&lv->lv_list, &ctx->lv_chain);
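The renames threaded through xfs_log.h and xfs_log_cil.c make the log vector accounting self-describing: lv_buf_len becomes lv_buf_used (bytes formatted so far) and lv_size becomes lv_alloc_size (bytes allocated), so the overrun assertion reads naturally. A standalone model of the invariant:

	#include <assert.h>

	struct log_vec {		/* reduced model of struct xfs_log_vec */
		int	lv_buf_used;	/* bytes formatted into the buffer */
		int	lv_alloc_size;	/* bytes allocated for the whole lv */
	};

	static void account(struct log_vec *lv, int len)
	{
		lv->lv_buf_used += len;
		assert(lv->lv_buf_used <= lv->lv_alloc_size);	/* catch overruns */
	}

	int main(void)
	{
		struct log_vec lv = { 0, 128 };

		account(&lv, 64);
		return 0;
	}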
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 39a102cc1b43..a9a7a271c15b 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -144,7 +144,7 @@ enum xlog_iclog_state {
#define XLOG_COVER_OPS 5
-typedef struct xlog_ticket {
+struct xlog_ticket {
struct list_head t_queue; /* reserve/write queue */
struct task_struct *t_task; /* task that owns this ticket */
xlog_tid_t t_tid; /* transaction identifier */
@@ -155,7 +155,7 @@ typedef struct xlog_ticket {
char t_cnt; /* current unit count */
uint8_t t_flags; /* properties of reservation */
int t_iclog_hdrs; /* iclog hdrs in t_curr_res */
-} xlog_ticket_t;
+};
/*
* - A log record header is 512 bytes. There is plenty of room to grow the
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 2f76531842f8..e6ed9e09c027 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2131,15 +2131,15 @@ xlog_recover_add_to_cont_trans(
item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
ri_list);
- old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
- old_len = item->ri_buf[item->ri_cnt-1].i_len;
+ old_ptr = item->ri_buf[item->ri_cnt-1].iov_base;
+ old_len = item->ri_buf[item->ri_cnt-1].iov_len;
ptr = kvrealloc(old_ptr, len + old_len, GFP_KERNEL);
if (!ptr)
return -ENOMEM;
memcpy(&ptr[old_len], dp, len);
- item->ri_buf[item->ri_cnt-1].i_len += len;
- item->ri_buf[item->ri_cnt-1].i_addr = ptr;
+ item->ri_buf[item->ri_cnt-1].iov_len += len;
+ item->ri_buf[item->ri_cnt-1].iov_base = ptr;
trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
return 0;
}
@@ -2223,7 +2223,7 @@ xlog_recover_add_to_trans(
}
item->ri_total = in_f->ilf_size;
- item->ri_buf = kzalloc(item->ri_total * sizeof(xfs_log_iovec_t),
+ item->ri_buf = kcalloc(item->ri_total, sizeof(*item->ri_buf),
GFP_KERNEL | __GFP_NOFAIL);
}
@@ -2237,8 +2237,8 @@ xlog_recover_add_to_trans(
}
/* Description region is ri_buf[0] */
- item->ri_buf[item->ri_cnt].i_addr = ptr;
- item->ri_buf[item->ri_cnt].i_len = len;
+ item->ri_buf[item->ri_cnt].iov_base = ptr;
+ item->ri_buf[item->ri_cnt].iov_len = len;
item->ri_cnt++;
trace_xfs_log_recover_item_add(log, trans, item, 0);
return 0;
@@ -2262,7 +2262,7 @@ xlog_recover_free_trans(
/* Free the regions in the item. */
list_del(&item->ri_list);
for (i = 0; i < item->ri_cnt; i++)
- kvfree(item->ri_buf[i].i_addr);
+ kvfree(item->ri_buf[i].iov_base);
/* Free the item itself */
kfree(item->ri_buf);
kfree(item);
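One small hardening change above: the region array allocation switches from kzalloc(n * size) to kcalloc(n, size), whose two-argument form checks the multiplication for overflow. Userspace calloc() gives the same guarantee, as this sketch shows:

	#include <stdlib.h>

	struct region { void *base; unsigned long len; };

	int main(void)
	{
		unsigned long n = 8;

		/* calloc(), like kcalloc(), fails cleanly if n * size overflows */
		struct region *regions = calloc(n, sizeof(*regions));

		if (!regions)
			return 1;
		free(regions);
		return 0;
	}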
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 29276fe60df9..2133fbaf1766 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -171,19 +171,16 @@ xfs_readsb(
ASSERT(mp->m_ddev_targp != NULL);
/*
- * For the initial read, we must guess at the sector
- * size based on the block device. It's enough to
- * get the sb_sectsize out of the superblock and
- * then reread with the proper length.
- * We don't verify it yet, because it may not be complete.
+ * In the first pass, use the device sector size to just read enough
+ * of the superblock to extract the XFS sector size.
+ *
+ * The device sector size must be smaller than or equal to the XFS
+ * sector size and thus we can always read the superblock. Once we know
+ * the XFS sector size, re-read it and run the buffer verifier.
*/
- sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
+ sector_size = mp->m_ddev_targp->bt_logical_sectorsize;
buf_ops = NULL;
- /*
- * Allocate a (locked) buffer to hold the superblock. This will be kept
- * around at all times to optimize access to the superblock.
- */
reread:
error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
BTOBB(sector_size), &bp, buf_ops);
@@ -247,6 +244,10 @@ reread:
/* no need to be quiet anymore, so reset the buf ops */
bp->b_ops = &xfs_sb_buf_ops;
+ /*
+ * Keep a pointer to the sb buffer around instead of caching it in the
+ * buffer cache because we access it frequently.
+ */
mp->m_sb_bp = bp;
xfs_buf_unlock(bp);
return 0;
@@ -672,74 +673,47 @@ static inline xfs_extlen_t xfs_calc_atomic_write_max(struct xfs_mount *mp)
return rounddown_pow_of_two(XFS_B_TO_FSB(mp, MAX_RW_COUNT));
}
-static inline unsigned int max_pow_of_two_factor(const unsigned int nr)
-{
- return 1 << (ffs(nr) - 1);
-}
-
/*
- * If the data device advertises atomic write support, limit the size of data
- * device atomic writes to the greatest power-of-two factor of the AG size so
- * that every atomic write unit aligns with the start of every AG. This is
- * required so that the per-AG allocations for an atomic write will always be
+ * If the underlying device advertises atomic write support, limit the size of
+ * atomic writes to the greatest power-of-two factor of the group size so
+ * that every atomic write unit aligns with the start of every group. This is
+ * required so that the allocations for an atomic write will always be
* aligned compatibly with the alignment requirements of the storage.
*
- * If the data device doesn't advertise atomic writes, then there are no
- * alignment restrictions and the largest out-of-place write we can do
- * ourselves is the number of blocks that user files can allocate from any AG.
+ * If the device doesn't advertise atomic writes, then there are no alignment
+ * restrictions and the largest out-of-place write we can do ourselves is the
+ * number of blocks that user files can allocate from any group.
*/
-static inline xfs_extlen_t xfs_calc_perag_awu_max(struct xfs_mount *mp)
-{
- if (mp->m_ddev_targp->bt_bdev_awu_min > 0)
- return max_pow_of_two_factor(mp->m_sb.sb_agblocks);
- return rounddown_pow_of_two(mp->m_ag_max_usable);
-}
-
-/*
- * Reflink on the realtime device requires rtgroups, and atomic writes require
- * reflink.
- *
- * If the realtime device advertises atomic write support, limit the size of
- * data device atomic writes to the greatest power-of-two factor of the rtgroup
- * size so that every atomic write unit aligns with the start of every rtgroup.
- * This is required so that the per-rtgroup allocations for an atomic write
- * will always be aligned compatibly with the alignment requirements of the
- * storage.
- *
- * If the rt device doesn't advertise atomic writes, then there are no
- * alignment restrictions and the largest out-of-place write we can do
- * ourselves is the number of blocks that user files can allocate from any
- * rtgroup.
- */
-static inline xfs_extlen_t xfs_calc_rtgroup_awu_max(struct xfs_mount *mp)
+static xfs_extlen_t
+xfs_calc_group_awu_max(
+ struct xfs_mount *mp,
+ enum xfs_group_type type)
{
- struct xfs_groups *rgs = &mp->m_groups[XG_TYPE_RTG];
+ struct xfs_groups *g = &mp->m_groups[type];
+ struct xfs_buftarg *btp = xfs_group_type_buftarg(mp, type);
- if (rgs->blocks == 0)
+ if (g->blocks == 0)
return 0;
- if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_bdev_awu_min > 0)
- return max_pow_of_two_factor(rgs->blocks);
- return rounddown_pow_of_two(rgs->blocks);
+ if (btp && btp->bt_awu_min > 0)
+ return max_pow_of_two_factor(g->blocks);
+ return rounddown_pow_of_two(g->blocks);
}
/* Compute the maximum atomic write unit size for each section. */
static inline void
xfs_calc_atomic_write_unit_max(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ enum xfs_group_type type)
{
- struct xfs_groups *ags = &mp->m_groups[XG_TYPE_AG];
- struct xfs_groups *rgs = &mp->m_groups[XG_TYPE_RTG];
+ struct xfs_groups *g = &mp->m_groups[type];
const xfs_extlen_t max_write = xfs_calc_atomic_write_max(mp);
const xfs_extlen_t max_ioend = xfs_reflink_max_atomic_cow(mp);
- const xfs_extlen_t max_agsize = xfs_calc_perag_awu_max(mp);
- const xfs_extlen_t max_rgsize = xfs_calc_rtgroup_awu_max(mp);
-
- ags->awu_max = min3(max_write, max_ioend, max_agsize);
- rgs->awu_max = min3(max_write, max_ioend, max_rgsize);
+ const xfs_extlen_t max_gsize = xfs_calc_group_awu_max(mp, type);
- trace_xfs_calc_atomic_write_unit_max(mp, max_write, max_ioend,
- max_agsize, max_rgsize);
+ g->awu_max = min3(max_write, max_ioend, max_gsize);
+ trace_xfs_calc_atomic_write_unit_max(mp, type, max_write, max_ioend,
+ max_gsize, g->awu_max);
}
/*
@@ -757,7 +731,8 @@ xfs_set_max_atomic_write_opt(
max(mp->m_groups[XG_TYPE_AG].blocks,
mp->m_groups[XG_TYPE_RTG].blocks);
const xfs_extlen_t max_group_write =
- max(xfs_calc_perag_awu_max(mp), xfs_calc_rtgroup_awu_max(mp));
+ max(xfs_calc_group_awu_max(mp, XG_TYPE_AG),
+ xfs_calc_group_awu_max(mp, XG_TYPE_RTG));
int error;
if (new_max_bytes == 0)
@@ -813,7 +788,8 @@ set_limit:
return error;
}
- xfs_calc_atomic_write_unit_max(mp);
+ xfs_calc_atomic_write_unit_max(mp, XG_TYPE_AG);
+ xfs_calc_atomic_write_unit_max(mp, XG_TYPE_RTG);
mp->m_awu_max_bytes = new_max_bytes;
return 0;
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index d85084f9f317..97de44c32272 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -802,4 +802,21 @@ static inline void xfs_mod_sb_delalloc(struct xfs_mount *mp, int64_t delta)
int xfs_set_max_atomic_write_opt(struct xfs_mount *mp,
unsigned long long new_max_bytes);
+static inline struct xfs_buftarg *
+xfs_group_type_buftarg(
+ struct xfs_mount *mp,
+ enum xfs_group_type type)
+{
+ switch (type) {
+ case XG_TYPE_AG:
+ return mp->m_ddev_targp;
+ case XG_TYPE_RTG:
+ return mp->m_rtdev_targp;
+ default:
+ ASSERT(0);
+ break;
+ }
+ return NULL;
+}
+
#endif /* __XFS_MOUNT_H__ */
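xfs_group_type_buftarg() centralizes the AG-to-data-device / RTG-to-realtime-device mapping that xfs_discard.c and xfs_notify_failure.c previously open-coded. A standalone model with placeholder types:

	#include <assert.h>
	#include <stddef.h>

	enum xg_type { XG_TYPE_AG, XG_TYPE_RTG };

	struct buftarg { int id; };
	struct mount { struct buftarg *ddev_targp, *rtdev_targp; };

	static struct buftarg *group_type_buftarg(struct mount *mp, enum xg_type type)
	{
		switch (type) {
		case XG_TYPE_AG:
			return mp->ddev_targp;	/* data device */
		case XG_TYPE_RTG:
			return mp->rtdev_targp;	/* realtime device */
		}
		return NULL;
	}

	int main(void)
	{
		struct buftarg d = { 1 }, r = { 2 };
		struct mount m = { &d, &r };

		assert(group_type_buftarg(&m, XG_TYPE_RTG) == &r);
		return 0;
	}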
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 08443ceec329..866c71d9fbae 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -320,7 +320,7 @@ xfs_mru_cache_create(
xfs_mru_cache_free_func_t free_func)
{
struct xfs_mru_cache *mru = NULL;
- int err = 0, grp;
+ int grp;
unsigned int grp_time;
if (mrup)
@@ -341,8 +341,8 @@ xfs_mru_cache_create(
mru->lists = kzalloc(mru->grp_count * sizeof(*mru->lists),
GFP_KERNEL | __GFP_NOFAIL);
if (!mru->lists) {
- err = -ENOMEM;
- goto exit;
+ kfree(mru);
+ return -ENOMEM;
}
for (grp = 0; grp < mru->grp_count; grp++)
@@ -361,14 +361,7 @@ xfs_mru_cache_create(
mru->free_func = free_func;
mru->data = data;
*mrup = mru;
-
-exit:
- if (err && mru && mru->lists)
- kfree(mru->lists);
- if (err && mru)
- kfree(mru);
-
- return err;
+ return 0;
}
/*
@@ -425,10 +418,6 @@ xfs_mru_cache_insert(
{
int error = -EINVAL;
- ASSERT(mru && mru->lists);
- if (!mru || !mru->lists)
- goto out_free;
-
error = -ENOMEM;
if (radix_tree_preload(GFP_KERNEL))
goto out_free;
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index 3545dc1d953c..fbeddcac4792 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -253,8 +253,7 @@ xfs_dax_notify_dev_failure(
return -EOPNOTSUPP;
}
- error = xfs_dax_translate_range(type == XG_TYPE_RTG ?
- mp->m_rtdev_targp : mp->m_ddev_targp,
+ error = xfs_dax_translate_range(xfs_group_type_buftarg(mp, type),
offset, len, &daddr, &bblen);
if (error)
return error;
@@ -280,10 +279,7 @@ xfs_dax_notify_dev_failure(
kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0;
}
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- goto out;
-
+ tp = xfs_trans_alloc_empty(mp);
start_gno = xfs_fsb_to_gno(mp, start_bno, type);
end_gno = xfs_fsb_to_gno(mp, end_bno, type);
while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
@@ -354,7 +350,6 @@ xfs_dax_notify_dev_failure(
error = -EFSCORRUPTED;
}
-out:
/* Thaw the fs if it has been frozen before. */
if (mf_flags & MF_MEM_PRE_REMOVE)
xfs_dax_notify_failure_thaw(mp, kernel_frozen);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 417439b58785..23ba84ec919a 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -134,6 +134,7 @@ xfs_qm_dqpurge(
dqp->q_flags |= XFS_DQFLAG_FREEING;
+ xfs_qm_dqunpin_wait(dqp);
xfs_dqflock(dqp);
/*
@@ -465,6 +466,7 @@ xfs_qm_dquot_isolate(
struct xfs_dquot *dqp = container_of(item,
struct xfs_dquot, q_lru);
struct xfs_qm_isolate *isol = arg;
+ enum lru_status ret = LRU_SKIP;
if (!xfs_dqlock_nowait(dqp))
goto out_miss_busy;
@@ -478,6 +480,16 @@ xfs_qm_dquot_isolate(
goto out_miss_unlock;
/*
+ * If the dquot is pinned or dirty, rotate it to the end of the LRU to
+ * give some time for it to be cleaned before we try to isolate it
+ * again.
+ */
+ ret = LRU_ROTATE;
+ if (XFS_DQ_IS_DIRTY(dqp) || atomic_read(&dqp->q_pincount) > 0)
+ goto out_miss_unlock;
+
+ /*
* This dquot has acquired a reference in the meantime remove it from
* the freelist and try again.
*/
@@ -492,41 +504,14 @@ xfs_qm_dquot_isolate(
}
/*
- * If the dquot is dirty, flush it. If it's already being flushed, just
- * skip it so there is time for the IO to complete before we try to
- * reclaim it again on the next LRU pass.
+ * The dquot may still be under IO, in which case the flush lock will be
+ * held. If we can't get the flush lock now, just skip over the dquot as
+ * if it was dirty.
*/
if (!xfs_dqflock_nowait(dqp))
goto out_miss_unlock;
- if (XFS_DQ_IS_DIRTY(dqp)) {
- struct xfs_buf *bp = NULL;
- int error;
-
- trace_xfs_dqreclaim_dirty(dqp);
-
- /* we have to drop the LRU lock to flush the dquot */
- spin_unlock(&lru->lock);
-
- error = xfs_dquot_use_attached_buf(dqp, &bp);
- if (!bp || error == -EAGAIN) {
- xfs_dqfunlock(dqp);
- goto out_unlock_dirty;
- }
-
- /*
- * dqflush completes dqflock on error, and the delwri ioend
- * does it on success.
- */
- error = xfs_qm_dqflush(dqp, bp);
- if (error)
- goto out_unlock_dirty;
-
- xfs_buf_delwri_queue(bp, &isol->buffers);
- xfs_buf_relse(bp);
- goto out_unlock_dirty;
- }
-
+ ASSERT(!XFS_DQ_IS_DIRTY(dqp));
xfs_dquot_detach_buf(dqp);
xfs_dqfunlock(dqp);
@@ -548,13 +533,7 @@ out_miss_unlock:
out_miss_busy:
trace_xfs_dqreclaim_busy(dqp);
XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
- return LRU_SKIP;
-
-out_unlock_dirty:
- trace_xfs_dqreclaim_busy(dqp);
- XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
- xfs_dqunlock(dqp);
- return LRU_RETRY;
+ return ret;
}
static unsigned long
@@ -681,10 +660,7 @@ xfs_qm_load_metadir_qinos(
struct xfs_trans *tp;
int error;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
-
+ tp = xfs_trans_alloc_empty(mp);
error = xfs_dqinode_load_parent(tp, &qi->qi_dirip);
if (error == -ENOENT) {
/* no quota dir directory, but we'll create one later */
@@ -1486,7 +1462,6 @@ xfs_qm_flush_one(
struct xfs_dquot *dqp,
void *data)
{
- struct xfs_mount *mp = dqp->q_mount;
struct list_head *buffer_list = data;
struct xfs_buf *bp = NULL;
int error = 0;
@@ -1497,34 +1472,8 @@ xfs_qm_flush_one(
if (!XFS_DQ_IS_DIRTY(dqp))
goto out_unlock;
- /*
- * The only way the dquot is already flush locked by the time quotacheck
- * gets here is if reclaim flushed it before the dqadjust walk dirtied
- * it for the final time. Quotacheck collects all dquot bufs in the
- * local delwri queue before dquots are dirtied, so reclaim can't have
- * possibly queued it for I/O. The only way out is to push the buffer to
- * cycle the flush lock.
- */
- if (!xfs_dqflock_nowait(dqp)) {
- /* buf is pinned in-core by delwri list */
- error = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen, 0, &bp);
- if (error)
- goto out_unlock;
-
- if (!(bp->b_flags & _XBF_DELWRI_Q)) {
- error = -EAGAIN;
- xfs_buf_relse(bp);
- goto out_unlock;
- }
- xfs_buf_unlock(bp);
-
- xfs_buf_delwri_pushbuf(bp, buffer_list);
- xfs_buf_rele(bp);
-
- error = -EAGAIN;
- goto out_unlock;
- }
+ xfs_qm_dqunpin_wait(dqp);
+ xfs_dqflock(dqp);
error = xfs_dquot_use_attached_buf(dqp, &bp);
if (error)
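The quotacheck side gets the matching simplification: instead of detecting a held flush lock and cycling it by pushing the backing buffer out of the delwri queue, xfs_qm_flush_one() waits for the dquot to be unpinned and then takes the flush lock unconditionally. In sketch form, assuming the dquot lock is held as in the function above:

	xfs_qm_dqunpin_wait(dqp);	/* wait out any log IO pinning the dquot */
	xfs_dqflock(dqp);		/* blocking flock; no buffer push needed */

This is what lets the xfs_buf_delwri_pushbuf() machinery and its tracepoint go away later in this diff.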
@@ -1803,10 +1752,7 @@ xfs_qm_qino_load(
struct xfs_inode *dp = NULL;
int error;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
-
+ tp = xfs_trans_alloc_empty(mp);
if (xfs_has_metadir(mp)) {
error = xfs_dqinode_load_parent(tp, &dp);
if (error)
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 076501123d89..3728234699a2 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -717,18 +717,18 @@ xlog_recover_cui_commit_pass2(
struct xfs_cui_log_format *cui_formatp;
size_t len;
- cui_formatp = item->ri_buf[0].i_addr;
+ cui_formatp = item->ri_buf[0].iov_base;
- if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) {
+ if (item->ri_buf[0].iov_len < xfs_cui_log_format_sizeof(0)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents);
- if (item->ri_buf[0].i_len != len) {
+ if (item->ri_buf[0].iov_len != len) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -759,18 +759,18 @@ xlog_recover_rtcui_commit_pass2(
struct xfs_cui_log_format *cui_formatp;
size_t len;
- cui_formatp = item->ri_buf[0].i_addr;
+ cui_formatp = item->ri_buf[0].iov_base;
- if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) {
+ if (item->ri_buf[0].iov_len < xfs_cui_log_format_sizeof(0)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents);
- if (item->ri_buf[0].i_len != len) {
+ if (item->ri_buf[0].iov_len != len) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -791,7 +791,7 @@ xlog_recover_rtcui_commit_pass2(
xfs_lsn_t lsn)
{
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
#endif
@@ -817,10 +817,10 @@ xlog_recover_cud_commit_pass2(
{
struct xfs_cud_log_format *cud_formatp;
- cud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) {
+ cud_formatp = item->ri_buf[0].iov_base;
+ if (item->ri_buf[0].iov_len != sizeof(struct xfs_cud_log_format)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -843,10 +843,10 @@ xlog_recover_rtcud_commit_pass2(
{
struct xfs_cud_log_format *cud_formatp;
- cud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) {
+ cud_formatp = item->ri_buf[0].iov_base;
+ if (item->ri_buf[0].iov_len != sizeof(struct xfs_cud_log_format)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
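The i_addr/i_len churn in these recovery hooks follows from the log recovery item buffers becoming plain struct kvec entries (struct xfs_log_iovec is dropped from xfs_trans.h further down). The hunks repeat one validation pattern; a hedged sketch of that pattern as a helper, where the helper name is invented for illustration:

	#include <linux/uio.h>	/* struct kvec: iov_base, iov_len */

	/* Hypothetical: return the format buffer iff it has the expected size. */
	static inline void *
	recover_fmt_buf(const struct kvec *iov, size_t want)
	{
		if (iov->iov_len != want)
			return NULL;	/* caller reports -EFSCORRUPTED */
		return iov->iov_base;
	}

The CUI/RUI intent items additionally accept a variable-length format, so they check a lower bound first and then the exact length computed from the extent count.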
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index ad3bcb76d805..3f177b4ec131 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1881,7 +1881,8 @@ xfs_reflink_unshare(
&xfs_dax_write_iomap_ops);
else
error = iomap_file_unshare(inode, offset, len,
- &xfs_buffered_write_iomap_ops);
+ &xfs_buffered_write_iomap_ops,
+ &xfs_iomap_write_ops);
if (error)
goto out;
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index c99700318ec2..15f0903f6fd4 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -746,18 +746,18 @@ xlog_recover_rui_commit_pass2(
struct xfs_rui_log_format *rui_formatp;
size_t len;
- rui_formatp = item->ri_buf[0].i_addr;
+ rui_formatp = item->ri_buf[0].iov_base;
- if (item->ri_buf[0].i_len < xfs_rui_log_format_sizeof(0)) {
+ if (item->ri_buf[0].iov_len < xfs_rui_log_format_sizeof(0)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents);
- if (item->ri_buf[0].i_len != len) {
+ if (item->ri_buf[0].iov_len != len) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -788,18 +788,18 @@ xlog_recover_rtrui_commit_pass2(
struct xfs_rui_log_format *rui_formatp;
size_t len;
- rui_formatp = item->ri_buf[0].i_addr;
+ rui_formatp = item->ri_buf[0].iov_base;
- if (item->ri_buf[0].i_len < xfs_rui_log_format_sizeof(0)) {
+ if (item->ri_buf[0].iov_len < xfs_rui_log_format_sizeof(0)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents);
- if (item->ri_buf[0].i_len != len) {
+ if (item->ri_buf[0].iov_len != len) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -820,7 +820,7 @@ xlog_recover_rtrui_commit_pass2(
xfs_lsn_t lsn)
{
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
#endif
@@ -846,10 +846,10 @@ xlog_recover_rud_commit_pass2(
{
struct xfs_rud_log_format *rud_formatp;
- rud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_rud_log_format)) {
+ rud_formatp = item->ri_buf[0].iov_base;
+ if (item->ri_buf[0].iov_len != sizeof(struct xfs_rud_log_format)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- rud_formatp, item->ri_buf[0].i_len);
+ rud_formatp, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -872,10 +872,10 @@ xlog_recover_rtrud_commit_pass2(
{
struct xfs_rud_log_format *rud_formatp;
- rud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_rud_log_format)) {
+ rud_formatp = item->ri_buf[0].iov_base;
+ if (item->ri_buf[0].iov_len != sizeof(struct xfs_rud_log_format)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- rud_formatp, item->ri_buf[0].i_len);
+ rud_formatp, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 6484c596ecea..6907e871fa15 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -729,9 +729,7 @@ xfs_rtginode_ensure(
if (rtg->rtg_inodes[type])
return 0;
- error = xfs_trans_alloc_empty(rtg_mount(rtg), &tp);
- if (error)
- return error;
+ tp = xfs_trans_alloc_empty(rtg_mount(rtg));
error = xfs_rtginode_load(rtg, type, tp);
xfs_trans_cancel(tp);
@@ -1259,6 +1257,8 @@ xfs_growfs_check_rtgeom(
kfree(nmp);
+ trace_xfs_growfs_check_rtgeom(mp, min_logfsbs);
+
if (min_logfsbs > mp->m_sb.sb_logblocks)
return -EINVAL;
@@ -1303,9 +1303,7 @@ xfs_growfs_rt_prep_groups(
if (!mp->m_rtdirip) {
struct xfs_trans *tp;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
+ tp = xfs_trans_alloc_empty(mp);
error = xfs_rtginode_load_parent(tp);
xfs_trans_cancel(tp);
@@ -1672,10 +1670,7 @@ xfs_rtmount_inodes(
struct xfs_rtgroup *rtg = NULL;
int error;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
-
+ tp = xfs_trans_alloc_empty(mp);
if (xfs_has_rtgroups(mp) && mp->m_sb.sb_rgcount > 0) {
error = xfs_rtginode_load_parent(tp);
if (error)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 0bc4b5489078..bb0a82635a77 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -2020,14 +2020,13 @@ xfs_remount_rw(
int error;
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp &&
- bdev_read_only(mp->m_logdev_targp->bt_bdev)) {
+ xfs_readonly_buftarg(mp->m_logdev_targp)) {
xfs_warn(mp,
"ro->rw transition prohibited by read-only logdev");
return -EACCES;
}
- if (mp->m_rtdev_targp &&
- bdev_read_only(mp->m_rtdev_targp->bt_bdev)) {
+ if (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp)) {
xfs_warn(mp,
"ro->rw transition prohibited by read-only rtdev");
return -EACCES;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 01d284a1c759..e1794e3e3156 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -171,36 +171,33 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list);
DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list);
TRACE_EVENT(xfs_calc_atomic_write_unit_max,
- TP_PROTO(struct xfs_mount *mp, unsigned int max_write,
- unsigned int max_ioend, unsigned int max_agsize,
- unsigned int max_rgsize),
- TP_ARGS(mp, max_write, max_ioend, max_agsize, max_rgsize),
+ TP_PROTO(struct xfs_mount *mp, enum xfs_group_type type,
+ unsigned int max_write, unsigned int max_ioend,
+ unsigned int max_gsize, unsigned int awu_max),
+ TP_ARGS(mp, type, max_write, max_ioend, max_gsize, awu_max),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(unsigned int, max_write)
__field(unsigned int, max_ioend)
- __field(unsigned int, max_agsize)
- __field(unsigned int, max_rgsize)
- __field(unsigned int, data_awu_max)
- __field(unsigned int, rt_awu_max)
+ __field(unsigned int, max_gsize)
+ __field(unsigned int, awu_max)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
+ __entry->type = type;
__entry->max_write = max_write;
__entry->max_ioend = max_ioend;
- __entry->max_agsize = max_agsize;
- __entry->max_rgsize = max_rgsize;
- __entry->data_awu_max = mp->m_groups[XG_TYPE_AG].awu_max;
- __entry->rt_awu_max = mp->m_groups[XG_TYPE_RTG].awu_max;
+ __entry->max_gsize = max_gsize;
+ __entry->awu_max = awu_max;
),
- TP_printk("dev %d:%d max_write %u max_ioend %u max_agsize %u max_rgsize %u data_awu_max %u rt_awu_max %u",
+ TP_printk("dev %d:%d %s max_write %u max_ioend %u max_gsize %u awu_max %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->max_write,
__entry->max_ioend,
- __entry->max_agsize,
- __entry->max_rgsize,
- __entry->data_awu_max,
- __entry->rt_awu_max)
+ __entry->max_gsize,
+ __entry->awu_max)
);
TRACE_EVENT(xfs_calc_max_atomic_write_fsblocks,
@@ -428,8 +425,8 @@ DECLARE_EVENT_CLASS(xfs_zone_alloc_class,
__field(dev_t, dev)
__field(xfs_rgnumber_t, rgno)
__field(xfs_rgblock_t, used)
+ __field(xfs_rgblock_t, allocated)
__field(xfs_rgblock_t, written)
- __field(xfs_rgblock_t, write_pointer)
__field(xfs_rgblock_t, rgbno)
__field(xfs_extlen_t, len)
),
@@ -437,17 +434,17 @@ DECLARE_EVENT_CLASS(xfs_zone_alloc_class,
__entry->dev = rtg_mount(oz->oz_rtg)->m_super->s_dev;
__entry->rgno = rtg_rgno(oz->oz_rtg);
__entry->used = rtg_rmap(oz->oz_rtg)->i_used_blocks;
+ __entry->allocated = oz->oz_allocated;
__entry->written = oz->oz_written;
- __entry->write_pointer = oz->oz_write_pointer;
__entry->rgbno = rgbno;
__entry->len = len;
),
- TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x rgbno 0x%x len 0x%x",
+ TP_printk("dev %d:%d rgno 0x%x used 0x%x alloced 0x%x written 0x%x rgbno 0x%x len 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->rgno,
__entry->used,
+ __entry->allocated,
__entry->written,
- __entry->write_pointer,
__entry->rgbno,
__entry->len)
);
@@ -778,7 +775,6 @@ DEFINE_BUF_EVENT(xfs_buf_iowait_done);
DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
DEFINE_BUF_EVENT(xfs_buf_delwri_queued);
DEFINE_BUF_EVENT(xfs_buf_delwri_split);
-DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf);
DEFINE_BUF_EVENT(xfs_buf_get_uncached);
DEFINE_BUF_EVENT(xfs_buf_item_relse);
DEFINE_BUF_EVENT(xfs_buf_iodone_async);
@@ -1083,7 +1079,9 @@ DEFINE_INODE_EVENT(xfs_get_acl);
#endif
DEFINE_INODE_EVENT(xfs_vm_bmap);
DEFINE_INODE_EVENT(xfs_file_ioctl);
+#ifdef CONFIG_COMPAT
DEFINE_INODE_EVENT(xfs_file_compat_ioctl);
+#endif
DEFINE_INODE_EVENT(xfs_ioctl_setattr);
DEFINE_INODE_EVENT(xfs_dir_fsync);
DEFINE_INODE_EVENT(xfs_file_fsync);
@@ -1147,6 +1145,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class,
__field(xfs_ino_t, ino)
__field(int, count)
__field(int, pincount)
+ __field(unsigned long, iflags)
__field(unsigned long, caller_ip)
),
TP_fast_assign(
@@ -1154,13 +1153,15 @@ DECLARE_EVENT_CLASS(xfs_iref_class,
__entry->ino = ip->i_ino;
__entry->count = atomic_read(&VFS_I(ip)->i_count);
__entry->pincount = atomic_read(&ip->i_pincount);
+ __entry->iflags = ip->i_flags;
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pS",
+ TP_printk("dev %d:%d ino 0x%llx count %d pincount %d iflags 0x%lx caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->count,
__entry->pincount,
+ __entry->iflags,
(char *)__entry->caller_ip)
)
@@ -1250,6 +1251,8 @@ DEFINE_IREF_EVENT(xfs_irele);
DEFINE_IREF_EVENT(xfs_inode_pin);
DEFINE_IREF_EVENT(xfs_inode_unpin);
DEFINE_IREF_EVENT(xfs_inode_unpin_nowait);
+DEFINE_IREF_EVENT(xfs_inode_push_pinned);
+DEFINE_IREF_EVENT(xfs_inode_push_stale);
DECLARE_EVENT_CLASS(xfs_namespace_class,
TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name),
@@ -1393,7 +1396,6 @@ DEFINE_EVENT(xfs_dquot_class, name, \
TP_ARGS(dqp))
DEFINE_DQUOT_EVENT(xfs_dqadjust);
DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
-DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
DEFINE_DQUOT_EVENT(xfs_dqreclaim_busy);
DEFINE_DQUOT_EVENT(xfs_dqreclaim_done);
DEFINE_DQUOT_EVENT(xfs_dqattach_found);
@@ -1598,7 +1600,6 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant);
DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_sub);
DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_cil_wait);
-DEFINE_LOGGRANT_EVENT(xfs_log_cil_return);
DECLARE_EVENT_CLASS(xfs_log_item_class,
TP_PROTO(struct xfs_log_item *lip),
@@ -1654,6 +1655,8 @@ DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing);
DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_mark);
DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_skip);
DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_unpin);
+DEFINE_LOG_ITEM_EVENT(xlog_ail_insert_abort);
+DEFINE_LOG_ITEM_EVENT(xfs_trans_free_abort);
DECLARE_EVENT_CLASS(xfs_ail_class,
TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn),
@@ -1859,8 +1862,6 @@ DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize);
DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof);
DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write);
-DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten);
-DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append);
DEFINE_SIMPLE_IO_EVENT(xfs_file_splice_read);
DEFINE_SIMPLE_IO_EVENT(xfs_zoned_map_blocks);
@@ -1893,31 +1894,6 @@ DEFINE_EVENT(xfs_itrunc_class, name, \
DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start);
DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end);
-TRACE_EVENT(xfs_pagecache_inval,
- TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish),
- TP_ARGS(ip, start, finish),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_ino_t, ino)
- __field(xfs_fsize_t, size)
- __field(xfs_off_t, start)
- __field(xfs_off_t, finish)
- ),
- TP_fast_assign(
- __entry->dev = VFS_I(ip)->i_sb->s_dev;
- __entry->ino = ip->i_ino;
- __entry->size = ip->i_disk_size;
- __entry->start = start;
- __entry->finish = finish;
- ),
- TP_printk("dev %d:%d ino 0x%llx disize 0x%llx start 0x%llx finish 0x%llx",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino,
- __entry->size,
- __entry->start,
- __entry->finish)
-);
-
TRACE_EVENT(xfs_bunmap,
TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t fileoff, xfs_filblks_t len,
int flags, unsigned long caller_ip),
@@ -2263,14 +2239,12 @@ DEFINE_EVENT(xfs_alloc_class, name, \
DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
-DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
DEFINE_ALLOC_EVENT(xfs_alloc_cur);
DEFINE_ALLOC_EVENT(xfs_alloc_cur_right);
DEFINE_ALLOC_EVENT(xfs_alloc_cur_left);
DEFINE_ALLOC_EVENT(xfs_alloc_cur_lookup);
DEFINE_ALLOC_EVENT(xfs_alloc_cur_lookup_done);
-DEFINE_ALLOC_EVENT(xfs_alloc_near_error);
DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry);
DEFINE_ALLOC_EVENT(xfs_alloc_near_busy);
DEFINE_ALLOC_EVENT(xfs_alloc_size_neither);
@@ -2465,13 +2439,8 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall);
DEFINE_ATTR_EVENT(xfs_attr_node_addname);
DEFINE_ATTR_EVENT(xfs_attr_node_get);
DEFINE_ATTR_EVENT(xfs_attr_node_replace);
-DEFINE_ATTR_EVENT(xfs_attr_node_removename);
-
-DEFINE_ATTR_EVENT(xfs_attr_fillstate);
-DEFINE_ATTR_EVENT(xfs_attr_refillstate);
DEFINE_ATTR_EVENT(xfs_attr_rmtval_get);
-DEFINE_ATTR_EVENT(xfs_attr_rmtval_set);
#define DEFINE_DA_EVENT(name) \
DEFINE_EVENT(xfs_da_class, name, \
@@ -2889,7 +2858,6 @@ DEFINE_EVENT(xfs_rtdiscard_class, name, \
TP_ARGS(mp, rtbno, len))
DEFINE_RTDISCARD_EVENT(xfs_discard_rtextent);
DEFINE_RTDISCARD_EVENT(xfs_discard_rttoosmall);
-DEFINE_RTDISCARD_EVENT(xfs_discard_rtrelax);
DECLARE_EVENT_CLASS(xfs_btree_cur_class,
TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp),
@@ -4206,36 +4174,6 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error);
DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_src);
DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_dest);
-/* dedupe tracepoints */
-DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_compare_extents_error);
-
-/* ioctl tracepoints */
-TRACE_EVENT(xfs_ioctl_clone,
- TP_PROTO(struct inode *src, struct inode *dest),
- TP_ARGS(src, dest),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(unsigned long, src_ino)
- __field(loff_t, src_isize)
- __field(unsigned long, dest_ino)
- __field(loff_t, dest_isize)
- ),
- TP_fast_assign(
- __entry->dev = src->i_sb->s_dev;
- __entry->src_ino = src->i_ino;
- __entry->src_isize = i_size_read(src);
- __entry->dest_ino = dest->i_ino;
- __entry->dest_isize = i_size_read(dest);
- ),
- TP_printk("dev %d:%d ino 0x%lx isize 0x%llx -> ino 0x%lx isize 0x%llx",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->src_ino,
- __entry->src_isize,
- __entry->dest_ino,
- __entry->dest_isize)
-);
-
/* unshare tracepoints */
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
@@ -4243,7 +4181,6 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
/* copy on write */
DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
-DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
@@ -5028,7 +4965,6 @@ DEFINE_ICLOG_EVENT(xlog_iclog_switch);
DEFINE_ICLOG_EVENT(xlog_iclog_sync);
DEFINE_ICLOG_EVENT(xlog_iclog_syncing);
DEFINE_ICLOG_EVENT(xlog_iclog_sync_done);
-DEFINE_ICLOG_EVENT(xlog_iclog_want_sync);
DEFINE_ICLOG_EVENT(xlog_iclog_wait_on);
DEFINE_ICLOG_EVENT(xlog_iclog_write);
@@ -5077,7 +5013,6 @@ DEFINE_DAS_STATE_EVENT(xfs_attr_sf_addname_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_set_iter_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_leaf_addname_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_node_addname_return);
-DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc);
DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index c6657072361a..ece374d622b3 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -134,18 +134,14 @@ xfs_trans_dup(
}
/*
- * This is called to reserve free disk blocks and log space for the
- * given transaction. This must be done before allocating any resources
- * within the transaction.
+ * This is called to reserve free disk blocks and log space for the given
+ * transaction before allocating any resources within the transaction.
*
* This will return ENOSPC if there are not enough blocks available.
* It will sleep waiting for available log space.
- * The only valid value for the flags parameter is XFS_RES_LOG_PERM, which
- * is used by long running transactions. If any one of the reservations
- * fails then they will all be backed out.
*
- * This does not do quota reservations. That typically is done by the
- * caller afterwards.
+ * This does not do quota reservations. That typically is done by the caller
+ * afterwards.
*/
static int
xfs_trans_reserve(
@@ -158,10 +154,12 @@ xfs_trans_reserve(
int error = 0;
bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+ ASSERT(resp->tr_logres > 0);
+
/*
- * Attempt to reserve the needed disk blocks by decrementing
- * the number needed from the number available. This will
- * fail if the count would go below zero.
+ * Attempt to reserve the needed disk blocks by decrementing the number
+ * needed from the number available. This will fail if the count would
+ * go below zero.
*/
if (blocks > 0) {
error = xfs_dec_fdblocks(mp, blocks, rsvd);
@@ -173,42 +171,20 @@ xfs_trans_reserve(
/*
* Reserve the log space needed for this transaction.
*/
- if (resp->tr_logres > 0) {
- bool permanent = false;
-
- ASSERT(tp->t_log_res == 0 ||
- tp->t_log_res == resp->tr_logres);
- ASSERT(tp->t_log_count == 0 ||
- tp->t_log_count == resp->tr_logcount);
-
- if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
- tp->t_flags |= XFS_TRANS_PERM_LOG_RES;
- permanent = true;
- } else {
- ASSERT(tp->t_ticket == NULL);
- ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES));
- }
-
- if (tp->t_ticket != NULL) {
- ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
- error = xfs_log_regrant(mp, tp->t_ticket);
- } else {
- error = xfs_log_reserve(mp, resp->tr_logres,
- resp->tr_logcount,
- &tp->t_ticket, permanent);
- }
-
- if (error)
- goto undo_blocks;
+ if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES)
+ tp->t_flags |= XFS_TRANS_PERM_LOG_RES;
+ error = xfs_log_reserve(mp, resp->tr_logres, resp->tr_logcount,
+ &tp->t_ticket, (tp->t_flags & XFS_TRANS_PERM_LOG_RES));
+ if (error)
+ goto undo_blocks;
- tp->t_log_res = resp->tr_logres;
- tp->t_log_count = resp->tr_logcount;
- }
+ tp->t_log_res = resp->tr_logres;
+ tp->t_log_count = resp->tr_logcount;
/*
- * Attempt to reserve the needed realtime extents by decrementing
- * the number needed from the number available. This will
- * fail if the count would go below zero.
+ * Attempt to reserve the needed realtime extents by decrementing the
+ * number needed from the number available. This will fail if the
+ * count would go below zero.
*/
if (rtextents > 0) {
error = xfs_dec_frextents(mp, rtextents);
@@ -221,18 +197,11 @@ xfs_trans_reserve(
return 0;
- /*
- * Error cases jump to one of these labels to undo any
- * reservations which have already been performed.
- */
undo_log:
- if (resp->tr_logres > 0) {
- xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
- tp->t_ticket = NULL;
- tp->t_log_res = 0;
- tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
- }
-
+ xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
+ tp->t_ticket = NULL;
+ tp->t_log_res = 0;
+ tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
undo_blocks:
if (blocks > 0) {
xfs_add_fdblocks(mp, blocks);
@@ -241,6 +210,28 @@ undo_blocks:
return error;
}
+static struct xfs_trans *
+__xfs_trans_alloc(
+ struct xfs_mount *mp,
+ uint flags)
+{
+ struct xfs_trans *tp;
+
+ ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) || xfs_has_lazysbcount(mp));
+
+ tp = kmem_cache_zalloc(xfs_trans_cache, GFP_KERNEL | __GFP_NOFAIL);
+ if (!(flags & XFS_TRANS_NO_WRITECOUNT))
+ sb_start_intwrite(mp->m_super);
+ xfs_trans_set_context(tp);
+ tp->t_flags = flags;
+ tp->t_mountp = mp;
+ INIT_LIST_HEAD(&tp->t_items);
+ INIT_LIST_HEAD(&tp->t_busy);
+ INIT_LIST_HEAD(&tp->t_dfops);
+ tp->t_highest_agno = NULLAGNUMBER;
+ return tp;
+}
+
int
xfs_trans_alloc(
struct xfs_mount *mp,
@@ -254,33 +245,16 @@ xfs_trans_alloc(
bool want_retry = true;
int error;
+ ASSERT(resp->tr_logres > 0);
+
/*
* Allocate the handle before we do our freeze accounting and set up the
* GFP_NOFS allocation context so that we avoid lockdep false positives
* by doing GFP_KERNEL allocations inside sb_start_intwrite().
*/
retry:
- tp = kmem_cache_zalloc(xfs_trans_cache, GFP_KERNEL | __GFP_NOFAIL);
- if (!(flags & XFS_TRANS_NO_WRITECOUNT))
- sb_start_intwrite(mp->m_super);
- xfs_trans_set_context(tp);
-
- /*
- * Zero-reservation ("empty") transactions can't modify anything, so
- * they're allowed to run while we're frozen.
- */
- WARN_ON(resp->tr_logres > 0 &&
- mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
- ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) ||
- xfs_has_lazysbcount(mp));
-
- tp->t_flags = flags;
- tp->t_mountp = mp;
- INIT_LIST_HEAD(&tp->t_items);
- INIT_LIST_HEAD(&tp->t_busy);
- INIT_LIST_HEAD(&tp->t_dfops);
- tp->t_highest_agno = NULLAGNUMBER;
-
+ WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
+ tp = __xfs_trans_alloc(mp, flags);
error = xfs_trans_reserve(tp, resp, blocks, rtextents);
if (error == -ENOSPC && want_retry) {
xfs_trans_cancel(tp);
@@ -324,14 +298,11 @@ retry:
* where we can be grabbing buffers at the same time that freeze is trying to
* drain the buffer LRU list.
*/
-int
+struct xfs_trans *
xfs_trans_alloc_empty(
- struct xfs_mount *mp,
- struct xfs_trans **tpp)
+ struct xfs_mount *mp)
{
- struct xfs_trans_res resv = {0};
-
- return xfs_trans_alloc(mp, &resv, 0, 0, XFS_TRANS_NO_WRITECOUNT, tpp);
+ return __xfs_trans_alloc(mp, XFS_TRANS_NO_WRITECOUNT);
}
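With the unconditional __GFP_NOFAIL allocation factored into __xfs_trans_alloc(), an empty transaction can no longer fail to allocate, which is why the return type changes and every caller in this diff loses its error check. The calling convention, before and after:

	/* before */
	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		return error;

	/* after: cannot fail, no error to propagate */
	tp = xfs_trans_alloc_empty(mp);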
/*
@@ -742,8 +713,10 @@ xfs_trans_free_items(
list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
xfs_trans_del_item(lip);
- if (abort)
+ if (abort) {
+ trace_xfs_trans_free_abort(lip);
set_bit(XFS_LI_ABORTED, &lip->li_flags);
+ }
if (lip->li_ops->iop_release)
lip->li_ops->iop_release(lip);
}
@@ -1024,51 +997,57 @@ xfs_trans_cancel(
}
/*
- * Roll from one trans in the sequence of PERMANENT transactions to
- * the next: permanent transactions are only flushed out when
- * committed with xfs_trans_commit(), but we still want as soon
- * as possible to let chunks of it go to the log. So we commit the
- * chunk we've been working on and get a new transaction to continue.
+ * Roll from one trans in the sequence of PERMANENT transactions to the next:
+ * permanent transactions are only flushed out when committed with
+ * xfs_trans_commit(), but we still want to let chunks of it go to the log as
+ * soon as possible. So we commit the chunk we've been working on and get a new
+ * transaction to continue.
*/
int
xfs_trans_roll(
struct xfs_trans **tpp)
{
- struct xfs_trans *trans = *tpp;
- struct xfs_trans_res tres;
+ struct xfs_trans *tp = *tpp;
+ unsigned int log_res = tp->t_log_res;
+ unsigned int log_count = tp->t_log_count;
int error;
- trace_xfs_trans_roll(trans, _RET_IP_);
+ trace_xfs_trans_roll(tp, _RET_IP_);
+
+ ASSERT(log_res > 0);
/*
* Copy the critical parameters from one trans to the next.
*/
- tres.tr_logres = trans->t_log_res;
- tres.tr_logcount = trans->t_log_count;
-
- *tpp = xfs_trans_dup(trans);
+ *tpp = xfs_trans_dup(tp);
/*
* Commit the current transaction.
- * If this commit failed, then it'd just unlock those items that
- * are not marked ihold. That also means that a filesystem shutdown
- * is in progress. The caller takes the responsibility to cancel
- * the duplicate transaction that gets returned.
+ *
+ * If this commit failed, then it'd just unlock those items that are not
+ * marked ihold. That also means that a filesystem shutdown is in
+ * progress. The caller takes the responsibility to cancel the
+ * duplicate transaction that gets returned.
*/
- error = __xfs_trans_commit(trans, true);
+ error = __xfs_trans_commit(tp, true);
if (error)
return error;
/*
* Reserve space in the log for the next transaction.
- * This also pushes items in the "AIL", the list of logged items,
- * out to disk if they are taking up space at the tail of the log
- * that we want to use. This requires that either nothing be locked
- * across this call, or that anything that is locked be logged in
- * the prior and the next transactions.
+ *
+ * This also pushes items in the AIL out to disk if they are taking up
+ * space at the tail of the log that we want to use. This requires that
+ * either nothing be locked across this call, or that anything that is
+ * locked be logged in the prior and the next transactions.
*/
- tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
- return xfs_trans_reserve(*tpp, &tres, 0, 0);
+ tp = *tpp;
+ error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
+ if (error)
+ return error;
+ tp->t_log_res = log_res;
+ tp->t_log_count = log_count;
+ return 0;
}
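Note the structural change: the roll no longer round-trips through xfs_trans_reserve() with a temporary xfs_trans_res; it regrants log space directly on the ticket the duplicated transaction inherited. Condensed, per the hunk above:

	*tpp = xfs_trans_dup(tp);
	error = __xfs_trans_commit(tp, true);	/* regrant commit keeps the ticket */
	if (error)
		return error;
	tp = *tpp;
	error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);

The saved log_res/log_count values are then stamped back onto the new transaction, as the hunk shows.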
/*
@@ -1144,9 +1123,18 @@ xfs_trans_reserve_more(
unsigned int blocks,
unsigned int rtextents)
{
- struct xfs_trans_res resv = { };
-
- return xfs_trans_reserve(tp, &resv, blocks, rtextents);
+ bool rsvd = tp->t_flags & XFS_TRANS_RESERVE;
+
+ if (blocks && xfs_dec_fdblocks(tp->t_mountp, blocks, rsvd))
+ return -ENOSPC;
+ if (rtextents && xfs_dec_frextents(tp->t_mountp, rtextents)) {
+ if (blocks)
+ xfs_add_fdblocks(tp->t_mountp, blocks);
+ return -ENOSPC;
+ }
+ tp->t_blk_res += blocks;
+ tp->t_rtx_res += rtextents;
+ return 0;
}
/*
@@ -1161,14 +1149,13 @@ xfs_trans_reserve_more_inode(
unsigned int rblocks,
bool force_quota)
{
- struct xfs_trans_res resv = { };
struct xfs_mount *mp = ip->i_mount;
unsigned int rtx = xfs_extlen_to_rtxlen(mp, rblocks);
int error;
xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve(tp, &resv, dblocks, rtx);
+ error = xfs_trans_reserve_more(tp, dblocks, rtx);
if (error)
return error;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 2b366851e9a4..7fb860f645a3 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -15,7 +15,6 @@ struct xfs_efd_log_item;
struct xfs_efi_log_item;
struct xfs_inode;
struct xfs_item_ops;
-struct xfs_log_iovec;
struct xfs_mount;
struct xfs_trans;
struct xfs_trans_res;
@@ -168,8 +167,7 @@ int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp,
struct xfs_trans **tpp);
int xfs_trans_reserve_more(struct xfs_trans *tp,
unsigned int blocks, unsigned int rtextents);
-int xfs_trans_alloc_empty(struct xfs_mount *mp,
- struct xfs_trans **tpp);
+struct xfs_trans *xfs_trans_alloc_empty(struct xfs_mount *mp);
void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
int xfs_trans_get_buf_map(struct xfs_trans *tp, struct xfs_buftarg *target,
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 0f641a9091ec..ac5cecec9aa1 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -243,7 +243,7 @@ __xfs_xattr_put_listent(
offset = context->buffer + context->count;
memcpy(offset, prefix, prefix_len);
offset += prefix_len;
- strncpy(offset, (char *)name, namelen); /* real name */
+ memcpy(offset, (char *)name, namelen); /* real name */
offset += namelen;
*offset = '\0';
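strncpy() was the wrong tool here: the attr name is not NUL-terminated, exactly namelen bytes are wanted, and the terminator is written explicitly afterwards, so memcpy() states the intent without the truncation-warning bait. The buffer assembly in miniature, with hypothetical prefix and name values:

	char buf[16];
	char *p = buf;

	memcpy(p, "user.", 5);	p += 5;	/* prefix bytes, no NUL yet */
	memcpy(p, "foo", 3);	p += 3;	/* name bytes, length known */
	*p = '\0';			/* single explicit terminator */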
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index 80add26c0111..33f7eee521a8 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -434,7 +434,7 @@ xfs_init_open_zone(
spin_lock_init(&oz->oz_alloc_lock);
atomic_set(&oz->oz_ref, 1);
oz->oz_rtg = rtg;
- oz->oz_write_pointer = write_pointer;
+ oz->oz_allocated = write_pointer;
oz->oz_written = write_pointer;
oz->oz_write_hint = write_hint;
oz->oz_is_gc = is_gc;
@@ -569,7 +569,7 @@ xfs_try_use_zone(
struct xfs_open_zone *oz,
bool lowspace)
{
- if (oz->oz_write_pointer == rtg_blocks(oz->oz_rtg))
+ if (oz->oz_allocated == rtg_blocks(oz->oz_rtg))
return false;
if (!lowspace && !xfs_good_hint_match(oz, file_hint))
return false;
@@ -654,13 +654,6 @@ static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip)
!(ip->i_diflags & XFS_DIFLAG_APPEND);
}
-/*
- * Pick a new zone for writes.
- *
- * If we aren't using up our budget of open zones just open a new one from the
- * freelist. Else try to find one that matches the expected data lifetime. If
- * we don't find one that is good pick any zone that is available.
- */
static struct xfs_open_zone *
xfs_select_zone_nowait(
struct xfs_mount *mp,
@@ -688,7 +681,8 @@ xfs_select_zone_nowait(
goto out_unlock;
/*
- * See if we can open a new zone and use that.
+ * See if we can open a new zone and use that so that data for different
+ * files is mixed as little as possible.
*/
oz = xfs_try_open_zone(mp, write_hint);
if (oz)
@@ -727,7 +721,7 @@ xfs_select_zone(
for (;;) {
prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE);
oz = xfs_select_zone_nowait(mp, write_hint, pack_tight);
- if (oz)
+ if (oz || xfs_is_shutdown(mp))
break;
schedule();
}
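The extra xfs_is_shutdown() test matters because this is an open-coded waitqueue loop: without it, a writer waiting for an open zone on a filesystem that shuts down in the meantime would sleep in TASK_UNINTERRUPTIBLE forever. The general shape of the pattern, sketched with hypothetical resource_available()/shutting_down() predicates:

	#include <linux/wait.h>

	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&wq, &wait, TASK_UNINTERRUPTIBLE);
		if (resource_available() || shutting_down())
			break;		/* recheck after every wakeup */
		schedule();
	}
	finish_wait(&wq, &wait);

Callers still have to recheck the shutdown condition after the loop, since breaking out does not guarantee a zone was obtained.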
@@ -744,25 +738,25 @@ xfs_zone_alloc_blocks(
{
struct xfs_rtgroup *rtg = oz->oz_rtg;
struct xfs_mount *mp = rtg_mount(rtg);
- xfs_rgblock_t rgbno;
+ xfs_rgblock_t allocated;
spin_lock(&oz->oz_alloc_lock);
count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN,
- (xfs_filblks_t)rtg_blocks(rtg) - oz->oz_write_pointer);
+ (xfs_filblks_t)rtg_blocks(rtg) - oz->oz_allocated);
if (!count_fsb) {
spin_unlock(&oz->oz_alloc_lock);
return 0;
}
- rgbno = oz->oz_write_pointer;
- oz->oz_write_pointer += count_fsb;
+ allocated = oz->oz_allocated;
+ oz->oz_allocated += count_fsb;
spin_unlock(&oz->oz_alloc_lock);
- trace_xfs_zone_alloc_blocks(oz, rgbno, count_fsb);
+ trace_xfs_zone_alloc_blocks(oz, allocated, count_fsb);
*sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *sector);
if (!*is_seq)
- *sector += XFS_FSB_TO_BB(mp, rgbno);
+ *sector += XFS_FSB_TO_BB(mp, allocated);
return XFS_FSB_TO_B(mp, count_fsb);
}
@@ -777,26 +771,6 @@ xfs_mark_rtg_boundary(
ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
}
-static void
-xfs_submit_zoned_bio(
- struct iomap_ioend *ioend,
- struct xfs_open_zone *oz,
- bool is_seq)
-{
- ioend->io_bio.bi_iter.bi_sector = ioend->io_sector;
- ioend->io_private = oz;
- atomic_inc(&oz->oz_ref); /* for xfs_zoned_end_io */
-
- if (is_seq) {
- ioend->io_bio.bi_opf &= ~REQ_OP_WRITE;
- ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND;
- } else {
- xfs_mark_rtg_boundary(ioend);
- }
-
- submit_bio(&ioend->io_bio);
-}
-
/*
* Cache the last zone written to for an inode so that it is considered first
* for subsequent writes.
@@ -891,6 +865,26 @@ xfs_zone_cache_create_association(
xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru);
}
+static void
+xfs_submit_zoned_bio(
+ struct iomap_ioend *ioend,
+ struct xfs_open_zone *oz,
+ bool is_seq)
+{
+ ioend->io_bio.bi_iter.bi_sector = ioend->io_sector;
+ ioend->io_private = oz;
+ atomic_inc(&oz->oz_ref); /* for xfs_zoned_end_io */
+
+ if (is_seq) {
+ ioend->io_bio.bi_opf &= ~REQ_OP_WRITE;
+ ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND;
+ } else {
+ xfs_mark_rtg_boundary(ioend);
+ }
+
+ submit_bio(&ioend->io_bio);
+}
+
void
xfs_zone_alloc_and_submit(
struct iomap_ioend *ioend,
@@ -983,7 +977,7 @@ xfs_zone_rgbno_is_valid(
lockdep_assert_held(&rtg_rmap(rtg)->i_lock);
if (rtg->rtg_open_zone)
- return rgbno < rtg->rtg_open_zone->oz_write_pointer;
+ return rgbno < rtg->rtg_open_zone->oz_allocated;
return !xa_get_mark(&rtg_mount(rtg)->m_groups[XG_TYPE_RTG].xa,
rtg_rgno(rtg), XFS_RTG_FREE);
}
@@ -1017,7 +1011,7 @@ xfs_init_zone(
{
struct xfs_mount *mp = rtg_mount(rtg);
struct xfs_zone_info *zi = mp->m_zone_info;
- uint64_t used = rtg_rmap(rtg)->i_used_blocks;
+ uint32_t used = rtg_rmap(rtg)->i_used_blocks;
xfs_rgblock_t write_pointer, highest_rgbno;
int error;
@@ -1114,24 +1108,27 @@ xfs_get_zone_info_cb(
}
/*
- * Calculate the max open zone limit based on the of number of
- * backing zones available
+ * Calculate the max open zone limit based on the number of backing zones
+ * available.
*/
static inline uint32_t
xfs_max_open_zones(
struct xfs_mount *mp)
{
unsigned int max_open, max_open_data_zones;
+
/*
- * We need two zones for every open data zone,
- * one in reserve as we don't reclaim open zones. One data zone
- * and its spare is included in XFS_MIN_ZONES.
+ * We need two zones for every open data zone, one in reserve as we
+ * don't reclaim open zones. One data zone and its spare are included
+ * in XFS_MIN_ZONES to support at least one user data writer.
*/
max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1;
max_open = max_open_data_zones + XFS_OPEN_GC_ZONES;
/*
- * Cap the max open limit to 1/4 of available space
+ * Cap the max open limit to 1/4 of available space. Without this we'd
+ * run out of easy reclaim targets too quickly, and storage devices don't
+ * handle huge numbers of concurrent write streams overly well.
*/
max_open = min(max_open, mp->m_sb.sb_rgcount / 4);
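A worked example of the formula, using a hypothetical geometry (sb_rgcount = 100) and assuming XFS_MIN_ZONES = 4 and XFS_OPEN_GC_ZONES = 1: max_open_data_zones = (100 - 4) / 2 + 1 = 49, so max_open = 49 + 1 = 50 before capping; the 1/4 clamp then reduces it to min(50, 100 / 4) = 25 open zones.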
diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h
index ecf39106704c..4db02816d0fd 100644
--- a/fs/xfs/xfs_zone_alloc.h
+++ b/fs/xfs/xfs_zone_alloc.h
@@ -23,9 +23,9 @@ struct xfs_zone_alloc_ctx {
*/
#define XFS_ZR_RESERVED (1U << 2)
-int xfs_zoned_space_reserve(struct xfs_inode *ip, xfs_filblks_t count_fsb,
+int xfs_zoned_space_reserve(struct xfs_mount *mp, xfs_filblks_t count_fsb,
unsigned int flags, struct xfs_zone_alloc_ctx *ac);
-void xfs_zoned_space_unreserve(struct xfs_inode *ip,
+void xfs_zoned_space_unreserve(struct xfs_mount *mp,
struct xfs_zone_alloc_ctx *ac);
void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb);
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
index d613a4094db6..064cd1a857a0 100644
--- a/fs/xfs/xfs_zone_gc.c
+++ b/fs/xfs/xfs_zone_gc.c
@@ -290,8 +290,6 @@ xfs_zone_gc_query_cb(
return 0;
}
-#define cmp_int(l, r) ((l > r) - (l < r))
-
static int
xfs_zone_gc_rmap_rec_cmp(
const void *a,
@@ -330,10 +328,7 @@ xfs_zone_gc_query(
iter->rec_idx = 0;
iter->rec_count = 0;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
-
+ tp = xfs_trans_alloc_empty(mp);
xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
cur = xfs_rtrmapbt_init_cursor(tp, rtg);
error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
@@ -535,8 +530,7 @@ xfs_zone_gc_steal_open(
spin_lock(&zi->zi_open_zones_lock);
list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
- if (!found ||
- oz->oz_write_pointer < found->oz_write_pointer)
+ if (!found || oz->oz_allocated < found->oz_allocated)
found = oz;
}
@@ -586,7 +580,7 @@ xfs_zone_gc_ensure_target(
{
struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone;
- if (!oz || oz->oz_write_pointer == rtg_blocks(oz->oz_rtg))
+ if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
return xfs_zone_gc_select_target(mp);
return oz;
}
@@ -607,7 +601,7 @@ xfs_zone_gc_space_available(
oz = xfs_zone_gc_ensure_target(data->mp);
if (!oz)
return false;
- return oz->oz_write_pointer < rtg_blocks(oz->oz_rtg) &&
+ return oz->oz_allocated < rtg_blocks(oz->oz_rtg) &&
xfs_zone_gc_scratch_available(data);
}
@@ -649,7 +643,7 @@ xfs_zone_gc_alloc_blocks(
*/
spin_lock(&mp->m_sb_lock);
*count_fsb = min(*count_fsb,
- rtg_blocks(oz->oz_rtg) - oz->oz_write_pointer);
+ rtg_blocks(oz->oz_rtg) - oz->oz_allocated);
*count_fsb = min3(*count_fsb,
mp->m_free[XC_FREE_RTEXTENTS].res_avail,
mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
@@ -663,8 +657,8 @@ xfs_zone_gc_alloc_blocks(
*daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
if (!*is_seq)
- *daddr += XFS_FSB_TO_BB(mp, oz->oz_write_pointer);
- oz->oz_write_pointer += *count_fsb;
+ *daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated);
+ oz->oz_allocated += *count_fsb;
atomic_inc(&oz->oz_ref);
return oz;
}
diff --git a/fs/xfs/xfs_zone_info.c b/fs/xfs/xfs_zone_info.c
index 733bcc2f8645..07e30c596975 100644
--- a/fs/xfs/xfs_zone_info.c
+++ b/fs/xfs/xfs_zone_info.c
@@ -32,7 +32,7 @@ xfs_show_open_zone(
{
seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s\n",
rtg_rgno(oz->oz_rtg),
- oz->oz_write_pointer, oz->oz_written,
+ oz->oz_allocated, oz->oz_written,
rtg_rmap(oz->oz_rtg)->i_used_blocks,
xfs_write_hint_to_str(oz->oz_write_hint));
}
diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
index ab696975a993..35e6de3d25ed 100644
--- a/fs/xfs/xfs_zone_priv.h
+++ b/fs/xfs/xfs_zone_priv.h
@@ -11,18 +11,18 @@ struct xfs_open_zone {
atomic_t oz_ref;
/*
- * oz_write_pointer is the write pointer at which space is handed out
- * for conventional zones, or simple the count of blocks handed out
- * so far for sequential write required zones and is protected by
- * oz_alloc_lock/
+ * oz_allocated is the amount of space already allocated out of the zone
+ * and is protected by oz_alloc_lock.
+ *
+ * For conventional zones it is also the offset of the next write.
*/
spinlock_t oz_alloc_lock;
- xfs_rgblock_t oz_write_pointer;
+ xfs_rgblock_t oz_allocated;
/*
- * oz_written is the number of blocks for which we've received a
- * write completion. oz_written must always be <= oz_write_pointer
- * and is protected by the ILOCK of the rmap inode.
+ * oz_written is the number of blocks for which we've received a write
+ * completion. oz_written must always be <= oz_allocated and is
+ * protected by the ILOCK of the rmap inode.
*/
xfs_rgblock_t oz_written;
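Spelled out, the renamed fields carry the invariant 0 <= oz_written <= oz_allocated <= rtg_blocks(rtg): oz_allocated advances under oz_alloc_lock as space is handed out, and oz_written trails it under the rmap inode ILOCK as write completions arrive. A zone is full for allocation purposes exactly when oz_allocated == rtg_blocks(rtg), which is the test the xfs_zone_alloc.c and xfs_zone_gc.c hunks above now use.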
diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c
index 93c9a7721139..1313c55b8cbe 100644
--- a/fs/xfs/xfs_zone_space_resv.c
+++ b/fs/xfs/xfs_zone_space_resv.c
@@ -117,11 +117,10 @@ xfs_zoned_space_wait_error(
static int
xfs_zoned_reserve_available(
- struct xfs_inode *ip,
+ struct xfs_mount *mp,
xfs_filblks_t count_fsb,
unsigned int flags)
{
- struct xfs_mount *mp = ip->i_mount;
struct xfs_zone_info *zi = mp->m_zone_info;
struct xfs_zone_reservation reservation = {
.task = current,
@@ -198,11 +197,10 @@ xfs_zoned_reserve_available(
*/
static int
xfs_zoned_reserve_extents_greedy(
- struct xfs_inode *ip,
+ struct xfs_mount *mp,
xfs_filblks_t *count_fsb,
unsigned int flags)
{
- struct xfs_mount *mp = ip->i_mount;
struct xfs_zone_info *zi = mp->m_zone_info;
s64 len = *count_fsb;
int error = -ENOSPC;
@@ -220,12 +218,11 @@ xfs_zoned_reserve_extents_greedy(
int
xfs_zoned_space_reserve(
- struct xfs_inode *ip,
+ struct xfs_mount *mp,
xfs_filblks_t count_fsb,
unsigned int flags,
struct xfs_zone_alloc_ctx *ac)
{
- struct xfs_mount *mp = ip->i_mount;
int error;
ASSERT(ac->reserved_blocks == 0);
@@ -234,11 +231,11 @@ xfs_zoned_space_reserve(
error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
flags & XFS_ZR_RESERVED);
if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1)
- error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags);
+ error = xfs_zoned_reserve_extents_greedy(mp, &count_fsb, flags);
if (error)
return error;
- error = xfs_zoned_reserve_available(ip, count_fsb, flags);
+ error = xfs_zoned_reserve_available(mp, count_fsb, flags);
if (error) {
xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb);
return error;
@@ -249,12 +246,10 @@ xfs_zoned_space_reserve(
void
xfs_zoned_space_unreserve(
- struct xfs_inode *ip,
+ struct xfs_mount *mp,
struct xfs_zone_alloc_ctx *ac)
{
if (ac->reserved_blocks > 0) {
- struct xfs_mount *mp = ip->i_mount;
-
xfs_zoned_add_available(mp, ac->reserved_blocks);
xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks);
}
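Since the reservation code only ever dereferenced the inode for its mount, taking struct xfs_mount directly removes a spurious inode dependency and lets space be reserved before any particular inode is in play. Call sites change mechanically along these lines (illustrative; the actual callers are elsewhere in the tree):

	/* before */
	error = xfs_zoned_space_reserve(ip, count_fsb, flags, &ac);

	/* after */
	error = xfs_zoned_space_reserve(ip->i_mount, count_fsb, flags, &ac);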
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 42e2c0065bb3..fd3a5922f6c3 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -124,37 +124,46 @@ static void zonefs_readahead(struct readahead_control *rac)
* Map blocks for page writeback. This is used only on conventional zone files,
* which implies that the page range can only be within the fixed inode size.
*/
-static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
- struct inode *inode, loff_t offset,
- unsigned int len)
+static ssize_t zonefs_writeback_range(struct iomap_writepage_ctx *wpc,
+ struct folio *folio, u64 offset, unsigned len, u64 end_pos)
{
- struct zonefs_zone *z = zonefs_inode_zone(inode);
+ struct zonefs_zone *z = zonefs_inode_zone(wpc->inode);
if (WARN_ON_ONCE(zonefs_zone_is_seq(z)))
return -EIO;
- if (WARN_ON_ONCE(offset >= i_size_read(inode)))
+ if (WARN_ON_ONCE(offset >= i_size_read(wpc->inode)))
return -EIO;
/* If the mapping is already OK, nothing needs to be done */
- if (offset >= wpc->iomap.offset &&
- offset < wpc->iomap.offset + wpc->iomap.length)
- return 0;
+ if (offset < wpc->iomap.offset ||
+ offset >= wpc->iomap.offset + wpc->iomap.length) {
+ int error;
+
+ error = zonefs_write_iomap_begin(wpc->inode, offset,
+ z->z_capacity - offset, IOMAP_WRITE,
+ &wpc->iomap, NULL);
+ if (error)
+ return error;
+ }
- return zonefs_write_iomap_begin(inode, offset,
- z->z_capacity - offset,
- IOMAP_WRITE, &wpc->iomap, NULL);
+ return iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
}
static const struct iomap_writeback_ops zonefs_writeback_ops = {
- .map_blocks = zonefs_write_map_blocks,
+ .writeback_range = zonefs_writeback_range,
+ .writeback_submit = iomap_ioend_writeback_submit,
};
static int zonefs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct iomap_writepage_ctx wpc = { };
+ struct iomap_writepage_ctx wpc = {
+ .inode = mapping->host,
+ .wbc = wbc,
+ .ops = &zonefs_writeback_ops,
+ };
- return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops);
+ return iomap_writepages(&wpc);
}
static int zonefs_swap_activate(struct swap_info_struct *sis,
@@ -312,8 +321,10 @@ static const struct vm_operations_struct zonefs_file_vm_ops = {
.page_mkwrite = zonefs_filemap_page_mkwrite,
};
-static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int zonefs_file_mmap_prepare(struct vm_area_desc *desc)
{
+ struct file *file = desc->file;
+
/*
* Conventional zones accept random writes, so their files can support
* shared writable mappings. For sequential zone files, only read
@@ -321,11 +332,11 @@ static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma)
* ordering between msync() and page cache writeback.
*/
if (zonefs_inode_is_seq(file_inode(file)) &&
- (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+ (desc->vm_flags & VM_SHARED) && (desc->vm_flags & VM_MAYWRITE))
return -EINVAL;
file_accessed(file);
- vma->vm_ops = &zonefs_file_vm_ops;
+ desc->vm_ops = &zonefs_file_vm_ops;
return 0;
}
@@ -563,7 +574,8 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
if (ret <= 0)
goto inode_unlock;
- ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops, NULL);
+ ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops,
+ NULL, NULL);
if (ret == -EIO)
zonefs_io_error(inode, true);
@@ -850,7 +862,7 @@ const struct file_operations zonefs_file_operations = {
.open = zonefs_file_open,
.release = zonefs_file_release,
.fsync = zonefs_file_fsync,
- .mmap = zonefs_file_mmap,
+ .mmap_prepare = zonefs_file_mmap_prepare,
.llseek = zonefs_file_llseek,
.read_iter = zonefs_file_read_iter,
.write_iter = zonefs_file_write_iter,
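This is the iomap writeback refactor applied to zonefs: the per-block map_blocks callback becomes writeback_range (which both maps and adds the folio range to the ioend), bio submission moves behind an explicit writeback_submit hook, and the writepage context now carries inode/wbc/ops itself. The minimal wiring for a filesystem using the stock ioend submission, sketched with hypothetical myfs_* names but the iomap entry points shown in the hunks above:

	static ssize_t myfs_writeback_range(struct iomap_writepage_ctx *wpc,
			struct folio *folio, u64 offset, unsigned len, u64 end_pos)
	{
		/* refresh wpc->iomap here when offset falls outside it */
		return iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
	}

	static const struct iomap_writeback_ops myfs_writeback_ops = {
		.writeback_range	= myfs_writeback_range,
		.writeback_submit	= iomap_ioend_writeback_submit,
	};

	static int myfs_writepages(struct address_space *mapping,
			struct writeback_control *wbc)
	{
		struct iomap_writepage_ctx wpc = {
			.inode	= mapping->host,
			.wbc	= wbc,
			.ops	= &myfs_writeback_ops,
		};

		return iomap_writepages(&wpc);
	}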
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index d165eb979f21..4dc7f967c861 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -1113,11 +1113,12 @@ static int zonefs_read_super(struct super_block *sb)
u32 crc, stored_crc;
int ret;
- super = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ super = kmalloc(ZONEFS_SUPER_SIZE, GFP_KERNEL);
if (!super)
return -ENOMEM;
- ret = bdev_rw_virt(sb->s_bdev, 0, super, PAGE_SIZE, REQ_OP_READ);
+ ret = bdev_rw_virt(sb->s_bdev, 0, super, ZONEFS_SUPER_SIZE,
+ REQ_OP_READ);
if (ret)
goto free_super;
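The closing zonefs hunk is a page-size independence fix: the superblock occupies ZONEFS_SUPER_SIZE bytes on disk, so allocating and reading PAGE_SIZE only happened to be correct on 4k-page kernels; on larger-page builds it over-allocates and over-reads, and it ties what should be an on-disk constant to a CPU page size.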